feat(agent-reflection): durable kernel — reflection.v1 capture + risk-floor + Phase-0 (#545)

2026-06-16 21:35:40 +00:00
parent c461380a4a
commit b8807e60df
17 changed files with 1498 additions and 0 deletions
--- a/scripts/analysis/reflect-calibration.sh
+++ b/scripts/analysis/reflect-calibration.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+# reflect-calibration.sh — Phase-0 experiment P1 (confidence signal)
+#
+# Question: does an agent's self-reported confidence discriminate correct from
+# incorrect work — especially on the self-rated-HIGH subset, where a closed
+# loop would actually trust it? If confidence ≈ chance on the high subset, the
+# signal is useless and design §7–§8 should not be built.
+#
+# Method: consume a labelled corpus — JSONL of {confidence: 0..1, correct:
+# true|false}. Compute discrimination as ROC AUC over all rows, plus the
+# correct-rate (lift) on the high-confidence subset (>= threshold), and compare
+# to the pre-registered chance baseline (the overall correct-rate). HARNESS +
+# RUBRIC; the labelled corpus is supplied later.
+#
+# Usage:
+#   scripts/analysis/reflect-calibration.sh --jsonl FILE [--high 0.8] [--json|--md]
+#
+# Requirements: jq, awk.
+#
+# PRE-REGISTERED KILL CONDITION:
+#   AUC <= 0.60 OR high-subset lift <= +5pp over base rate
+#   ⇒ confidence is not a usable routing signal; do NOT build §7–§8.
+
+set -euo pipefail
+
+# Defaults; each can be overridden from the command line (see Usage above).
+JSONL=""       # path to the labelled corpus (required)
+HIGH=0.8       # confidence threshold that defines the high-confidence subset
+FORMAT="json"  # output format: json | md
+
+# need_value FLAG ARGC
+# Exit 2 with a readable message when FLAG is the last argument. Without this
+# guard, "$2" is unset and `set -u` aborts with a cryptic "unbound variable".
+need_value() {
+  [[ "$2" -ge 2 ]] || { echo "$1 requires a value" >&2; exit 2; }
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --jsonl) need_value "$1" "$#"; JSONL="$2"; shift 2 ;;
+    --high) need_value "$1" "$#"; HIGH="$2"; shift 2 ;;
+    --json) FORMAT="json"; shift ;;
+    --md) FORMAT="md"; shift ;;
+    -h|--help)
+      # Print only the leading comment block: skip the shebang, then stop at
+      # the first line that is not a comment. A fixed line range would drift
+      # into the code whenever the header grows or shrinks.
+      awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
+      exit 0 ;;
+    *) echo "unknown arg: $1" >&2; exit 2 ;;
+  esac
+done
+
+# --high is compared numerically against each confidence and is printed
+# unquoted in the JSON report, so reject anything that is not a plain
+# non-negative number before any work is done.
+high_re='^([0-9]+(\.[0-9]+)?|\.[0-9]+)([eE][-+]?[0-9]+)?$'
+if [[ ! "$HIGH" =~ $high_re ]]; then
+  echo "--high must be a non-negative number, got: ${HIGH}" >&2
+  exit 2
+fi
+# ".8" is fine for awk but is not a valid JSON number; normalise it to "0.8".
+if [[ "$HIGH" == .* ]]; then
+  HIGH="0${HIGH}"
+fi
+
+# The kill rule is fixed here and echoed to stderr before any data is read, so
+# every run records the criterion alongside whatever result follows.
+KILL_CONDITION='AUC <= 0.60 OR high-subset lift <= +5pp ⇒ do NOT build §7–§8'
+echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2
+
+# Check every tool listed under "Requirements" in the header (jq and awk), so a
+# missing dependency is reported up front with exit code 3.
+for tool in jq awk; do
+  command -v "$tool" >/dev/null 2>&1 || { echo "${tool} required" >&2; exit 3; }
+done
+
+# The corpus must exist and be readable; an empty --jsonl also lands here.
+[[ -r "$JSONL" ]] || { echo "provide a readable --jsonl FILE" >&2; exit 2; }
+
+# Normalise the corpus to "<confidence> <0|1>" rows, one per usable record.
+#
+# A record is usable when it is a JSON object whose .confidence is a number;
+# .correct counts as 1 only when it is exactly true, anything else is 0.
+#
+# Pass 1 slurps the whole file, which also accepts pretty-printed or
+# concatenated JSON. Slurping aborts on the first malformed line, so when it
+# fails pass 2 re-reads the file line by line (-R) and `fromjson?` silently
+# drops only the lines that do not parse. This is what makes the harness
+# tolerate bad lines instead of discarding the entire corpus.
+ROW_FILTER='objects
+  | select((.confidence | type) == "number")
+  | "\(.confidence) \(if .correct == true then 1 else 0 end)"'
+# jq diagnostics are intentionally suppressed: unusable input is reported via
+# the "no usable rows" result below rather than as raw parser noise.
+ROWS="$(jq -rs ".[] | ${ROW_FILTER}" "$JSONL" 2>/dev/null)" \
+  || ROWS="$(jq -rR "fromjson? | ${ROW_FILTER}" "$JSONL" 2>/dev/null)" \
+  || ROWS=""
+
+# Nothing to analyse: emit a JSON error object (in either output format) and
+# exit 0, so an empty corpus is reported as a result rather than a failure.
+if [[ -z "$ROWS" ]]; then
+  echo '{ "experiment": "P1-calibration", "error": "no usable rows" }'
+  exit 0
+fi
+
+# Compute all statistics in one awk pass over the "<confidence> <0|1>" rows:
+#   N            number of rows
+#   POS          rows labelled correct (1)
+#   BASE         overall correct-rate, POS/N
+#   AUC          ROC AUC via the Mann–Whitney U relation on average ranks;
+#                0.5 when only one class is present
+#   HIGH_N       rows with confidence >= HIGH
+#   HIGH_CORRECT correct rows within that subset
+#   HIGH_RATE    HIGH_CORRECT/HIGH_N (0 when the subset is empty)
+#   LIFT         HIGH_RATE - BASE
+# Ratios are printed with four decimals.
+#
+# The result is captured in a plain assignment so that a failing awk aborts the
+# script under `set -e`/`pipefail`; inside a here-document the failure would go
+# unnoticed and the verdict would be computed from empty values.
+STATS="$(printf '%s\n' "$ROWS" | awk -v high="$HIGH" '
+  BEGIN { n = pos = neg = hn = hc = 0 }
+  {
+    # "+ 0" forces numeric values, so the ordering and tie checks below never
+    # fall back to string comparison (e.g. "0.80" vs "0.8").
+    c = $1 + 0; y = $2 + 0
+    n++; conf[n] = c; lab[n] = y
+    if (y == 1) pos++; else neg++
+    if (c >= high) { hn++; if (y == 1) hc++ }
+  }
+  END {
+    base = (n > 0) ? pos / n : 0
+
+    # Order row indices by ascending confidence. This is a simple O(n^2)
+    # exchange sort, since POSIX awk has no built-in sort.
+    for (i = 1; i <= n; i++) idx[i] = i
+    for (i = 1; i <= n; i++)
+      for (j = i + 1; j <= n; j++)
+        if (conf[idx[i]] > conf[idx[j]]) { t = idx[i]; idx[i] = idx[j]; idx[j] = t }
+
+    # Assign ranks 1..n; every run of equal confidences shares the average
+    # rank of the positions it occupies.
+    i = 1
+    while (i <= n) {
+      j = i
+      while (j < n && conf[idx[j + 1]] == conf[idx[i]]) j++
+      avg = (i + j) / 2.0
+      for (k = i; k <= j; k++) rank[idx[k]] = avg
+      i = j + 1
+    }
+
+    # U = (rank sum of positives) - pos*(pos+1)/2 ;  AUC = U / (pos*neg).
+    rsum = 0
+    for (i = 1; i <= n; i++) if (lab[i] == 1) rsum += rank[i]
+    if (pos > 0 && neg > 0) auc = (rsum - pos * (pos + 1) / 2.0) / (pos * neg)
+    else auc = 0.5
+
+    hrate = (hn > 0) ? hc / hn : 0
+    lift = hrate - base
+    printf "%d %d %.4f %.4f %d %d %.4f %.4f", n, pos, base, auc, hn, hc, hrate, lift
+  }')"
+read -r N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT <<<"$STATS"
+
+# Apply the pre-registered rule to the computed AUC and lift: either failing
+# criterion (AUC at or below 0.60, or lift at or below 0.05, i.e. 5pp) kills
+# the design; otherwise the signal is considered present.
+verdict="$(
+  awk -v auc="$AUC" -v lift="$LIFT" 'BEGIN {
+    if (auc <= 0.60 || lift <= 0.05) {
+      print "KILL §7–§8 — confidence not usable"
+    } else {
+      print "signal present — proceed"
+    }
+  }'
+)"
+
+# Render the report on stdout, as Markdown (--md) or JSON (default).
+if [[ "$FORMAT" == "md" ]]; then
+  # pct FMT VALUE
+  # Print 100*VALUE using the printf format FMT. Values are handed to awk with
+  # -v instead of being spliced into the awk program text.
+  pct() { awk -v fmt="$1" -v x="$2" 'BEGIN { printf fmt, 100 * x }'; }
+  cat <<EOF
+## P1 — confidence calibration
+
+- rows: **${N}** (positives ${POS}) · base correct-rate **$(pct '%.1f' "$BASE")%**
+- ROC AUC: **${AUC}**
+- high-confidence subset (>= ${HIGH}): n=${HIGH_N}, correct=${HIGH_CORRECT}, rate=$(pct '%.1f' "$HIGH_RATE")%
+- lift over base: **$(pct '%+.1f' "$LIFT")pp**
+- kill condition: ${KILL_CONDITION}
+- verdict: **${verdict}**
+EOF
+else
+  awk -v n="$N" -v pos="$POS" -v base="$BASE" -v auc="$AUC" -v hn="$HIGH_N" \
+      -v hc="$HIGH_CORRECT" -v hr="$HIGH_RATE" -v lift="$LIFT" -v high="$HIGH" \
+      -v v="$verdict" -v kc="$KILL_CONDITION" 'BEGIN{
+    # high_threshold is printed unquoted, so it must be a valid JSON number:
+    #   - already a JSON number       -> emitted verbatim
+    #   - numeric but not JSON (".8") -> normalised through %.10g
+    #   - anything else               -> null, so the document still parses
+    if (high ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$/)
+      hjson = high
+    else if (high ~ /^[-+]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][-+]?[0-9]+)?$/)
+      hjson = sprintf("%.10g", high + 0)
+    else
+      hjson = "null"
+
+    printf "{\n"
+    printf "  \"experiment\": \"P1-calibration\",\n"
+    printf "  \"rows\": %d,\n", n
+    printf "  \"positives\": %d,\n", pos
+    printf "  \"base_rate\": %.4f,\n", base
+    printf "  \"auc\": %.4f,\n", auc
+    printf "  \"high_threshold\": %s,\n", hjson
+    printf "  \"high_subset\": { \"n\": %d, \"correct\": %d, \"rate\": %.4f },\n", hn, hc, hr
+    printf "  \"lift_over_base\": %.4f,\n", lift
+    printf "  \"kill_condition\": \"%s\",\n", kc
+    printf "  \"verdict\": \"%s\"\n", v
+    printf "}\n"
+  }'
+fi