feat(agent-reflection): durable kernel — reflection.v1 capture + risk-floor + Phase-0 (#545)

2026-06-16 21:35:40 +00:00
parent c461380a4a
commit b8807e60df
17 changed files with 1498 additions and 0 deletions
--- a/scripts/analysis/reflect-board-history.sh
+++ b/scripts/analysis/reflect-board-history.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# reflect-board-history.sh — Phase-0 experiment P3 (outcome detectability)
+#
+# Question: for completed tasks, how often does a machine-detectable
+# correct/wrong outcome signal appear within a follow-up window (default 30d)?
+# If the base rate is too low, predicted-vs-actual calibration (design §7) has
+# nothing to score against, so the kernel should capture caveat-notes only.
+#
+# Method: consume a board/task export (JSONL, one task object per line) OR fall
+# back to scanning the git history of a `data/` task directory. For each task
+# that reached a "done"-like state, decide whether a later signal marks it
+# correct or wrong (reopen, revert, follow-up "fix"/"regression", explicit
+# outcome field). Emit the detectable-outcome base rate. HARNESS + RUBRIC.
+#
+# Usage:
+#   scripts/analysis/reflect-board-history.sh --jsonl FILE [--window-days N] [--json|--md]
+#   scripts/analysis/reflect-board-history.sh --data-dir DIR [--window-days N] [--json|--md]
+#
+# JSONL fields read: .status .outcome .reopened_at .followups[] (free-form);
+#   missing fields are tolerated. .id/.completed_at are not consulted yet.
+#
+# Requirements: jq (for --jsonl), git (for --data-dir), awk.
+#
+# PRE-REGISTERED KILL CONDITION:
+#   detectable-outcome base rate < 20% ⇒ do NOT build §7 calibration loop;
+#   capture caveat-notes only.
+
+set -euo pipefail
+
+JSONL=""        # --jsonl FILE: board/task export, one JSON object per line
+DATA_DIR=""     # --data-dir DIR: task directory whose git history is scanned
+WINDOW_DAYS=30  # --window-days N: look-back in days (used by the --data-dir scan)
+FORMAT="json"   # --json (default) or --md
+# Parse flags; ${2?msg} names the flag whose value is missing.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --jsonl) JSONL="${2?--jsonl requires FILE}"; shift 2 ;;
+    --data-dir) DATA_DIR="${2?--data-dir requires DIR}"; shift 2 ;;
+    --window-days) WINDOW_DAYS="${2?--window-days requires N}"; shift 2 ;;
+    --json) FORMAT="json"; shift ;;
+    --md) FORMAT="md"; shift ;;
+    -h|--help) sed -n '/^$/q;2,$p' "$0"; exit 0 ;;  # header comment only: stop at first blank line
+    *) echo "unknown arg: $1" >&2; exit 2 ;;
+  esac
+done
+
+KILL_CONDITION='detectable-outcome base rate < 20% ⇒ do NOT build §7 calibration loop'
+echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2
+
+done_total=0   # tasks that reached a done-like state
+detectable=0   # subset of those carrying a machine-detectable outcome signal
+
+if [[ -n "$JSONL" ]]; then
+  command -v jq >/dev/null 2>&1 || { echo "jq required for --jsonl" >&2; exit 3; }
+  [[ -r "$JSONL" ]] || { echo "cannot read $JSONL" >&2; exit 3; }
+  # Count done tasks and those with a machine-detectable outcome signal.
+  done_total="$(jq -rs '[.[] | select((.status // "") | test("done|complete|closed"; "i"))] | length' "$JSONL" 2>/dev/null || echo 0)"
+  detectable="$(jq -rs '
+    [ .[]
+      | select((.status // "") | test("done|complete|closed"; "i"))
+      | select(
+          (.outcome // null) != null
+          or (.reopened_at // null) != null
+          or ((.followups // []) | length) > 0
+        )
+    ] | length' "$JSONL" 2>/dev/null || echo 0)"
+elif [[ -n "$DATA_DIR" ]]; then
+  command -v git >/dev/null 2>&1 || { echo "git required for --data-dir" >&2; exit 3; }
+  [[ -d "$DATA_DIR" ]] || { echo "no such dir: $DATA_DIR" >&2; exit 3; }
+  # Proxy: a task file touched in-window by a correction-signalling commit.
+  # find runs inside DATA_DIR since `git -C DIR` resolves pathspecs from DIR;
+  # subjects are captured first so `grep -q` cannot SIGPIPE git under pipefail.
+  while IFS= read -r file; do
+    [[ -z "$file" ]] && continue
+    done_total=$((done_total + 1))
+    subjects="$(git -C "$DATA_DIR" log --since="${WINDOW_DAYS} days ago" --pretty='%s' -- "$file" 2>/dev/null || true)"
+    if grep -qiE 'reopen|revert|fix|regression|wrong|incorrect|redo' <<<"$subjects"; then
+      detectable=$((detectable + 1))
+    fi
+  done < <(CDPATH= cd -- "$DATA_DIR" && find . -type f -name '*.json' 2>/dev/null)
+else
+  echo "provide --jsonl FILE or --data-dir DIR" >&2; exit 2
+fi
+
+# Base rate in percent (one decimal; "0.0" when no done task was seen). Values reach awk via -v.
+rate="$(awk -v d="$detectable" -v t="$done_total" 'BEGIN{ if (t+0==0) print "0.0"; else printf "%.1f", 100*d/t }')"
+verdict="$(awk -v r="$rate" 'BEGIN{ print ((r+0 < 20.0) ? "KILL §7 — caveat-notes only" : "signal present — proceed") }')"
+if [[ "$FORMAT" == "md" ]]; then
+  cat <<EOF
+## P3 — outcome detectability
+
+- done-like tasks: **${done_total}**
+- with machine-detectable outcome (window ${WINDOW_DAYS}d): **${detectable}**
+- base rate: **${rate}%**
+- kill condition: ${KILL_CONDITION}
+- verdict: **${verdict}**
+EOF
+else
+  awk -v dt="$done_total" -v d="$detectable" -v r="$rate" -v w="$WINDOW_DAYS" \
+      -v v="$verdict" -v kc="$KILL_CONDITION" 'BEGIN{
+    printf "{\n"
+    printf "  \"experiment\": \"P3-board-history\",\n"
+    printf "  \"window_days\": %d,\n", w
+    printf "  \"done_tasks\": %d,\n", dt
+    printf "  \"detectable_outcomes\": %d,\n", d
+    printf "  \"base_rate_pct\": %s,\n", r
+    printf "  \"kill_condition\": \"%s\",\n", kc
+    printf "  \"verdict\": \"%s\"\n", v
+    printf "}\n"
+  }'
+fi
--- a/scripts/analysis/reflect-calibration.sh
+++ b/scripts/analysis/reflect-calibration.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+# reflect-calibration.sh — Phase-0 experiment P1 (confidence signal)
+#
+# Question: does an agent's self-reported confidence discriminate correct from
+# incorrect work — especially on the self-rated-HIGH subset, where a closed
+# loop would actually trust it? If confidence ≈ chance on the high subset, the
+# signal is useless and design §7–§8 should not be built.
+#
+# Method: consume a labelled corpus — JSONL of {confidence: 0..1, correct:
+# true|false}. Compute discrimination as ROC AUC over all rows, plus the
+# correct-rate (lift) on the high-confidence subset (>= threshold), and compare
+# to the pre-registered chance baseline (the overall correct-rate). HARNESS +
+# RUBRIC; the labelled corpus is supplied later.
+#
+# Usage:
+#   scripts/analysis/reflect-calibration.sh --jsonl FILE [--high 0.8] [--json|--md]
+#
+# Requirements: jq, awk.
+#
+# PRE-REGISTERED KILL CONDITION:
+#   AUC <= 0.60 OR high-subset lift <= +5pp over base rate
+#   ⇒ confidence is not a usable routing signal; do NOT build §7–§8.
+
+set -euo pipefail
+
+JSONL=""       # --jsonl FILE: labelled corpus, one {confidence, correct} object per line
+HIGH=0.8       # --high T: rows with confidence >= T form the high-confidence subset
+FORMAT="json"  # --json (default) or --md
+# Parse flags; ${2?msg} names the flag whose value is missing.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --jsonl) JSONL="${2?--jsonl requires FILE}"; shift 2 ;;
+    --high) HIGH="${2?--high requires a threshold}"; shift 2 ;;
+    --json) FORMAT="json"; shift ;;
+    --md) FORMAT="md"; shift ;;
+    -h|--help) sed -n '/^$/q;2,$p' "$0"; exit 0 ;;  # header comment only: stop at first blank line
+    *) echo "unknown arg: $1" >&2; exit 2 ;;
+  esac
+done
+
+KILL_CONDITION='AUC <= 0.60 OR high-subset lift <= +5pp ⇒ do NOT build §7–§8'
+echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2
+
+command -v jq >/dev/null 2>&1 || { echo "jq required" >&2; exit 3; }
+[[ -r "$JSONL" ]] || { echo "provide a readable --jsonl FILE" >&2; exit 2; }
+
+# Normalise to "<confidence> <0|1>" rows; lines are parsed one by one so a bad line is skipped.
+ROWS="$(jq -rR '
+  fromjson? | objects | select((.confidence|type)=="number") |
+  "\(.confidence) \((.correct==true) | if . then 1 else 0 end)"
+  ' "$JSONL" 2>/dev/null || true)"
+
+if [[ -z "$ROWS" ]]; then
+  echo '{ "experiment": "P1-calibration", "error": "no usable rows" }'
+  exit 0
+fi
+
+# AUC via the Mann–Whitney U relation (rank-based); base rate; high-subset lift.
+read -r N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT <<EOF
+$(printf '%s\n' "$ROWS" | awk -v high="$HIGH" '
+  { c=$1; y=$2; conf[NR]=c; lab[NR]=y; n++;
+    if (y==1) pos++; else neg++;
+    if (c>=high) { hn++; if (y==1) hc++ } }   # high-confidence subset tallies
+  END{
+    base = (n>0)? pos/n : 0;
+    # Rank-sum AUC: average ranks (ties → average rank).
+    # sort row indices by ascending confidence (simple O(n^2) exchange sort)
+    for (i=1;i<=n;i++) idx[i]=i;
+    for (i=1;i<=n;i++) for (j=i+1;j<=n;j++) if (conf[idx[i]]>conf[idx[j]]) { t=idx[i]; idx[i]=idx[j]; idx[j]=t }
+    i=1;
+    while (i<=n) {
+      j=i; while (j<n && conf[idx[j+1]]==conf[idx[i]]) j++;   # extend over the tie group
+      avg=(i+j)/2.0;
+      for (k=i;k<=j;k++) rank[idx[k]]=avg;
+      i=j+1;
+    }
+    rsum=0; for (i=1;i<=n;i++) if (lab[i]==1) rsum+=rank[i];   # rank sum of positives
+    if (pos>0 && neg>0) auc=(rsum - pos*(pos+1)/2.0)/(pos*neg); else auc=0.5;   # one class absent → chance
+    hrate=(hn>0)? hc/hn : 0;
+    lift=hrate-base;
+    printf "%d %d %.4f %.4f %d %d %.4f %.4f", n, pos, base, auc, hn, hc, hrate, lift
+  }')
+EOF
+
+# Pre-registered rule: failing either threshold kills §7–§8.
+verdict="$(awk -v a="$AUC" -v l="$LIFT" 'BEGIN{
+  if (a <= 0.60 || l <= 0.05) print "KILL §7–§8 — confidence not usable"; else print "signal present — proceed" }')"
+
+case "$FORMAT" in
+  md)
+    cat <<EOF
+## P1 — confidence calibration
+
+- rows: **${N}** (positives ${POS}) · base correct-rate **$(awk "BEGIN{printf \"%.1f\", 100*${BASE}}")%**
+- ROC AUC: **${AUC}**
+- high-confidence subset (>= ${HIGH}): n=${HIGH_N}, correct=${HIGH_CORRECT}, rate=$(awk "BEGIN{printf \"%.1f\", 100*${HIGH_RATE}}")%
+- lift over base: **$(awk "BEGIN{printf \"%+.1f\", 100*${LIFT}}")pp**
+- kill condition: ${KILL_CONDITION}
+- verdict: **${verdict}**
+EOF
+    ;;
+  *)
+    awk -v rows="$N" -v positives="$POS" -v base_rate="$BASE" -v auc="$AUC" \
+        -v high_n="$HIGH_N" -v high_ok="$HIGH_CORRECT" -v high_rate="$HIGH_RATE" \
+        -v lift="$LIFT" -v threshold="$HIGH" -v verdict="$verdict" -v kill="$KILL_CONDITION" 'BEGIN{
+      printf "{\n  \"experiment\": \"P1-calibration\",\n"
+      printf "  \"rows\": %d,\n", rows
+      printf "  \"positives\": %d,\n", positives
+      printf "  \"base_rate\": %.4f,\n", base_rate
+      printf "  \"auc\": %.4f,\n", auc
+      printf "  \"high_threshold\": %s,\n", threshold
+      printf "  \"high_subset\": { \"n\": %d, \"correct\": %d, \"rate\": %.4f },\n", high_n, high_ok, high_rate
+      printf "  \"lift_over_base\": %.4f,\n", lift
+      printf "  \"kill_condition\": \"%s\",\n", kill
+      printf "  \"verdict\": \"%s\"\n}\n", verdict
+    }' ;;
+esac
--- a/scripts/analysis/reflect-git-history.sh
+++ b/scripts/analysis/reflect-git-history.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# reflect-git-history.sh — Phase-0 experiment P2 ("only-self-reflection" bucket)
+#
+# Question: of the failures visible in git history, what fraction would ONLY
+# have been caught by end-of-run self-reflection — i.e. NOT by CI and NOT by
+# independent human review? If that bucket is near-empty, the closed
+# calibration / skill-synthesis loop (design §7–§8) is not worth building.
+#
+# Method: scan `git log` over a window for failure signals (reverts, and
+# fix:/hotfix commits landing shortly after a feature merge). Classify each by
+# the gate most likely to have caught it, using a pre-registered heuristic.
+# This is a HARNESS + RUBRIC; the classifier is deliberately simple and the
+# real corpus/labelling is wired later. It emits a structured tally.
+#
+# Usage:
+#   scripts/analysis/reflect-git-history.sh [--repo PATH] [--since SINCE] [--json|--md]
+#
+# Options:
+#   --repo PATH   repo to analyse (default: current repo)
+#   --since SINCE git log --since value (default: "6 months ago")
+#   --json        emit JSON (default)
+#   --md          emit markdown
+#
+# Requirements: git, awk.
+#
+# PRE-REGISTERED KILL CONDITION:
+#   bucket "only_self_reflection" is near-empty (< 10% of classified failures)
+#   ⇒ do NOT build design §7–§8 (closed loop). Caveat-notes capture only.
+
+set -euo pipefail
+
+REPO="."                # --repo PATH: repository to analyse
+SINCE="6 months ago"    # --since SINCE: value handed to `git log --since`
+FORMAT="json"           # --json (default) or --md
+# Parse flags; ${2?msg} names the flag whose value is missing.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --repo) REPO="${2?--repo requires PATH}"; shift 2 ;;
+    --since) SINCE="${2?--since requires a value}"; shift 2 ;;
+    --json) FORMAT="json"; shift ;;
+    --md) FORMAT="md"; shift ;;
+    -h|--help) sed -n '/^$/q;2,$p' "$0"; exit 0 ;;  # header comment only: stop at first blank line
+    *) echo "unknown arg: $1" >&2; exit 2 ;;
+  esac
+done
+
+KILL_CONDITION='bucket only_self_reflection < 10% of classified failures ⇒ do NOT build §7–§8'
+echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2
+
+command -v git >/dev/null 2>&1 || { echo "git required" >&2; exit 3; }
+
+# Candidate failures = reverts + fix/hotfix subjects; refuse a non-repo, whose empty scan reads as KILL.
+git -C "$REPO" rev-parse --git-dir >/dev/null 2>&1 || { echo "not a git repository: $REPO" >&2; exit 3; }
+mapfile -t LINES < <(
+  git -C "$REPO" log --since="$SINCE" --pretty='%H%x09%s' 2>/dev/null \
+    | grep -iE 'revert|hotfix|hot-fix|regression|fix(\(|:|!| )' || true
+)
+total=0; ci=0; human=0; selfonly=0
+for line in ${LINES[@]+"${LINES[@]}"}; do  # guarded expansion: empty array is safe under set -u on bash < 4.4
+  [[ -z "$line" ]] && continue
+  subj="${line#*$'\t'}"
+  total=$((total + 1))
+  # Pre-registered classification heuristic (gate most likely to have caught it):
+  #   - build/test/lint/type/ci signals → CI would have caught it ("ci" only as a whole token)
+  #   - security/auth/permission/data/migration → human review would flag it
+  #   - everything else (logic/UX/assumption/edge) → only-self-reflection bucket
+  if printf '%s' "$subj" | grep -qiE 'test|lint|type|build|compile|typo|(^|[^[:alnum:]])ci([^[:alnum:]]|$)'; then
+    ci=$((ci + 1))
+  elif printf '%s' "$subj" | grep -qiE 'security|auth|permission|rbac|secret|migration|data|sql|injection'; then
+    human=$((human + 1))
+  else
+    selfonly=$((selfonly + 1))
+  fi
+done
+
+# pct NUM DEN — print 100*NUM/DEN with one decimal, or "0.0" when DEN is 0 (values passed via -v).
+pct() { awk -v num="$1" -v den="$2" 'BEGIN{ if (den+0==0) print "0.0"; else printf "%.1f", 100*num/den }'; }
+self_pct="$(pct "$selfonly" "$total")"
+verdict="$(awk -v p="$self_pct" 'BEGIN{ print ((p+0 < 10.0) ? "KILL §7–§8" : "signal present — proceed to deeper labelling") }')"
+if [[ "$FORMAT" == "md" ]]; then
+  cat <<EOF
+## P2 — git-history failure-gate attribution
+
+- window: \`${SINCE}\` · repo: \`${REPO}\`
+- classified failures: **${total}**
+
+| gate | count | share |
+|---|---:|---:|
+| CI would catch | ${ci} | $(pct "$ci" "$total")% |
+| human review would catch | ${human} | $(pct "$human" "$total")% |
+| only-self-reflection | ${selfonly} | ${self_pct}% |
+
+- kill condition: ${KILL_CONDITION}
+- verdict: **${verdict}**
+EOF
+else
+  awk -v t="$total" -v c="$ci" -v h="$human" -v s="$selfonly" -v sp="$self_pct" \
+      -v v="$verdict" -v since="$SINCE" -v repo="$REPO" -v kc="$KILL_CONDITION" 'BEGIN{
+    printf "{\n"
+    printf "  \"experiment\": \"P2-git-history\",\n"
+    printf "  \"repo\": \"%s\",\n", repo
+    printf "  \"since\": \"%s\",\n", since
+    printf "  \"classified_failures\": %d,\n", t
+    printf "  \"buckets\": { \"ci\": %d, \"human_review\": %d, \"only_self_reflection\": %d },\n", c, h, s
+    printf "  \"only_self_reflection_pct\": %s,\n", sp
+    printf "  \"kill_condition\": \"%s\",\n", kc
+    printf "  \"verdict\": \"%s\"\n", v
+    printf "}\n"
+  }'
+fi