feat(agent-reflection): durable kernel — reflection.v1 capture + risk-floor + Phase-0 (#544)
Build the durable kernel of the agent reflection loop. Passive end-of-run capture of the doer's end-state as structured `reflection.v1` data, plus a deterministic diff review risk-floor. The closed calibration/skill-synthesis loop (design §7–§8) stays gated behind Phase-0 experiments P1/P2/P3. - packages/macp: evaluateRiskFloor (pure, deterministic surface classifier) + reflection.v1 JSON Schema; 15 unit tests. - packages/types: reflection.v1 zod schemas + self-report DTO; 10 unit tests. - framework: fail-closed Stop hook (reflect-stop-hook.sh) writing the sidecar, registered as hooks.Stop in runtime/claude/settings.json. Strict no-op unless REFLECTION_MODE=solo|orchestrated; never blocks or fails a session. - scripts/analysis: P1/P2/P3 experiment harnesses with pre-registered kill conditions and structured output. Mechanical fields (risk, files_changed, ids, provenance) are written by the hook; self-report fields (confidence, most_likely_wrong, known_not_in_diff) are merged from an optional $REFLECTION_INPUT, else null + provenance.degraded=true. Independent review remediations: empty/all-.mosaic diff still writes a sidecar (grep no-match no longer aborts); session_id sanitized before path use. Refs #544 Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
111
scripts/analysis/reflect-board-history.sh
Executable file
111
scripts/analysis/reflect-board-history.sh
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env bash
|
||||
# reflect-board-history.sh — Phase-0 experiment P3 (outcome detectability)
|
||||
#
|
||||
# Question: for completed tasks, how often does a machine-detectable
|
||||
# correct/wrong outcome signal appear within a follow-up window (default 30d)?
|
||||
# If the base rate is too low, predicted-vs-actual calibration (design §7) has
|
||||
# nothing to score against, so the kernel should capture caveat-notes only.
|
||||
#
|
||||
# Method: consume a board/task export (JSONL, one task object per line) OR fall
|
||||
# back to scanning the git history of a `data/` task directory. For each task
|
||||
# that reached a "done"-like state, decide whether a later signal marks it
|
||||
# correct or wrong (reopen, revert, follow-up "fix"/"regression", explicit
|
||||
# outcome field). Emit the detectable-outcome base rate. HARNESS + RUBRIC.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/analysis/reflect-board-history.sh --jsonl FILE [--window-days N] [--json|--md]
|
||||
# scripts/analysis/reflect-board-history.sh --data-dir DIR [--window-days N] [--json|--md]
|
||||
#
|
||||
# JSONL fields used (best-effort): .id .status .completed_at .outcome
|
||||
# .reopened_at .followups[] (free-form). Missing fields are tolerated.
|
||||
#
|
||||
# Requirements: jq (for --jsonl), git (for --data-dir), awk.
|
||||
#
|
||||
# PRE-REGISTERED KILL CONDITION:
|
||||
# detectable-outcome base rate < 20% ⇒ do NOT build §7 calibration loop;
|
||||
# capture caveat-notes only.
|
||||
|
||||
set -euo pipefail

# Defaults; each may be overridden by the options parsed below.
JSONL=""       # --jsonl FILE: board/task export, one JSON object per line
DATA_DIR=""    # --data-dir DIR: task directory whose git history is scanned
WINDOW_DAYS=30 # --window-days N: follow-up window, in days
FORMAT="json"  # --json | --md: output format

# Print this script's leading comment header as the help text: every line after
# the shebang up to (not including) the first line that is not a comment.
# Derived from the file itself so it cannot drift out of step with the header
# the way a hard-coded `sed -n 'A,Bp'` line range does.
usage() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

# need_value OPTION ARGC — exit 2 when OPTION is the last argument and so has
# no value. Without this, `set -u` aborts on "$2" with a bare "unbound
# variable" message that does not name the offending option.
need_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "missing value for $1" >&2
    exit 2
  fi
}

# Parse command-line options into JSONL, DATA_DIR, WINDOW_DAYS and FORMAT.
# Exits 0 after printing help; exits 2 on an unknown option, a missing option
# value, or a --window-days that is not a non-negative integer.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --jsonl)
        need_value "$1" "$#"
        JSONL="$2"
        shift 2
        ;;
      --data-dir)
        need_value "$1" "$#"
        DATA_DIR="$2"
        shift 2
        ;;
      --window-days)
        need_value "$1" "$#"
        WINDOW_DAYS="$2"
        shift 2
        ;;
      --json)
        FORMAT="json"
        shift
        ;;
      --md)
        FORMAT="md"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done

  # WINDOW_DAYS is later handed to `git log --since` and printed as a JSON
  # number, so reject anything that is not a plain non-negative integer.
  if ! [[ "$WINDOW_DAYS" =~ ^[0-9]+$ ]]; then
    echo "--window-days must be a non-negative integer: $WINDOW_DAYS" >&2
    exit 2
  fi
}

parse_args "$@"
|
||||
|
||||
KILL_CONDITION='detectable-outcome base rate < 20% ⇒ do NOT build §7 calibration loop'
# Announce the rule on stderr so it is on record without polluting the report.
echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2

done_total=0 # tasks that reached a done-like state
detectable=0 # of those, tasks with a machine-detectable outcome signal

if [[ -n "$JSONL" ]]; then
  command -v jq >/dev/null 2>&1 || { echo "jq required for --jsonl" >&2; exit 3; }
  [[ -r "$JSONL" ]] || { echo "cannot read $JSONL" >&2; exit 3; }

  # One jq pass yields "<done> <detectable>".
  #  - Non-object rows are skipped and .status is stringified before matching,
  #    so one oddly-typed row cannot make jq fail and zero the whole tally.
  #  - A task is "detectable" when it carries an explicit outcome, a reopen
  #    timestamp, or at least one follow-up.
  if counts="$(jq -rs '
        [ .[]
          | select(type == "object")
          | select((.status // "") | tostring | test("done|complete|closed"; "i"))
        ] as $done
        | [ $done[]
            | select(
                ((.outcome // null) != null)
                or ((.reopened_at // null) != null)
                or (((.followups // []) | length) > 0)
              )
          ] as $hit
        | "\($done | length) \($hit | length)"' "$JSONL" 2>/dev/null)"; then
    read -r done_total detectable <<<"$counts"
  else
    # Stay best-effort (counts remain 0) but say so: an unparseable export
    # would otherwise be indistinguishable from a genuine 0% base rate.
    echo "warning: could not parse $JSONL as JSON; counting 0 tasks" >&2
  fi
elif [[ -n "$DATA_DIR" ]]; then
  command -v git >/dev/null 2>&1 || { echo "git required for --data-dir" >&2; exit 3; }
  [[ -d "$DATA_DIR" ]] || { echo "no such dir: $DATA_DIR" >&2; exit 3; }
  # Without history every task would silently count as "no signal".
  git -C "$DATA_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1 \
    || { echo "not inside a git work tree: $DATA_DIR" >&2; exit 3; }

  # Proxy: every *.json task file counts as a done task; one touched inside the
  # window by a commit whose subject signals a correction is "detectable".
  while IFS= read -r -d '' file; do
    [[ -z "$file" ]] && continue
    done_total=$((done_total + 1))
    # Capture the subjects first instead of piping git straight into `grep -q`:
    # under pipefail an early grep exit can SIGPIPE git and turn a real match
    # into a failed pipeline.
    subjects="$(git -C "$DATA_DIR" log --since="${WINDOW_DAYS} days ago" \
      --pretty='%s' -- "$file" 2>/dev/null || true)"
    if grep -qiE 'reopen|revert|fix|regression|wrong|incorrect|redo' <<<"$subjects"; then
      detectable=$((detectable + 1))
    fi
  # Paths are listed relative to DATA_DIR because `git -C "$DATA_DIR"` resolves
  # pathspecs from that directory; a path still prefixed with a relative
  # DATA_DIR would point at DATA_DIR/DATA_DIR/... and never match.
  done < <(cd "$DATA_DIR" && find . -type f -name '*.json' -print0 2>/dev/null)
else
  echo "provide --jsonl FILE or --data-dir DIR" >&2
  exit 2
fi
|
||||
|
||||
# Base rate as a percentage with one decimal place; reported as "0.0" when no
# done-like task was counted, so the division is skipped for an empty corpus.
rate="$(awk -v hits="$detectable" -v all="$done_total" 'BEGIN {
  if (all == 0) {
    print "0.0"
  } else {
    printf "%.1f", 100 * hits / all
  }
}')"

# Apply the pre-registered rule: below 20% means there is too little outcome
# signal to score calibration against.
verdict="$(awk -v pct="$rate" 'BEGIN {
  if (pct < 20.0) {
    print "KILL §7 — caveat-notes only"
  } else {
    print "signal present — proceed"
  }
}')"

if [[ "$FORMAT" == "md" ]]; then
  # Markdown report, one bullet per figure.
  printf '%s\n' \
    "## P3 — outcome detectability" \
    "" \
    "- done-like tasks: **${done_total}**" \
    "- with machine-detectable outcome (window ${WINDOW_DAYS}d): **${detectable}**" \
    "- base rate: **${rate}%**" \
    "- kill condition: ${KILL_CONDITION}" \
    "- verdict: **${verdict}**"
else
  # JSON report; awk does the number formatting (%d) for the counters.
  awk -v tasks="$done_total" -v hits="$detectable" -v pct="$rate" \
    -v days="$WINDOW_DAYS" -v outcome="$verdict" -v rule="$KILL_CONDITION" 'BEGIN {
      printf "{\n"
      printf " \"experiment\": \"P3-board-history\",\n"
      printf " \"window_days\": %d,\n", days
      printf " \"done_tasks\": %d,\n", tasks
      printf " \"detectable_outcomes\": %d,\n", hits
      printf " \"base_rate_pct\": %s,\n", pct
      printf " \"kill_condition\": \"%s\",\n", rule
      printf " \"verdict\": \"%s\"\n", outcome
      printf "}\n"
    }'
fi
|
||||
117
scripts/analysis/reflect-calibration.sh
Executable file
117
scripts/analysis/reflect-calibration.sh
Executable file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env bash
|
||||
# reflect-calibration.sh — Phase-0 experiment P1 (confidence signal)
|
||||
#
|
||||
# Question: does an agent's self-reported confidence discriminate correct from
|
||||
# incorrect work — especially on the self-rated-HIGH subset, where a closed
|
||||
# loop would actually trust it? If confidence ≈ chance on the high subset, the
|
||||
# signal is useless and design §7–§8 should not be built.
|
||||
#
|
||||
# Method: consume a labelled corpus — JSONL of {confidence: 0..1, correct:
|
||||
# true|false}. Compute discrimination as ROC AUC over all rows, plus the
|
||||
# correct-rate (lift) on the high-confidence subset (>= threshold), and compare
|
||||
# to the pre-registered chance baseline (the overall correct-rate). HARNESS +
|
||||
# RUBRIC; the labelled corpus is supplied later.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/analysis/reflect-calibration.sh --jsonl FILE [--high 0.8] [--json|--md]
|
||||
#
|
||||
# Requirements: jq, awk.
|
||||
#
|
||||
# PRE-REGISTERED KILL CONDITION:
|
||||
# AUC <= 0.60 OR high-subset lift <= +5pp over base rate
|
||||
# ⇒ confidence is not a usable routing signal; do NOT build §7–§8.
|
||||
|
||||
set -euo pipefail

# Defaults; each may be overridden by the options parsed below.
JSONL=""      # --jsonl FILE: labelled corpus, one JSON object per line
HIGH=0.8      # --high T: confidence threshold defining the "high" subset
FORMAT="json" # --json | --md: output format

# Print this script's leading comment header as the help text: every line after
# the shebang up to (not including) the first line that is not a comment.
# Derived from the file itself so it cannot drift out of step with the header
# the way a hard-coded `sed -n 'A,Bp'` line range does.
usage() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

# need_value OPTION ARGC — exit 2 when OPTION is the last argument and so has
# no value. Without this, `set -u` aborts on "$2" with a bare "unbound
# variable" message that does not name the offending option.
need_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "missing value for $1" >&2
    exit 2
  fi
}

# Parse command-line options into JSONL, HIGH and FORMAT.
# Exits 0 after printing help; exits 2 on an unknown option, a missing option
# value, or a --high that is not a plain non-negative decimal number.
parse_args() {
  local number_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --jsonl)
        need_value "$1" "$#"
        JSONL="$2"
        shift 2
        ;;
      --high)
        need_value "$1" "$#"
        HIGH="$2"
        shift 2
        ;;
      --json)
        FORMAT="json"
        shift
        ;;
      --md)
        FORMAT="md"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done

  # HIGH is compared numerically in awk and written verbatim as a JSON number,
  # so a non-numeric value would corrupt both the statistics and the output.
  if ! [[ "$HIGH" =~ $number_re ]]; then
    echo "--high must be a non-negative number: $HIGH" >&2
    exit 2
  fi
}

parse_args "$@"
|
||||
|
||||
KILL_CONDITION='AUC <= 0.60 OR high-subset lift <= +5pp ⇒ do NOT build §7–§8'
# Announce the rule on stderr so it is on record without polluting the report.
echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2

command -v jq >/dev/null 2>&1 || { echo "jq required" >&2; exit 3; }
[[ -r "$JSONL" ]] || { echo "provide a readable --jsonl FILE" >&2; exit 2; }

# Normalise to "<confidence> <0|1>" rows, one per labelled object.
# Each input line is parsed on its own (-R plus fromjson?), so a malformed or
# non-object line is skipped instead of aborting the parse and discarding the
# entire corpus, which is what slurping the whole file with -s did.
# Rows whose .confidence is not a number are dropped; .correct counts as 1 only
# when it is exactly true.
ROWS="$(jq -rR '
  fromjson?
  | select(type == "object")
  | select((.confidence | type) == "number")
  | "\(.confidence) \(if .correct == true then 1 else 0 end)"' "$JSONL" 2>/dev/null || true)"

# Fallback for input that is valid JSON but not one-object-per-line (for
# example pretty-printed objects), which the line-wise pass cannot read.
if [[ -z "$ROWS" ]]; then
  ROWS="$(jq -r '
    select(type == "object")
    | select((.confidence | type) == "number")
    | "\(.confidence) \(if .correct == true then 1 else 0 end)"' "$JSONL" 2>/dev/null || true)"
fi

# Nothing to score: emit a structured error and finish successfully so the
# harness stays non-fatal for callers.
if [[ -z "$ROWS" ]]; then
  echo '{ "experiment": "P1-calibration", "error": "no usable rows" }'
  exit 0
fi
|
||||
|
||||
# Corpus statistics from a single awk pass over ROWS ("<confidence> <0|1>"):
#   N, POS          row count and number of correct rows
#   BASE            overall correct-rate (the chance baseline)
#   AUC             ROC AUC from the rank-sum (Mann–Whitney U) relation, with
#                   tied confidences sharing their average rank; 0.5 when
#                   either class is empty
#   HIGH_N, HIGH_CORRECT, HIGH_RATE   size / correct count / correct-rate of
#                   the subset with confidence >= HIGH
#   LIFT            HIGH_RATE minus BASE
stats="$(printf '%s\n' "$ROWS" | awk -v cutoff="$HIGH" '
  {
    score[NR] = $1
    label[NR] = $2
    total++
    if ($2 == 1) hits++; else misses++
    if ($1 >= cutoff) {
      top_n++
      if ($2 == 1) top_hits++
    }
  }
  END {
    base = (total > 0) ? hits / total : 0

    # order[] lists row numbers sorted by ascending confidence (insertion sort).
    for (i = 1; i <= total; i++) order[i] = i
    for (i = 2; i <= total; i++) {
      cur = order[i]
      j = i - 1
      while (j >= 1 && score[order[j]] > score[cur]) {
        order[j + 1] = order[j]
        j--
      }
      order[j + 1] = cur
    }

    # Walk each run of equal confidences; every row in the run gets the mean
    # of the positions it spans, and positive rows add that rank to the sum.
    rank_sum = 0
    lo = 1
    while (lo <= total) {
      hi = lo
      while (hi < total && score[order[hi + 1]] == score[order[lo]]) hi++
      shared = (lo + hi) / 2.0
      for (k = lo; k <= hi; k++) {
        if (label[order[k]] == 1) rank_sum += shared
      }
      lo = hi + 1
    }

    if (hits > 0 && misses > 0) {
      auc = (rank_sum - hits * (hits + 1) / 2.0) / (hits * misses)
    } else {
      auc = 0.5
    }

    top_rate = (top_n > 0) ? top_hits / top_n : 0
    gain = top_rate - base
    printf "%d %d %.4f %.4f %d %d %.4f %.4f", total, hits, base, auc, top_n, top_hits, top_rate, gain
  }' || true)"
read -r N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT <<<"$stats"

# Apply the pre-registered rule to the (4-decimal) AUC and lift figures.
verdict="$(awk -v area="$AUC" -v gain="$LIFT" 'BEGIN {
  if (area <= 0.60 || gain <= 0.05) {
    print "KILL §7–§8 — confidence not usable"
  } else {
    print "signal present — proceed"
  }
}')"
|
||||
|
||||
if [[ "$FORMAT" == "md" ]]; then
  # Markdown report. Fractions are shown as percentages with one decimal; the
  # lift carries an explicit sign.
  base_pct="$(awk -v x="$BASE" 'BEGIN { printf "%.1f", 100 * x }')"
  high_pct="$(awk -v x="$HIGH_RATE" 'BEGIN { printf "%.1f", 100 * x }')"
  lift_pp="$(awk -v x="$LIFT" 'BEGIN { printf "%+.1f", 100 * x }')"
  printf '%s\n' \
    "## P1 — confidence calibration" \
    "" \
    "- rows: **${N}** (positives ${POS}) · base correct-rate **${base_pct}%**" \
    "- ROC AUC: **${AUC}**" \
    "- high-confidence subset (>= ${HIGH}): n=${HIGH_N}, correct=${HIGH_CORRECT}, rate=${high_pct}%" \
    "- lift over base: **${lift_pp}pp**" \
    "- kill condition: ${KILL_CONDITION}" \
    "- verdict: **${verdict}**"
else
  # JSON report; awk handles the integer and 4-decimal number formatting.
  awk -v rows="$N" -v hits="$POS" -v baseline="$BASE" -v area="$AUC" \
    -v top_n="$HIGH_N" -v top_hits="$HIGH_CORRECT" -v top_rate="$HIGH_RATE" \
    -v gain="$LIFT" -v cutoff="$HIGH" -v outcome="$verdict" \
    -v rule="$KILL_CONDITION" 'BEGIN {
      printf "{\n"
      printf " \"experiment\": \"P1-calibration\",\n"
      printf " \"rows\": %d,\n", rows
      printf " \"positives\": %d,\n", hits
      printf " \"base_rate\": %.4f,\n", baseline
      printf " \"auc\": %.4f,\n", area
      printf " \"high_threshold\": %s,\n", cutoff
      printf " \"high_subset\": { \"n\": %d, \"correct\": %d, \"rate\": %.4f },\n", top_n, top_hits, top_rate
      printf " \"lift_over_base\": %.4f,\n", gain
      printf " \"kill_condition\": \"%s\",\n", rule
      printf " \"verdict\": \"%s\"\n", outcome
      printf "}\n"
    }'
fi
|
||||
110
scripts/analysis/reflect-git-history.sh
Executable file
110
scripts/analysis/reflect-git-history.sh
Executable file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env bash
|
||||
# reflect-git-history.sh — Phase-0 experiment P2 ("only-self-reflection" bucket)
|
||||
#
|
||||
# Question: of the failures visible in git history, what fraction would ONLY
|
||||
# have been caught by end-of-run self-reflection — i.e. NOT by CI and NOT by
|
||||
# independent human review? If that bucket is near-empty, the closed
|
||||
# calibration / skill-synthesis loop (design §7–§8) is not worth building.
|
||||
#
|
||||
# Method: scan `git log` over a window for failure signals (reverts, and
|
||||
# fix:/hotfix commits landing shortly after a feature merge). Classify each by
|
||||
# the gate most likely to have caught it, using a pre-registered heuristic.
|
||||
# This is a HARNESS + RUBRIC; the classifier is deliberately simple and the
|
||||
# real corpus/labelling is wired later. It emits a structured tally.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/analysis/reflect-git-history.sh [--repo PATH] [--since SINCE] [--json|--md]
|
||||
#
|
||||
# Options:
|
||||
# --repo PATH repo to analyse (default: current repo)
|
||||
# --since SINCE git log --since value (default: "6 months ago")
|
||||
# --json emit JSON (default)
|
||||
# --md emit markdown
|
||||
#
|
||||
# Requirements: git, awk.
|
||||
#
|
||||
# PRE-REGISTERED KILL CONDITION:
|
||||
# bucket "only_self_reflection" is near-empty (< 10% of classified failures)
|
||||
# ⇒ do NOT build design §7–§8 (closed loop). Caveat-notes capture only.
|
||||
|
||||
set -euo pipefail

# Defaults; each may be overridden by the options parsed below.
REPO="."               # --repo PATH: repository to analyse
SINCE="6 months ago"   # --since SINCE: value handed to `git log --since`
FORMAT="json"          # --json | --md: output format

# Print this script's leading comment header as the help text: every line after
# the shebang up to (not including) the first line that is not a comment.
# Derived from the file itself so it cannot drift out of step with the header
# the way a hard-coded `sed -n 'A,Bp'` line range does.
usage() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

# need_value OPTION ARGC — exit 2 when OPTION is the last argument and so has
# no value. Without this, `set -u` aborts on "$2" with a bare "unbound
# variable" message that does not name the offending option.
need_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "missing value for $1" >&2
    exit 2
  fi
}

# Parse command-line options into REPO, SINCE and FORMAT.
# Exits 0 after printing help; exits 2 on an unknown option or a missing
# option value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --repo)
        need_value "$1" "$#"
        REPO="$2"
        shift 2
        ;;
      --since)
        need_value "$1" "$#"
        SINCE="$2"
        shift 2
        ;;
      --json)
        FORMAT="json"
        shift
        ;;
      --md)
        FORMAT="md"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
KILL_CONDITION='bucket only_self_reflection < 10% of classified failures ⇒ do NOT build §7–§8'
# Announce the rule on stderr so it is on record without polluting the report.
echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2

command -v git >/dev/null 2>&1 || { echo "git required" >&2; exit 3; }
# `git log` errors are silenced below, so a mistyped --repo would otherwise
# produce zero failures and a confident-looking KILL verdict. Fail loudly.
git -C "$REPO" rev-parse --git-dir >/dev/null 2>&1 \
  || { echo "not a git repository: $REPO" >&2; exit 3; }

# Subjects that mark a commit as a candidate failure: reverts and fix/hotfix.
candidate_re='revert|hotfix|hot-fix|regression|fix(\(|:|!| )'

# Pre-registered classification heuristic (gate most likely to have caught it),
# matched against the lower-cased subject:
#   - build/test/lint/type/ci signals          → CI would have caught it
#   - security/auth/permission/data/migration  → human review would flag it
#   - everything else (logic/UX/assumption/edge) → only-self-reflection bucket
# "ci" must stand alone as a word: as a bare substring it also matches
# "specific", "decision", "explicit", ... and inflates the CI bucket.
ci_re='test|lint|type|build|compile|typo|(^|[^[:alnum:]])ci([^[:alnum:]]|$)'
human_re='security|auth|permission|rbac|secret|migration|data|sql|injection'

total=0
ci=0
human=0
selfonly=0

# Stream the subjects rather than collecting them into an array first:
# expanding an empty array under `set -u` is an error before bash 4.4.
while IFS= read -r subj; do
  if [[ -z "$subj" ]]; then
    continue
  fi
  total=$((total + 1))
  lowered="${subj,,}"
  if [[ "$lowered" =~ $ci_re ]]; then
    ci=$((ci + 1))
  elif [[ "$lowered" =~ $human_re ]]; then
    human=$((human + 1))
  else
    selfonly=$((selfonly + 1))
  fi
done < <(
  # `|| true`: grep exits 1 when nothing matches, which is a valid empty result.
  git -C "$REPO" log --since="$SINCE" --pretty='%s' 2>/dev/null \
    | grep -iE "$candidate_re" || true
)
|
||||
|
||||
# pct NUM DEN — print NUM as a percentage of DEN with one decimal place, or
# "0.0" when DEN is zero.
pct() {
  awk -v num="$1" -v den="$2" 'BEGIN {
    if (den == 0) {
      print "0.0"
    } else {
      printf "%.1f", 100 * num / den
    }
  }'
}
|
||||
# Share of classified failures that fall in the only-self-reflection bucket.
self_pct="$(pct "$selfonly" "$total")"

# Apply the pre-registered rule: below 10% the bucket is considered near-empty.
verdict="$(awk -v share="$self_pct" 'BEGIN {
  if (share < 10.0) {
    print "KILL §7–§8"
  } else {
    print "signal present — proceed to deeper labelling"
  }
}')"

# json_escape STRING — print STRING with backslash, double quote and the common
# control characters escaped, so it can sit inside a JSON string literal.
# --repo and --since are free-form user input; emitted raw, a path containing
# a quote or a backslash produced invalid JSON.
json_escape() {
  local text=$1
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

if [[ "$FORMAT" == "md" ]]; then
  cat <<EOF
## P2 — git-history failure-gate attribution

- window: \`${SINCE}\` · repo: \`${REPO}\`
- classified failures: **${total}**

| gate | count | share |
|---|---:|---:|
| CI would catch | ${ci} | $(pct "$ci" "$total")% |
| human review would catch | ${human} | $(pct "$human" "$total")% |
| only-self-reflection | ${selfonly} | ${self_pct}% |

- kill condition: ${KILL_CONDITION}
- verdict: **${verdict}**
EOF
else
  # Emitted with the shell's printf rather than `awk -v`, because awk applies
  # backslash-escape processing to -v values and would mangle the strings.
  printf '{\n'
  printf ' "experiment": "P2-git-history",\n'
  printf ' "repo": "%s",\n' "$(json_escape "$REPO")"
  printf ' "since": "%s",\n' "$(json_escape "$SINCE")"
  printf ' "classified_failures": %d,\n' "$total"
  printf ' "buckets": { "ci": %d, "human_review": %d, "only_self_reflection": %d },\n' \
    "$ci" "$human" "$selfonly"
  printf ' "only_self_reflection_pct": %s,\n' "$self_pct"
  printf ' "kill_condition": "%s",\n' "$(json_escape "$KILL_CONDITION")"
  printf ' "verdict": "%s"\n' "$(json_escape "$verdict")"
  printf '}\n'
fi
|
||||
Reference in New Issue
Block a user