feat(agent-reflection): durable kernel — reflection.v1 capture + risk-floor + Phase-0 (#545)
This commit was merged in pull request #545.
This commit is contained in:
117
scripts/analysis/reflect-calibration.sh
Executable file
117
scripts/analysis/reflect-calibration.sh
Executable file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env bash
|
||||
# reflect-calibration.sh — Phase-0 experiment P1 (confidence signal)
|
||||
#
|
||||
# Question: does an agent's self-reported confidence discriminate correct from
|
||||
# incorrect work — especially on the self-rated-HIGH subset, where a closed
|
||||
# loop would actually trust it? If confidence ≈ chance on the high subset, the
|
||||
# signal is useless and design §7–§8 should not be built.
|
||||
#
|
||||
# Method: consume a labelled corpus — JSONL of {confidence: 0..1, correct:
|
||||
# true|false}. Compute discrimination as ROC AUC over all rows, plus the
|
||||
# correct-rate (lift) on the high-confidence subset (>= threshold), and compare
|
||||
# to the pre-registered chance baseline (the overall correct-rate). HARNESS +
|
||||
# RUBRIC; the labelled corpus is supplied later.
|
||||
#
|
||||
# Usage:
|
||||
# scripts/analysis/reflect-calibration.sh --jsonl FILE [--high 0.8] [--json|--md]
|
||||
#
|
||||
# Requirements: jq, awk.
|
||||
#
|
||||
# PRE-REGISTERED KILL CONDITION:
|
||||
# AUC <= 0.60 OR high-subset lift <= +5pp over base rate
|
||||
# ⇒ confidence is not a usable routing signal; do NOT build §7–§8.
|
||||
|
||||
set -euo pipefail

# Defaults; parse_args overrides them from the command line.
JSONL=""
HIGH=0.8
FORMAT="json"

# usage — print this script's leading comment header (everything after the
# shebang up to the first non-comment line). Scanning for the end of the
# header avoids a hard-coded line range that drifts whenever the header is
# edited and starts leaking code lines into the help text.
usage() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

# die_usage MSG — report a command-line error on stderr and exit 2.
die_usage() {
  printf '%s\n' "$1" >&2
  exit 2
}

# parse_args ARGS... — populate JSONL, HIGH and FORMAT from the command line.
#   --jsonl FILE   labelled corpus to read
#   --high NUM     high-confidence threshold (default 0.8)
#   --json | --md  output format (default json)
#   -h | --help    print the header comment and exit 0
# Exits 2 on an unknown option, on an option that is missing its value, or
# when --high is not a number.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --jsonl)
        # Check arity explicitly: under `set -u` a bare "$2" would abort with
        # an unhelpful "unbound variable" message.
        [[ $# -ge 2 ]] || die_usage "missing value for $1"
        JSONL="$2"
        shift 2
        ;;
      --high)
        [[ $# -ge 2 ]] || die_usage "missing value for $1"
        HIGH="$2"
        shift 2
        ;;
      --json)
        FORMAT="json"
        shift
        ;;
      --md)
        FORMAT="md"
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        die_usage "unknown arg: $1"
        ;;
    esac
  done

  # HIGH is compared numerically inside awk and echoed into the report; a
  # non-numeric value would silently become a string comparison there.
  local num_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'
  [[ "$HIGH" =~ $num_re ]] || die_usage "--high must be a number, got: $HIGH"
}

parse_args "$@"
|
||||
|
||||
# The kill condition is announced on stderr before any data is read, so the
# threshold is on record ahead of the result.
KILL_CONDITION='AUC <= 0.60 OR high-subset lift <= +5pp ⇒ do NOT build §7–§8'
echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2

# Both external tools named in the header are needed; exit 3 if either is
# missing.
for _tool in jq awk; do
  command -v "$_tool" >/dev/null 2>&1 || { echo "${_tool} required" >&2; exit 3; }
done
unset _tool

# -r alone is also true for a directory, which jq cannot read; reject it
# here so the mistake is reported as a usage error rather than surfacing
# later as "no usable rows". Pipes and process substitutions still pass.
[[ -r "$JSONL" && ! -d "$JSONL" ]] || { echo "provide a readable --jsonl FILE" >&2; exit 2; }
|
||||
|
||||
# Normalise to "<confidence> <0|1>" rows; tolerate bad lines.
|
||||
# normalise_rows FILE — print one "<confidence> <0|1>" row per usable record.
#
# A record is usable when it is a JSON object whose .confidence is a number;
# the label is 1 only when .correct is exactly true, otherwise 0.
#
# Two passes, so that one bad line cannot discard the whole corpus:
#   1. Fast path: treat FILE as a stream of JSON values (this also accepts
#      records that span several lines).
#   2. If that fails to parse, re-read FILE line by line as raw text and keep
#      only the lines that decode as JSON (`fromjson?` drops the rest).
# Best-effort by design: jq diagnostics are suppressed and a total failure
# yields empty output, which the caller reports as "no usable rows".
normalise_rows() {
  local file=$1 rows
  local filter='select(type == "object")
    | select((.confidence | type) == "number")
    | "\(.confidence) \(if .correct == true then 1 else 0 end)"'

  if rows="$(jq -r "$filter" "$file" 2>/dev/null)"; then
    printf '%s\n' "$rows"
    return 0
  fi

  jq -rR "fromjson? | ${filter}" "$file" 2>/dev/null || true
}

# Normalise to "<confidence> <0|1>" rows; tolerate bad lines.
ROWS="$(normalise_rows "$JSONL")"
|
||||
|
||||
# No record survived normalisation: emit a small JSON error object on stdout
# and stop with status 0.
[[ -n "$ROWS" ]] || {
  printf '%s\n' '{ "experiment": "P1-calibration", "error": "no usable rows" }'
  exit 0
}
|
||||
|
||||
# AUC via the Mann–Whitney U relation (rank-based); base rate; high-subset lift.
|
||||
# compute_stats HIGH — read "<confidence> <0|1>" rows on stdin and print
#   N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT
# on one line (no trailing newline).
#
#   BASE       overall correct-rate, POS / N
#   AUC        ROC AUC via the Mann–Whitney U relation: sum the ranks of the
#              positive rows (ties share their average rank), then
#              (rank_sum - POS*(POS+1)/2) / (POS*NEG); 0.5 when either class
#              is empty
#   HIGH_*     count, correct count and correct-rate of rows whose
#              confidence is >= HIGH
#   LIFT       HIGH_RATE - BASE
#
# Rows are ordered by `sort -g` before awk sees them, so ranks are assigned
# in a single pass instead of an O(n^2) in-awk exchange sort. -g (general
# numeric) is used rather than -n because jq prints small values in exponent
# form (e.g. 1e-05). Lines with fewer than two fields are ignored.
compute_stats() {
  local high=$1
  LC_ALL=C sort -g -k1,1 | awk -v high="$high" '
    NF < 2 { next }
    {
      n++
      conf[n] = $1 + 0            # force numeric comparison semantics
      lab[n]  = ($2 == 1)
      if (lab[n]) pos++; else neg++
      if (conf[n] >= high + 0) { hn++; if (lab[n]) hc++ }
    }
    END {
      base = (n > 0) ? pos / n : 0

      # Input is sorted ascending: walk each run of equal confidences and
      # credit every positive in the run with the run average rank.
      rsum = 0
      i = 1
      while (i <= n) {
        j = i
        while (j < n && conf[j + 1] == conf[i]) j++
        avg = (i + j) / 2.0
        for (k = i; k <= j; k++) if (lab[k]) rsum += avg
        i = j + 1
      }

      if (pos > 0 && neg > 0) auc = (rsum - pos * (pos + 1) / 2.0) / (pos * neg)
      else auc = 0.5

      hrate = (hn > 0) ? hc / hn : 0
      lift = hrate - base
      printf "%d %d %.4f %.4f %d %d %.4f %.4f", n, pos, base, auc, hn, hc, hrate, lift
    }'
}

# AUC via the Mann–Whitney U relation (rank-based); base rate; high-subset lift.
# The pipeline output is captured in a plain assignment (not inside a
# heredoc) so that a sort/awk failure is visible and stops the script
# instead of feeding empty statistics into the report.
STATS="$(printf '%s\n' "$ROWS" | compute_stats "$HIGH")" \
  || { echo "failed to compute statistics" >&2; exit 1; }
read -r N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT <<<"$STATS"
|
||||
|
||||
# compute_verdict AUC LIFT — apply the pre-registered kill condition
# (AUC <= 0.60 OR lift <= 0.05) and print the verdict string.
compute_verdict() {
  awk -v auc="$1" -v lift="$2" 'BEGIN {
    if (auc <= 0.60 || lift <= 0.05) print "KILL §7–§8 — confidence not usable"
    else print "signal present — proceed"
  }'
}

# pct FRACTION FMT — print 100*FRACTION using the printf format FMT. The
# value is passed with -v instead of being spliced into the awk program
# text, so an empty or odd value cannot become an awk syntax error.
pct() {
  awk -v x="$1" -v fmt="$2" 'BEGIN { printf fmt, 100 * x }'
}

# render_md — Markdown report built from the global result variables
# (N POS BASE AUC HIGH HIGH_N HIGH_CORRECT HIGH_RATE LIFT KILL_CONDITION
# verdict).
render_md() {
  cat <<EOF
## P1 — confidence calibration

- rows: **${N}** (positives ${POS}) · base correct-rate **$(pct "$BASE" '%.1f')%**
- ROC AUC: **${AUC}**
- high-confidence subset (>= ${HIGH}): n=${HIGH_N}, correct=${HIGH_CORRECT}, rate=$(pct "$HIGH_RATE" '%.1f')%
- lift over base: **$(pct "$LIFT" '%+.1f')pp**
- kill condition: ${KILL_CONDITION}
- verdict: **${verdict}**
EOF
}

# render_json — JSON report built from the same global result variables.
# high_threshold is echoed verbatim only when it already is a valid JSON
# number; otherwise (e.g. ".8", "+0.8", "1.") it is re-printed from its
# numeric value so the document stays parseable.
render_json() {
  awk -v n="$N" -v pos="$POS" -v base="$BASE" -v auc="$AUC" -v hn="$HIGH_N" \
      -v hc="$HIGH_CORRECT" -v hr="$HIGH_RATE" -v lift="$LIFT" -v high="$HIGH" \
      -v v="$verdict" -v kc="$KILL_CONDITION" 'BEGIN {
    if (high ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$/) high_json = high
    else high_json = sprintf("%.10g", high + 0)

    printf "{\n"
    printf " \"experiment\": \"P1-calibration\",\n"
    printf " \"rows\": %d,\n", n
    printf " \"positives\": %d,\n", pos
    printf " \"base_rate\": %.4f,\n", base
    printf " \"auc\": %.4f,\n", auc
    printf " \"high_threshold\": %s,\n", high_json
    printf " \"high_subset\": { \"n\": %d, \"correct\": %d, \"rate\": %.4f },\n", hn, hc, hr
    printf " \"lift_over_base\": %.4f,\n", lift
    printf " \"kill_condition\": \"%s\",\n", kc
    printf " \"verdict\": \"%s\"\n", v
    printf "}\n"
  }'
}

verdict="$(compute_verdict "$AUC" "$LIFT")"

if [[ "$FORMAT" == "md" ]]; then
  render_md
else
  render_json
fi
|
||||
Reference in New Issue
Block a user