#!/usr/bin/env bash
# reflect-calibration.sh — Phase-0 experiment P1 (confidence signal)
#
# Question: does an agent's self-reported confidence discriminate correct from
# incorrect work — especially on the self-rated-HIGH subset, where a closed
# loop would actually trust it? If confidence ≈ chance on the high subset, the
# signal is useless and design §7–§8 should not be built.
#
# Method: consume a labelled corpus — JSONL of {confidence: 0..1, correct:
# true|false}. Compute discrimination as ROC AUC over all rows, plus the
# correct-rate (lift) on the high-confidence subset (>= threshold), and compare
# to the pre-registered chance baseline (the overall correct-rate). HARNESS +
# RUBRIC; the labelled corpus is supplied later.
#
# Usage:
#   scripts/analysis/reflect-calibration.sh --jsonl FILE [--high 0.8] [--json|--md]
#
# Requirements: jq, awk, sort (with -g general-numeric ordering).
#
# Exit status:
#   0  report printed (this includes the "no usable rows" report)
#   2  usage error or unreadable --jsonl FILE
#   3  a required tool is missing
#
# PRE-REGISTERED KILL CONDITION:
#   AUC <= 0.60  OR  high-subset lift <= +5pp over base rate
#   ⇒ confidence is not a usable routing signal; do NOT build §7–§8.

readonly KILL_CONDITION='AUC <= 0.60 OR high-subset lift <= +5pp ⇒ do NOT build §7–§8'

#######################################
# Print a message to stderr and exit.
# Arguments: $1 - exit status; $2.. - message
#######################################
die() {
  local -r status=$1
  shift
  printf '%s\n' "$*" >&2
  exit "$status"
}

#######################################
# Print the header comment block of this script (everything after the shebang
# up to the first non-comment line), so --help never depends on line numbers.
#######################################
print_usage() {
  awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
}

#######################################
# Normalise a labelled corpus to "<confidence> <0|1>" rows on stdout.
# Rows whose .confidence is not a JSON number are dropped; .correct counts as
# 1 only when it is the JSON boolean true.
# Arguments: $1 - path to the JSONL corpus
# Returns:   always 0; prints nothing when no row is usable
#######################################
normalise_rows() {
  local -r file=$1
  local -r to_row='
    objects
    | select((.confidence | type) == "number")
    | "\(.confidence) \(if .correct == true then 1 else 0 end)"'
  local rows

  # First try a strict parse of the whole stream (this also accepts values
  # that span several lines). If anything in the file is malformed, discard
  # the partial output and re-read it line by line, skipping only the lines
  # that are not valid JSON, so one bad line cannot void the corpus.
  if ! rows="$(jq -r "$to_row" <"$file" 2>/dev/null)"; then
    rows="$(jq -rR "fromjson? | ${to_row}" <"$file" 2>/dev/null)" || rows=""
  fi

  if [[ -n "$rows" ]]; then
    printf '%s\n' "$rows"
  fi
  return 0
}

#######################################
# Compute calibration statistics from "<confidence> <0|1>" rows on stdin.
# AUC comes from the Mann–Whitney U relation using average ranks for ties;
# rows are pre-sorted by confidence so tie groups can be ranked in one pass.
# Arguments: $1 - high-confidence threshold (subset is confidence >= $1)
# Outputs:   one line: N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT KILL
#            (rates/AUC/lift to 4 decimals; KILL is 1 when the pre-registered
#            kill condition holds, else 0)
#######################################
calibration_stats() {
  local -r high=$1
  LC_ALL=C sort -g -k1,1 \
    | LC_ALL=C awk -v high="$high" '
        # Give every member of the finished tie group the group average rank.
        function close_group() {
          if (gsize == 0) return
          rank_sum += gpos * (gstart + (gsize - 1) / 2.0)
          gstart += gsize
          gsize = 0
          gpos = 0
        }
        BEGIN { gstart = 1 }
        NF < 2 { next }
        {
          c = $1 + 0
          y = ($2 == 1)
          if (gsize > 0 && c != prev) close_group()
          prev = c
          gsize++
          n++
          if (y) { gpos++; pos++ }
          if (c >= high + 0) { hn++; if (y) hc++ }
        }
        END {
          close_group()
          neg = n - pos
          base = (n > 0) ? pos / n : 0
          u = rank_sum - pos * (pos + 1) / 2.0
          auc = (pos > 0 && neg > 0) ? u / (pos * neg) : 0.5
          hrate = (hn > 0) ? hc / hn : 0
          lift = hrate - base

          # Decide the kill condition on exact counts, not on the rounded
          # figures printed below and not on binary-float differences:
          #   AUC  <= 0.60  <=>  5*U <= 3*pos*neg
          #   lift <= 0.05  <=>  20*(hc*n - pos*hn) <= hn*n
          auc_low = (pos > 0 && neg > 0) ? (5 * u <= 3 * pos * neg) : 1
          lift_low = (hn > 0 && n > 0) ? (20 * (hc * n - pos * hn) <= hn * n) : 1
          kill = (auc_low || lift_low) ? 1 : 0

          printf "%d %d %.4f %.4f %d %d %.4f %.4f %d\n", \
            n, pos, base, auc, hn, hc, hrate, lift, kill
        }'
}

#######################################
# Map the KILL flag from calibration_stats to the verdict sentence.
# Arguments: $1 - 1 when the kill condition holds, anything else otherwise
#######################################
verdict_for() {
  if [[ "$1" == "1" ]]; then
    printf '%s\n' 'KILL §7–§8 — confidence not usable'
  else
    printf '%s\n' 'signal present — proceed'
  fi
}

#######################################
# Print 100 * $1 using the awk printf format in $2 (e.g. '%.1f', '%+.1f').
#######################################
to_percent() {
  LC_ALL=C awk -v x="$1" -v fmt="$2" 'BEGIN { printf fmt, 100 * x }'
}

#######################################
# Render the report as markdown.
# Arguments: n pos base auc high high_n high_correct high_rate lift verdict
#######################################
render_markdown() {
  local -r n=$1 pos=$2 base=$3 auc=$4 high=$5
  local -r high_n=$6 high_correct=$7 high_rate=$8 lift=$9 verdict=${10}

  # NOTE(review): the title and the rows / base-rate / AUC lines are a
  # reconstruction — confirm their wording against the intended report layout.
  printf '# P1 — confidence calibration\n\n'
  printf -- '- rows: %s (correct: %s)\n' "$n" "$pos"
  printf -- '- base rate: %s%%\n' "$(to_percent "$base" '%.1f')"
  printf -- '- ROC AUC: %s\n' "$auc"
  printf -- '- high subset (>= %s): n=%s, correct=%s, rate=%s%%\n' \
    "$high" "$high_n" "$high_correct" "$(to_percent "$high_rate" '%.1f')"
  printf -- '- lift over base: **%spp**\n' "$(to_percent "$lift" '%+.1f')"
  printf -- '- kill condition: %s\n' "$KILL_CONDITION"
  printf -- '- verdict: **%s**\n' "$verdict"
}

#######################################
# Render the report as a JSON object.
# Arguments: n pos base auc high high_n high_correct high_rate lift verdict
# The numeric arguments are already formatted, so they are emitted verbatim.
#######################################
render_json() {
  local -r n=$1 pos=$2 base=$3 auc=$4 high=$5
  local -r high_n=$6 high_correct=$7 high_rate=$8 lift=$9 verdict=${10}

  printf '{\n'
  printf '  "experiment": "P1-calibration",\n'
  printf '  "rows": %s,\n' "$n"
  printf '  "positives": %s,\n' "$pos"
  printf '  "base_rate": %s,\n' "$base"
  printf '  "auc": %s,\n' "$auc"
  printf '  "high_threshold": %s,\n' "$high"
  printf '  "high_subset": { "n": %s, "correct": %s, "rate": %s },\n' \
    "$high_n" "$high_correct" "$high_rate"
  printf '  "lift_over_base": %s,\n' "$lift"
  printf '  "kill_condition": "%s",\n' "$KILL_CONDITION"
  printf '  "verdict": "%s"\n' "$verdict"
  printf '}\n'
}

#######################################
# Entry point: parse arguments, load the corpus, print the report.
# The body is a subshell so strict mode and every `exit` stay scoped to the
# run; the script's exit status is this function's status.
# Arguments: the script's command-line arguments
#######################################
main() (
  set -euo pipefail

  local jsonl="" high="0.8" format="json"
  while (($# > 0)); do
    case "$1" in
      --jsonl)
        (($# >= 2)) || die 2 "--jsonl requires a FILE argument"
        jsonl=$2
        shift 2
        ;;
      --high)
        (($# >= 2)) || die 2 "--high requires a numeric argument"
        high=$2
        shift 2
        ;;
      --json)
        format="json"
        shift
        ;;
      --md)
        format="md"
        shift
        ;;
      -h | --help)
        print_usage
        exit 0
        ;;
      *)
        die 2 "unknown arg: $1"
        ;;
    esac
  done

  # The threshold is echoed verbatim into the JSON report, so it must be a
  # plain non-negative number; a bare leading dot is padded to stay valid JSON.
  local -r number_re='^([0-9]+(\.[0-9]+)?|\.[0-9]+)([eE][-+]?[0-9]+)?$'
  [[ "$high" =~ $number_re ]] \
    || die 2 "--high must be a non-negative number, got: ${high}"
  if [[ "$high" == .* ]]; then
    high="0${high}"
  fi

  echo "# pre-registered kill condition: ${KILL_CONDITION}" >&2

  local tool
  for tool in jq awk sort; do
    command -v "$tool" >/dev/null 2>&1 || die 3 "${tool} required"
  done
  [[ -r "$jsonl" ]] || die 2 "provide a readable --jsonl FILE"

  local rows
  rows="$(normalise_rows "$jsonl")"
  if [[ -z "$rows" ]]; then
    echo '{ "experiment": "P1-calibration", "error": "no usable rows" }'
    exit 0
  fi

  local stats
  stats="$(printf '%s\n' "$rows" | calibration_stats "$high")"

  local n pos base auc high_n high_correct high_rate lift kill
  read -r n pos base auc high_n high_correct high_rate lift kill <<<"$stats"

  local verdict
  verdict="$(verdict_for "$kill")"

  if [[ "$format" == "md" ]]; then
    render_markdown "$n" "$pos" "$base" "$auc" "$high" \
      "$high_n" "$high_correct" "$high_rate" "$lift" "$verdict"
  else
    render_json "$n" "$pos" "$base" "$auc" "$high" \
      "$high_n" "$high_correct" "$high_rate" "$lift" "$verdict"
  fi
)

main "$@"