stack/scripts/analysis/reflect-calibration.sh

#!/usr/bin/env bash
# reflect-calibration.sh — Phase-0 experiment P1 (confidence signal)
#
# Question: does an agent's self-reported confidence discriminate correct from
# incorrect work — especially on the self-rated-HIGH subset, where a closed
# loop would actually trust it? If confidence ≈ chance on the high subset, the
# signal is useless and design §7–§8 should not be built.
#
# Method: consume a labelled corpus — JSONL of {confidence: 0..1, correct:
# true|false}. Compute discrimination as ROC AUC over all rows, plus the
# correct-rate (lift) on the high-confidence subset (>= threshold), and compare
# to the pre-registered chance baseline (the overall correct-rate). HARNESS +
# RUBRIC; the labelled corpus is supplied later.
#
# Usage:
#   scripts/analysis/reflect-calibration.sh --jsonl FILE [--high 0.8] [--json|--md]
#
# Requirements: jq, awk.
#
# PRE-REGISTERED KILL CONDITION:
#   AUC <= 0.60 OR high-subset lift <= +5pp over base rate
#   ⇒ confidence is not a usable routing signal; do NOT build §7–§8.

set -euo pipefail

# Defaults; overridden by the command-line flags handled in parse_args.
JSONL=""
HIGH=0.8
FORMAT="json"

# Print the header comment that follows the shebang of FILE ($1, default: this
# script). Output stops at the first non-comment line, so no code leaks into
# the --help text regardless of how long the header is.
usage() {
  awk '
    !started { if (/^#!/) started = 1; next }
    !/^#/    { exit }
    { print }
  ' "${1:-$0}"
}

# Exit with status 2 unless the option named $1 still has a value after it.
# $2 is the number of remaining arguments, the option itself included.
require_value() {
  if (( $2 < 2 )); then
    echo "$1 requires a value" >&2
    exit 2
  fi
}

# Parse the command line into JSONL, HIGH and FORMAT.
# Exits 0 after --help; exits 2 on an unknown flag, a flag missing its value,
# or a --high value that is not a plain decimal/exponent number (a non-numeric
# threshold would otherwise be compared as a string further down).
parse_args() {
  local numeric_re='^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)([eE][+-]?[0-9]+)?$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --jsonl) require_value "$1" "$#"; JSONL="$2"; shift 2 ;;
      --high) require_value "$1" "$#"; HIGH="$2"; shift 2 ;;
      --json) FORMAT="json"; shift ;;
      --md) FORMAT="md"; shift ;;
      -h|--help) usage; exit 0 ;;
      *) echo "unknown arg: $1" >&2; exit 2 ;;
    esac
  done

  if [[ ! "$HIGH" =~ $numeric_re ]]; then
    echo "--high must be a number, got: $HIGH" >&2
    exit 2
  fi
}

parse_args "$@"

# Put the pre-registered kill condition on stderr before the corpus is read.
KILL_CONDITION='AUC <= 0.60 OR high-subset lift <= +5pp ⇒ do NOT build §7–§8'
printf '%s\n' "# pre-registered kill condition: ${KILL_CONDITION}" >&2

# Preconditions: jq must be on PATH (exit 3 otherwise) and the corpus path must
# be readable (exit 2 otherwise).
if ! command -v jq >/dev/null 2>&1; then
  echo "jq required" >&2
  exit 3
fi
if [[ ! -r "$JSONL" ]]; then
  echo "provide a readable --jsonl FILE" >&2
  exit 2
fi

# Normalise the corpus FILE ($1) to "<confidence> <0|1>" rows on stdout.
#
# The file is read as raw lines (-R) and each line is decoded on its own with
# `fromjson?`, so a malformed line is skipped instead of aborting the whole
# parse (slurping the file with -s loses every row on the first syntax error).
# `objects` drops valid-JSON lines that are not objects, which would otherwise
# raise on `.confidence`. Rows whose confidence is not a JSON number are
# dropped; any `correct` value other than literal true counts as 0.
# Expects one JSON value per line (JSONL). jq diagnostics are deliberately
# muted and failures ignored: an unreadable file simply yields no rows.
normalise_rows() {
  jq -rR '
    fromjson?
    | objects
    | select((.confidence | type) == "number")
    | "\(.confidence) \(if .correct == true then 1 else 0 end)"
  ' "$1" 2>/dev/null || true
}

ROWS="$(normalise_rows "$JSONL")"

# Nothing survived normalisation: report that as a JSON error object on stdout
# and stop with status 0.
[[ -n "$ROWS" ]] || {
  printf '%s\n' '{ "experiment": "P1-calibration", "error": "no usable rows" }'
  exit 0
}

# compute_stats THRESHOLD
#   stdin : "<confidence> <0|1>" rows, in any order
#   stdout: "N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT" (one line, no
#           trailing newline)
#
# AUC uses the Mann–Whitney U relation: with R the sum of the ranks of the
# positive rows (ties share the average of the ranks they occupy),
#   AUC = (R - pos*(pos+1)/2) / (pos*neg),
# falling back to 0.5 when either class is empty. BASE is the overall
# correct-rate, HIGH_* describe rows with confidence >= THRESHOLD, and
# LIFT = HIGH_RATE - BASE (HIGH_RATE is 0 when the subset is empty).
#
# Rows are ordered by sort(1) with -g (general numeric, so exponent notation
# such as 1e-05 sorts correctly) and ranked in one streaming pass, replacing a
# quadratic in-awk sort. LC_ALL=C keeps "." as the decimal separator for both
# tools.
compute_stats() {
  LC_ALL=C sort -g -k1,1 | LC_ALL=C awk -v high="$1" '
    # Close the current run of equal confidences: every member gets the
    # average of ranks run_start .. run_start+run_n-1; only positives add to R.
    function close_run() {
      if (run_n > 0) rank_sum += run_pos * (run_start + (run_n - 1) / 2.0)
    }
    {
      c = $1; y = $2; n++
      if (y == 1) pos++; else neg++
      if (c >= high) { hn++; if (y == 1) hc++ }
      # A new confidence value starts a new tie run at rank n.
      if (n == 1 || c != prev) {
        close_run()
        run_start = n; run_n = 0; run_pos = 0; prev = c
      }
      run_n++
      if (y == 1) run_pos++
    }
    END {
      close_run()
      base = (n > 0) ? pos / n : 0
      if (pos > 0 && neg > 0) {
        auc = (rank_sum - pos * (pos + 1) / 2.0) / (pos * neg)
      } else {
        auc = 0.5
      }
      hrate = (hn > 0) ? hc / hn : 0
      lift = hrate - base
      printf "%d %d %.4f %.4f %d %d %.4f %.4f", n, pos, base, auc, hn, hc, hrate, lift
    }'
}

# The here-document supplies the newline that read needs to return success.
read -r N POS BASE AUC HIGH_N HIGH_CORRECT HIGH_RATE LIFT <<EOF
$(printf '%s\n' "$ROWS" | compute_stats "$HIGH")
EOF

# Apply the pre-registered kill condition to the reported AUC and lift.
verdict="$(
  awk -v a="$AUC" -v l="$LIFT" 'BEGIN {
    if (a <= 0.60 || l <= 0.05) {
      print "KILL §7–§8 — confidence not usable"
    } else {
      print "signal present — proceed"
    }
  }'
)"

# pct VALUE FMT — print 100*VALUE using printf format FMT. The value reaches
# awk through -v instead of being spliced into the program text.
pct() {
  awk -v x="$1" -v fmt="$2" 'BEGIN { printf fmt, 100 * x }'
}

# Markdown report on stdout, built from the N, POS, BASE, AUC, HIGH, HIGH_N,
# HIGH_CORRECT, HIGH_RATE, LIFT, KILL_CONDITION and verdict globals.
emit_markdown() {
  local base_pct high_pct lift_pp
  base_pct="$(pct "$BASE" '%.1f')"
  high_pct="$(pct "$HIGH_RATE" '%.1f')"
  lift_pp="$(pct "$LIFT" '%+.1f')"
  cat <<EOF
## P1 — confidence calibration

- rows: **${N}** (positives ${POS}) · base correct-rate **${base_pct}%**
- ROC AUC: **${AUC}**
- high-confidence subset (>= ${HIGH}): n=${HIGH_N}, correct=${HIGH_CORRECT}, rate=${high_pct}%
- lift over base: **${lift_pp}pp**
- kill condition: ${KILL_CONDITION}
- verdict: **${verdict}**
EOF
}

# JSON report on stdout, built from the same globals as emit_markdown.
# high_threshold is written verbatim when the --high text is already a valid
# JSON number; otherwise (".8", "1.", "+0.8", ...) it is coerced to a number so
# the document stays parseable.
emit_json() {
  awk -v n="$N" -v pos="$POS" -v base="$BASE" -v auc="$AUC" -v hn="$HIGH_N" \
      -v hc="$HIGH_CORRECT" -v hr="$HIGH_RATE" -v lift="$LIFT" -v high="$HIGH" \
      -v v="$verdict" -v kc="$KILL_CONDITION" 'BEGIN{
    if (high ~ /^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?$/)
      json_high = high
    else
      json_high = sprintf("%.10g", high + 0)
    printf "{\n"
    printf "  \"experiment\": \"P1-calibration\",\n"
    printf "  \"rows\": %d,\n", n
    printf "  \"positives\": %d,\n", pos
    printf "  \"base_rate\": %.4f,\n", base
    printf "  \"auc\": %.4f,\n", auc
    printf "  \"high_threshold\": %s,\n", json_high
    printf "  \"high_subset\": { \"n\": %d, \"correct\": %d, \"rate\": %.4f },\n", hn, hc, hr
    printf "  \"lift_over_base\": %.4f,\n", lift
    printf "  \"kill_condition\": \"%s\",\n", kc
    printf "  \"verdict\": \"%s\"\n", v
    printf "}\n"
  }'
}

if [[ "$FORMAT" == "md" ]]; then
  emit_markdown
else
  emit_json
fi