feat(agent-reflection): durable kernel — reflection.v1 capture + risk-floor + Phase-0 (#544)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful

Build the durable kernel of the agent reflection loop. Passive end-of-run
capture of the doer's end-state as structured `reflection.v1` data, plus a
deterministic diff review risk-floor. The closed calibration/skill-synthesis
loop (design §7–§8) stays gated behind Phase-0 experiments P1/P2/P3.

- packages/macp: evaluateRiskFloor (pure, deterministic surface classifier)
  + reflection.v1 JSON Schema; 15 unit tests.
- packages/types: reflection.v1 zod schemas + self-report DTO; 10 unit tests.
- framework: fail-closed Stop hook (reflect-stop-hook.sh) writing the sidecar,
  registered as hooks.Stop in runtime/claude/settings.json. Strict no-op unless
  REFLECTION_MODE=solo|orchestrated; never blocks or fails a session.
- scripts/analysis: P1/P2/P3 experiment harnesses with pre-registered kill
  conditions and structured output.

Mechanical fields (risk, files_changed, ids, provenance) are written by the
hook; self-report fields (confidence, most_likely_wrong, known_not_in_diff) are
merged from an optional $REFLECTION_INPUT, else null + provenance.degraded=true.

Independent review remediations: empty/all-.mosaic diff still writes a sidecar
(grep no-match no longer aborts); session_id sanitized before path use.

Refs #544

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Hermes Agent
2026-06-16 15:55:15 -05:00
parent c461380a4a
commit b76666166e
17 changed files with 1498 additions and 0 deletions

View File

@@ -39,6 +39,11 @@ export { normalizeGate, runShell, countAIFindings, runGate, runGates } from './g
export type { NormalizedGate } from './gate-runner.js';
// Risk-floor (agent reflection loop — diff review classifier)
export { evaluateRiskFloor, DEFAULT_RISK_THRESHOLD } from './risk-floor.js';
export type { ReviewSurface, RiskFloorInput, RiskFloorVerdict } from './risk-floor.js';
// Event emitter
export { nowISO, appendEvent, emitEvent } from './event-emitter.js';

View File

@@ -0,0 +1,87 @@
import { describe, expect, it } from 'vitest';
import { DEFAULT_RISK_THRESHOLD, evaluateRiskFloor, type ReviewSurface } from './risk-floor.js';
describe('evaluateRiskFloor', () => {
it('returns a no-review "none" verdict for an empty diff', () => {
const v = evaluateRiskFloor({ filesChanged: [] });
expect(v).toEqual({
needs_review: false,
score: 0,
surface: 'none',
reason: 'no files changed',
});
});
it('ignores empty/non-string entries', () => {
const v = evaluateRiskFloor({ filesChanged: ['', ' ' as unknown as string].filter(Boolean) });
// only the whitespace string survives the Boolean filter; it classifies to none
expect(v.surface).toBe('none');
expect(v.needs_review).toBe(false);
});
it.each<[string, string, ReviewSurface, boolean]>([
['auth', 'apps/api/src/auth/session.guard.ts', 'auth', true],
['data', 'packages/db/migrations/0007_add_users.sql', 'data', true],
['infra', '.woodpecker/deploy.yml', 'infra', true],
['build', 'packages/types/tsconfig.json', 'build', true],
['ui', 'apps/web/src/components/Button.tsx', 'ui', false],
['test', 'packages/macp/src/risk-floor.spec.ts', 'test', false],
['docs', 'docs/plans/agent-reflection-loop-PRD.md', 'docs', false],
['none', 'README', 'none', false],
])(
'classifies a single %s file → surface=%s needs_review=%s',
(_label, file, surface, needsReview) => {
const v = evaluateRiskFloor({ filesChanged: [file] });
expect(v.surface).toBe(surface);
expect(v.needs_review).toBe(needsReview);
expect(v.reason).toContain(
file === 'README' ? 'no sensitive surface' : surface === 'none' ? '' : surface,
);
},
);
it('lets the highest-risk surface dominate a mixed diff', () => {
const v = evaluateRiskFloor({
filesChanged: [
'docs/readme.md',
'apps/web/src/components/Nav.tsx',
'apps/api/src/auth/token.service.ts',
],
});
expect(v.surface).toBe('auth');
expect(v.score).toBe(1.0);
expect(v.needs_review).toBe(true);
expect(v.reason).toContain('token.service.ts');
expect(v.reason).not.toContain('readme.md');
});
it('names every file that ties at the dominant surface', () => {
const v = evaluateRiskFloor({
filesChanged: ['src/login.ts', 'src/permission-check.ts'],
});
expect(v.surface).toBe('auth');
expect(v.reason).toContain('src/login.ts');
expect(v.reason).toContain('src/permission-check.ts');
});
it('treats docs+test-only diffs as below the floor', () => {
const v = evaluateRiskFloor({
filesChanged: ['docs/guide.md', 'packages/x/src/x.test.ts'],
});
expect(v.needs_review).toBe(false);
expect(v.surface).toBe('test'); // higher weight than docs
});
it('honors a custom threshold', () => {
const docsOnly = { filesChanged: ['docs/guide.md'] };
expect(evaluateRiskFloor(docsOnly, 0.05).needs_review).toBe(true);
expect(evaluateRiskFloor(docsOnly, DEFAULT_RISK_THRESHOLD).needs_review).toBe(false);
});
it('is deterministic across call order', () => {
const a = evaluateRiskFloor({ filesChanged: ['a.md', 'auth/x.ts', 'b.tsx'] });
const b = evaluateRiskFloor({ filesChanged: ['b.tsx', 'a.md', 'auth/x.ts'] });
expect(a).toEqual(b);
});
});

View File

@@ -0,0 +1,138 @@
/**
* Diff risk-floor — deterministic review-need classifier.
*
* Given the set of changed files in a diff, derive a *minimum* review
* requirement ("floor") from the change surface. This is the mechanical half
* of the agent reflection loop (design §6): risky surfaces (auth, data, infra)
* trip a review requirement regardless of what the agent self-reports.
*
* Precedence (authoritative ordering, see design §5):
* CI/tests > human merge > reviewer verdict > self-reflection
* This module sits at the *floor*. It NEVER overrides CI or a human; a
* `needs_review: false` verdict means "no surface tripped the floor", not
* "safe to merge". Consumers MUST keep CI/tests authoritative above it.
*
* Pure and deterministic: no IO, no clock, no randomness. Same input → same
* verdict. Safe to call from a Stop hook via `node -e` or to port inline.
*/
/** Review surfaces, ordered most- to least-sensitive. */
export type ReviewSurface = 'auth' | 'data' | 'infra' | 'build' | 'ui' | 'test' | 'docs' | 'none';
export interface RiskFloorInput {
/** Paths of changed files, repo-relative. Order-insensitive. */
filesChanged: string[];
/** Optional diff size signals; reserved for future weighting. */
insertions?: number;
deletions?: number;
}
export interface RiskFloorVerdict {
/** True when the change surface meets/exceeds the review threshold. */
needs_review: boolean;
/** Aggregate risk score in [0, 1] — the max surface weight across files. */
score: number;
/** The dominant (highest-weight) surface across all changed files. */
surface: ReviewSurface;
/** Human-readable explanation naming the surface and tripping files. */
reason: string;
}
/** Default review threshold; `score >= THRESHOLD` ⇒ `needs_review`. */
export const DEFAULT_RISK_THRESHOLD = 0.5;
interface SurfaceRule {
surface: ReviewSurface;
weight: number;
/** Case-insensitive regex matched against the file path. */
pattern: RegExp;
}
/**
* Surface classification rules, evaluated highest-weight first. The first
* rule whose pattern matches a path classifies that file; the file's surface
* is the highest-risk surface it matches (rules are pre-sorted by weight).
*/
const SURFACE_RULES: readonly SurfaceRule[] = [
{
surface: 'auth',
weight: 1.0,
pattern: /auth|login|session|token|permission|rbac|credential|secret/i,
},
{
surface: 'data',
weight: 0.9,
pattern: /migration|prisma|schema|\.sql|entity|repository|seed/i,
},
{
surface: 'infra',
weight: 0.85,
pattern: /docker|\.woodpecker|compose|traefik|deploy|helm|k8s|terraform/i,
},
{
surface: 'build',
weight: 0.6,
pattern: /package\.json|tsconfig|turbo\.json|pnpm-|\.config\.|eslint|vite/i,
},
{ surface: 'ui', weight: 0.4, pattern: /\.tsx|\.css|components\/|apps\/web\// },
{ surface: 'test', weight: 0.2, pattern: /\.spec\.|\.test\.|__tests__\// },
{ surface: 'docs', weight: 0.1, pattern: /\.md$|docs\// },
];
const NONE_WEIGHT = 0.0;
/** Classify a single path to its highest-risk surface and weight. */
function classify(path: string): { surface: ReviewSurface; weight: number } {
for (const rule of SURFACE_RULES) {
if (rule.pattern.test(path)) {
return { surface: rule.surface, weight: rule.weight };
}
}
return { surface: 'none', weight: NONE_WEIGHT };
}
/**
* Evaluate the review risk-floor for a diff.
*
* @param input changed files (+ optional size signals)
* @param threshold review cutoff; defaults to {@link DEFAULT_RISK_THRESHOLD}
*/
export function evaluateRiskFloor(
input: RiskFloorInput,
threshold: number = DEFAULT_RISK_THRESHOLD,
): RiskFloorVerdict {
const files = (input.filesChanged ?? []).filter((f) => typeof f === 'string' && f.length > 0);
if (files.length === 0) {
return {
needs_review: false,
score: 0,
surface: 'none',
reason: 'no files changed',
};
}
let topSurface: ReviewSurface = 'none';
let topWeight = NONE_WEIGHT;
const tripping: string[] = [];
for (const file of files) {
const { surface, weight } = classify(file);
if (weight > topWeight) {
topWeight = weight;
topSurface = surface;
tripping.length = 0;
tripping.push(file);
} else if (weight === topWeight && surface === topSurface && surface !== 'none') {
tripping.push(file);
}
}
const needs_review = topWeight >= threshold;
const reason =
topSurface === 'none'
? `no sensitive surface in ${files.length} changed file(s)`
: `${topSurface} surface (weight ${topWeight}) in: ${tripping.join(', ')}`;
return { needs_review, score: topWeight, surface: topSurface, reason };
}

View File

@@ -0,0 +1,105 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://mosaicstack.dev/schemas/reflection/reflection.v1.schema.json",
"title": "Agent Reflection (v1)",
"description": "End-of-run reflection sidecar. Mechanical fields are written by the Stop hook; self-reported fields are merged from an optional agent-supplied input and are null when absent (provenance.degraded=true).",
"type": "object",
"required": [
"schema",
"task_ref",
"agent",
"session_id",
"timestamp",
"repo",
"risk",
"files_changed",
"provenance"
],
"properties": {
"schema": {
"const": "reflection.v1"
},
"task_ref": {
"type": "string",
"description": "Canonical task ref; derived from REFLECTION_TASK_REF or repo+branch."
},
"agent": {
"type": "string",
"description": "Persona/runtime id (REFLECTION_AGENT or 'unknown')."
},
"session_id": {
"type": "string",
"description": "From the Stop payload session_id, else 'unknown'."
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "ISO-8601 UTC capture time."
},
"repo": {
"type": "string",
"description": "Repo root basename."
},
"confidence": {
"type": ["number", "null"],
"minimum": 0,
"maximum": 1,
"description": "SELF-REPORTED. Agent's overall confidence; null when not supplied."
},
"most_likely_wrong": {
"type": ["object", "null"],
"description": "SELF-REPORTED. The single most-likely way the work is wrong.",
"required": ["surface", "description"],
"properties": {
"surface": { "$ref": "#/$defs/surface" },
"description": { "type": "string" }
},
"additionalProperties": false
},
"known_not_in_diff": {
"type": ["string", "null"],
"description": "SELF-REPORTED. What the agent knows that isn't visible in the diff."
},
"risk": {
"type": "object",
"description": "MECHANICAL. Output of the diff risk-floor.",
"required": ["needs_review", "score", "surface", "reason"],
"properties": {
"needs_review": { "type": "boolean" },
"score": { "type": "number", "minimum": 0, "maximum": 1 },
"surface": { "$ref": "#/$defs/surface" },
"reason": { "type": "string" }
},
"additionalProperties": false
},
"files_changed": {
"type": "array",
"items": { "type": "string" },
"description": "MECHANICAL. git diff name-only."
},
"provenance": {
"type": "object",
"required": ["source", "reflection_attempt", "degraded", "reflection_mode"],
"properties": {
"source": { "const": "stop-hook" },
"reflection_attempt": { "type": "integer", "minimum": 1 },
"degraded": {
"type": "boolean",
"description": "True when self-report inputs were missing/unreadable."
},
"reflection_mode": {
"type": "string",
"enum": ["off", "solo", "orchestrated"]
}
},
"additionalProperties": false
}
},
"additionalProperties": false,
"$defs": {
"surface": {
"type": "string",
"enum": ["auth", "data", "infra", "build", "ui", "test", "docs", "none"]
}
}
}

View File

@@ -34,6 +34,17 @@
}
]
}
],
"Stop": [
{
"hooks": [
{
"type": "command",
"command": "~/.config/mosaic/tools/qa/reflect-stop-hook.sh",
"timeout": 15
}
]
}
]
},
"enabledPlugins": {

View File

@@ -0,0 +1,197 @@
#!/usr/bin/env bash
# reflect-stop-hook.sh — Stop hook (agent reflection loop, durable kernel)
#
# At end-of-run, capture the doer's end-state as a structured `reflection.v1`
# sidecar: the mechanical diff risk-floor plus any self-report the agent left
# behind. This is the passive capture half of the design (§10 step 1). It does
# NOT route, score, or gate — it only writes the sidecar; pickup is future work.
#
# FAIL-CLOSED: if REFLECTION_MODE is unset or "off", this is a strict no-op.
# Global registration is therefore safe; the feature only activates when a
# launcher/profile explicitly sets REFLECTION_MODE=solo|orchestrated.
#
# NON-BLOCKING: Stop hooks are observational. This script NEVER emits a
# `decision` field and ALWAYS exits 0 — it can never fail or stall a session.
#
# Environment contract:
# REFLECTION_MODE off|solo|orchestrated (default: off → no-op)
# REFLECTION_DIR output dir (default: <repo>/.mosaic/reflections)
# REFLECTION_INPUT self-report JSON (default: <repo>/.mosaic/reflection-input.json)
# REFLECTION_TASK_REF canonical task ref (default: <repo>#<branch>)
# REFLECTION_AGENT persona/runtime id (default: unknown)
# REFLECTION_RISK_THRESHOLD review cutoff [0,1] (default: 0.5)
#
# Risk-floor surface table is kept in sync with the authoritative TS
# implementation at packages/macp/src/risk-floor.ts (evaluateRiskFloor).
#
# Exit codes: always 0 (observational hook).
set -euo pipefail
# ---- fail-closed gate -------------------------------------------------------
MODE="${REFLECTION_MODE:-off}"
if [[ "$MODE" != "solo" && "$MODE" != "orchestrated" ]]; then
exit 0
fi
# Read the Stop payload (best-effort; never required).
INPUT="$(cat || true)"
# Sentinel lock path (global so the EXIT trap can clean it after main returns).
LOCKFILE=""
trap 'rm -f "${LOCKFILE:-}" 2>/dev/null || true' EXIT
main() {
command -v jq >/dev/null 2>&1 || return 0 # no jq → silently no-op
local session_id payload_cwd repo_dir repo_name branch task_ref agent
session_id="$(printf '%s' "$INPUT" | jq -r '.session_id // "unknown"' 2>/dev/null || echo unknown)"
# Sanitize: session_id is interpolated into file/lock paths — allow safe
# filename chars only (defends against ../ or / in the payload).
session_id="${session_id//[^a-zA-Z0-9_-]/}"
session_id="${session_id:-unknown}"
payload_cwd="$(printf '%s' "$INPUT" | jq -r '.cwd // empty' 2>/dev/null || true)"
# Resolve repo root: prefer git toplevel from the payload cwd, else PWD.
local start_dir="${payload_cwd:-$PWD}"
repo_dir="$(git -C "$start_dir" rev-parse --show-toplevel 2>/dev/null || echo "$start_dir")"
repo_name="$(basename "$repo_dir")"
branch="$(git -C "$repo_dir" rev-parse --abbrev-ref HEAD 2>/dev/null || echo detached)"
task_ref="${REFLECTION_TASK_REF:-${repo_name}#${branch}}"
agent="${REFLECTION_AGENT:-unknown}"
# ---- sentinel guard: avoid re-fire loops --------------------------------
local out_dir lock
out_dir="${REFLECTION_DIR:-${repo_dir}/.mosaic/reflections}"
mkdir -p "$out_dir" 2>/dev/null || return 0
lock="${out_dir}/.${session_id}.lock"
if [[ -e "$lock" ]]; then
return 0
fi
: > "$lock" 2>/dev/null || true
LOCKFILE="$lock"
# ---- mechanical: changed files ------------------------------------------
# Union of committed-vs-HEAD~ is out of scope; capture the working surface:
# staged + unstaged + untracked, best-effort.
# Exclude .mosaic/ (agent scratch: reflections, locks, self-report input) —
# it is tooling state, not part of the diff under review.
local files
files="$(
{
git -C "$repo_dir" diff --name-only HEAD 2>/dev/null || true
git -C "$repo_dir" diff --name-only --staged 2>/dev/null || true
git -C "$repo_dir" ls-files --others --exclude-standard 2>/dev/null || true
} | sed '/^$/d' | grep -v '^\.mosaic/' | sort -u || true
)"
# ---- mechanical: risk-floor (inline port of evaluateRiskFloor) ----------
local threshold="${REFLECTION_RISK_THRESHOLD:-0.5}"
local top_surface="none" top_weight="0.0" tripping=""
local f surface weight
while IFS= read -r f; do
[[ -z "$f" ]] && continue
surface="$(classify_surface "$f")"
weight="$(surface_weight "$surface")"
if awk "BEGIN{exit !($weight > $top_weight)}"; then
top_weight="$weight"; top_surface="$surface"; tripping="$f"
elif [[ "$surface" == "$top_surface" && "$surface" != "none" ]] && awk "BEGIN{exit !($weight == $top_weight)}"; then
tripping="${tripping:+$tripping, }$f"
fi
done <<< "$files"
local needs_review reason file_count
file_count="$(printf '%s\n' "$files" | sed '/^$/d' | wc -l | tr -d ' ')"
if awk "BEGIN{exit !($top_weight >= $threshold)}"; then needs_review=true; else needs_review=false; fi
if [[ "$top_surface" == "none" ]]; then
if [[ "$file_count" -eq 0 ]]; then reason="no files changed"; else reason="no sensitive surface in ${file_count} changed file(s)"; fi
else
reason="${top_surface} surface (weight ${top_weight}) in: ${tripping}"
fi
# ---- self-report merge (optional) ---------------------------------------
local input_file degraded self_json
input_file="${REFLECTION_INPUT:-${repo_dir}/.mosaic/reflection-input.json}"
degraded=true
self_json='{"confidence":null,"most_likely_wrong":null,"known_not_in_diff":null}'
if [[ -r "$input_file" ]] && jq -e . "$input_file" >/dev/null 2>&1; then
self_json="$(jq '{
confidence: (.confidence // null),
most_likely_wrong: (.most_likely_wrong // null),
known_not_in_diff: (.known_not_in_diff // null)
}' "$input_file" 2>/dev/null || echo "$self_json")"
degraded=false
fi
# ---- assemble + atomic write --------------------------------------------
local ts files_json record tmp final
ts="$(date -u +%Y-%m-%dT%H:%M:%S.000Z)"
files_json="$(printf '%s\n' "$files" | jq -R . | jq -s 'map(select(length>0))')"
record="$(jq -n \
--arg task_ref "$task_ref" \
--arg agent "$agent" \
--arg session_id "$session_id" \
--arg ts "$ts" \
--arg repo "$repo_name" \
--argjson needs_review "$needs_review" \
--argjson score "$top_weight" \
--arg surface "$top_surface" \
--arg reason "$reason" \
--argjson files "$files_json" \
--argjson self "$self_json" \
--argjson degraded "$degraded" \
--arg mode "$MODE" \
'{
schema: "reflection.v1",
task_ref: $task_ref,
agent: $agent,
session_id: $session_id,
timestamp: $ts,
repo: $repo,
confidence: $self.confidence,
most_likely_wrong: $self.most_likely_wrong,
known_not_in_diff: $self.known_not_in_diff,
risk: { needs_review: $needs_review, score: $score, surface: $surface, reason: $reason },
files_changed: $files,
provenance: { source: "stop-hook", reflection_attempt: 1, degraded: $degraded, reflection_mode: $mode }
}' 2>/dev/null || true)"
[[ -z "$record" ]] && return 0
final="${out_dir}/${session_id}-${ts//[:]/}.reflection.json"
tmp="${final}.tmp"
printf '%s\n' "$record" > "$tmp" 2>/dev/null || return 0
mv -f "$tmp" "$final" 2>/dev/null || true
}
# classify_surface PATH → surface name (highest-risk match wins, mirrors TS)
classify_surface() {
local p="$1"
if printf '%s' "$p" | grep -qiE 'auth|login|session|token|permission|rbac|credential|secret'; then echo auth; return; fi
if printf '%s' "$p" | grep -qiE 'migration|prisma|schema|\.sql|entity|repository|seed'; then echo data; return; fi
if printf '%s' "$p" | grep -qiE 'docker|\.woodpecker|compose|traefik|deploy|helm|k8s|terraform'; then echo infra; return; fi
if printf '%s' "$p" | grep -qiE 'package\.json|tsconfig|turbo\.json|pnpm-|\.config\.|eslint|vite'; then echo build; return; fi
if printf '%s' "$p" | grep -qE '\.tsx|\.css|components/|apps/web/'; then echo ui; return; fi
if printf '%s' "$p" | grep -qE '\.spec\.|\.test\.|__tests__/'; then echo test; return; fi
if printf '%s' "$p" | grep -qE '\.md$|docs/'; then echo docs; return; fi
echo none
}
# surface_weight SURFACE → numeric weight (mirrors TS SURFACE_RULES)
surface_weight() {
case "$1" in
auth) echo 1.0 ;;
data) echo 0.9 ;;
infra) echo 0.85 ;;
build) echo 0.6 ;;
ui) echo 0.4 ;;
test) echo 0.2 ;;
docs) echo 0.1 ;;
*) echo 0.0 ;;
esac
}
main || true
exit 0

View File

@@ -6,3 +6,4 @@ export * from './provider/index.js';
export * from './routing/index.js';
export * from './commands/index.js';
export * from './federation/index.js';
export * from './reflection/index.js';

View File

@@ -0,0 +1,146 @@
/**
* Unit tests for the reflection.v1 schema + self-report boundary.
*
* The runtime source of truth is the zod schema set in `reflection.ts`. The
* class-validator `ReflectionSelfReportDto` is the NestJS-side boundary type
* (exercised under the gateway app's reflect-metadata runtime, mirroring how
* `chat.dto.ts` is tested in apps/gateway); here we validate the self-report
* input with its zod counterpart, which is what the Stop hook actually uses.
*
* Coverage:
* - REVIEW_SURFACES canonical ordering (the enum both zod + JSON Schema mirror)
* - ReflectionV1Schema accepts a fully-populated record
* - ReflectionV1Schema accepts a degraded record (self-report fields null)
* - ReflectionV1Schema rejects bad schema literal / out-of-range confidence / bad surface
* - ReflectionSelfReportSchema accepts valid + empty, rejects bad input
*/
import { describe, expect, it } from 'vitest';
import {
REVIEW_SURFACES,
ReflectionV1Schema,
ReflectionSelfReportSchema,
type ReflectionV1,
} from '../index.js';
const baseMechanical = {
schema: 'reflection.v1' as const,
task_ref: 'stack#544',
agent: 'claude',
session_id: 'sess-abc',
timestamp: '2026-06-16T00:00:00.000Z',
repo: 'stack',
risk: {
needs_review: true,
score: 1.0,
surface: 'auth' as const,
reason: 'auth surface (weight 1) in: src/auth.ts',
},
files_changed: ['src/auth.ts'],
provenance: {
source: 'stop-hook' as const,
reflection_attempt: 1,
degraded: false,
reflection_mode: 'solo' as const,
},
};
describe('REVIEW_SURFACES', () => {
it('keeps the canonical most→least-sensitive ordering', () => {
expect(REVIEW_SURFACES).toEqual([
'auth',
'data',
'infra',
'build',
'ui',
'test',
'docs',
'none',
]);
});
});
describe('ReflectionV1Schema', () => {
it('accepts a fully-populated record', () => {
const rec: ReflectionV1 = {
...baseMechanical,
confidence: 0.7,
most_likely_wrong: { surface: 'auth', description: 'token refresh untested' },
known_not_in_diff: 'manual QA only on the happy path',
};
expect(() => ReflectionV1Schema.parse(rec)).not.toThrow();
});
it('accepts a degraded record with null self-report fields', () => {
const rec: ReflectionV1 = {
...baseMechanical,
confidence: null,
most_likely_wrong: null,
known_not_in_diff: null,
provenance: { ...baseMechanical.provenance, degraded: true },
};
expect(() => ReflectionV1Schema.parse(rec)).not.toThrow();
});
it('rejects a wrong schema literal', () => {
const bad = {
...baseMechanical,
schema: 'reflection.v2',
confidence: null,
most_likely_wrong: null,
known_not_in_diff: null,
};
expect(() => ReflectionV1Schema.parse(bad)).toThrow();
});
it('rejects out-of-range confidence', () => {
const bad = {
...baseMechanical,
confidence: 1.5,
most_likely_wrong: null,
known_not_in_diff: null,
};
expect(() => ReflectionV1Schema.parse(bad)).toThrow();
});
it('rejects an unknown surface', () => {
const bad = {
...baseMechanical,
risk: { ...baseMechanical.risk, surface: 'network' },
confidence: null,
most_likely_wrong: null,
known_not_in_diff: null,
};
expect(() => ReflectionV1Schema.parse(bad)).toThrow();
});
});
describe('ReflectionSelfReportSchema', () => {
it('accepts a valid self-report', () => {
const ok = ReflectionSelfReportSchema.safeParse({
confidence: 0.8,
most_likely_wrong: {
surface: 'data',
description: 'migration not run against prod-sized data',
},
known_not_in_diff: 'rollback path untested',
});
expect(ok.success).toBe(true);
});
it('accepts an empty self-report (all optional)', () => {
expect(ReflectionSelfReportSchema.safeParse({}).success).toBe(true);
});
it('rejects confidence above 1', () => {
expect(ReflectionSelfReportSchema.safeParse({ confidence: 2 }).success).toBe(false);
});
it('rejects an unknown most_likely_wrong.surface', () => {
const res = ReflectionSelfReportSchema.safeParse({
most_likely_wrong: { surface: 'network', description: 'x' },
});
expect(res.success).toBe(false);
});
});

View File

@@ -0,0 +1,30 @@
/**
* Agent reflection (v1) — public barrel.
*
* reflection.ts — zod schemas (runtime source of truth) + inferred types
* reflection.dto.ts — class-validator DTO for the agent self-report input
*/
export {
REVIEW_SURFACES,
ReviewSurfaceSchema,
MostLikelyWrongSchema,
ReflectionRiskSchema,
ReflectionModeSchema,
ReflectionProvenanceSchema,
ReflectionSelfReportSchema,
ReflectionV1Schema,
REFLECTION_SCHEMA_ID,
} from './reflection.js';
export type {
ReviewSurface,
MostLikelyWrong,
ReflectionRisk,
ReflectionMode,
ReflectionProvenance,
ReflectionSelfReport,
ReflectionV1,
} from './reflection.js';
export { MostLikelyWrongDto, ReflectionSelfReportDto } from './reflection.dto.js';

View File

@@ -0,0 +1,55 @@
/**
* Reflection self-report DTO — class-validator boundary.
*
* Validates the agent-supplied self-report input (the optional
* `$REFLECTION_INPUT` file, default `<repo>/.mosaic/reflection-input.json`)
* before it is merged into a `reflection.v1` record. This is the only
* externally-authored input on the reflection path, so it gets a DTO per the
* Mosaic module-boundary rule.
*
* Class-validator only (no class-transformer `@Type`) — matching `chat.dto.ts`
* — so the module is safe to import without a `reflect-metadata` shim. Deep
* nested validation of `most_likely_wrong` is owned by the zod
* `ReflectionSelfReportSchema` in `reflection.ts`, which is what the Stop hook
* actually enforces at runtime.
*/
import {
IsIn,
IsNumber,
IsObject,
IsOptional,
IsString,
Max,
Min,
MaxLength,
} from 'class-validator';
import { REVIEW_SURFACES } from './reflection.js';
/** Shape of `most_likely_wrong`; validated structurally by zod at runtime. */
export class MostLikelyWrongDto {
@IsIn(REVIEW_SURFACES as unknown as string[])
surface!: string;
@IsString()
@MaxLength(4_000)
description!: string;
}
export class ReflectionSelfReportDto {
@IsOptional()
@IsNumber()
@Min(0)
@Max(1)
confidence?: number;
@IsOptional()
@IsObject()
most_likely_wrong?: MostLikelyWrongDto;
@IsOptional()
@IsString()
@MaxLength(8_000)
known_not_in_diff?: string;
}

View File

@@ -0,0 +1,90 @@
/**
* Agent reflection (v1) — wire schema.
*
* Runtime source of truth for the `reflection.v1` sidecar emitted at end-of-run
* by the Stop hook (design §10 step 1). The JSON Schema artifact at
* `@mosaicstack/macp` `src/schemas/reflection.v1.schema.json` is the documented
* contract; this zod schema is the executable one and MUST agree with it.
*
* Field provenance:
* - MECHANICAL (risk, files_changed, ids, provenance): written by the hook.
* - SELF-REPORTED (confidence, most_likely_wrong, known_not_in_diff): merged
* from an optional agent-supplied input; null when absent.
*
* Pure — no NestJS, no DB, no Node-only APIs. Safe for browser/edge.
*/
import { z } from 'zod';
/** Review surfaces, ordered most- to least-sensitive. Mirrors macp risk-floor. */
export const REVIEW_SURFACES = [
'auth',
'data',
'infra',
'build',
'ui',
'test',
'docs',
'none',
] as const;
export const ReviewSurfaceSchema = z.enum(REVIEW_SURFACES);
export type ReviewSurface = z.infer<typeof ReviewSurfaceSchema>;
/** SELF-REPORTED: the single most-likely way the work is wrong. */
export const MostLikelyWrongSchema = z.object({
surface: ReviewSurfaceSchema,
description: z.string(),
});
export type MostLikelyWrong = z.infer<typeof MostLikelyWrongSchema>;
/** MECHANICAL: output of the diff risk-floor (see `@mosaicstack/macp`). */
export const ReflectionRiskSchema = z.object({
needs_review: z.boolean(),
score: z.number().min(0).max(1),
surface: ReviewSurfaceSchema,
reason: z.string(),
});
export type ReflectionRisk = z.infer<typeof ReflectionRiskSchema>;
export const ReflectionModeSchema = z.enum(['off', 'solo', 'orchestrated']);
export type ReflectionMode = z.infer<typeof ReflectionModeSchema>;
export const ReflectionProvenanceSchema = z.object({
source: z.literal('stop-hook'),
reflection_attempt: z.number().int().min(1),
degraded: z.boolean(),
reflection_mode: ReflectionModeSchema,
});
export type ReflectionProvenance = z.infer<typeof ReflectionProvenanceSchema>;
/**
* The self-reported half of a reflection. Supplied by the agent out-of-band
* (e.g. `<repo>/.mosaic/reflection-input.json`) and merged by the hook. All
* fields optional; missing fields become `null` in the assembled record.
*/
export const ReflectionSelfReportSchema = z.object({
confidence: z.number().min(0).max(1).nullable().optional(),
most_likely_wrong: MostLikelyWrongSchema.nullable().optional(),
known_not_in_diff: z.string().nullable().optional(),
});
export type ReflectionSelfReport = z.infer<typeof ReflectionSelfReportSchema>;
/** The full assembled `reflection.v1` sidecar. */
export const ReflectionV1Schema = z.object({
schema: z.literal('reflection.v1'),
task_ref: z.string(),
agent: z.string(),
session_id: z.string(),
timestamp: z.string(),
repo: z.string(),
confidence: z.number().min(0).max(1).nullable(),
most_likely_wrong: MostLikelyWrongSchema.nullable(),
known_not_in_diff: z.string().nullable(),
risk: ReflectionRiskSchema,
files_changed: z.array(z.string()),
provenance: ReflectionProvenanceSchema,
});
export type ReflectionV1 = z.infer<typeof ReflectionV1Schema>;
export const REFLECTION_SCHEMA_ID = 'reflection.v1' as const;