From f91bbeea48187d9fc16b4d3b9a66d013113058e0 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Wed, 24 Jun 2026 08:48:23 -0500 Subject: [PATCH] fix(fleet): report idle agents as available --- docs/scratchpads/h2-readiness-available.md | 70 ++++++++++++++ packages/mosaic/src/commands/fleet.spec.ts | 106 ++++++++++----------- packages/mosaic/src/commands/fleet.ts | 23 +---- 3 files changed, 123 insertions(+), 76 deletions(-) create mode 100644 docs/scratchpads/h2-readiness-available.md diff --git a/docs/scratchpads/h2-readiness-available.md b/docs/scratchpads/h2-readiness-available.md new file mode 100644 index 0000000..17af373 --- /dev/null +++ b/docs/scratchpads/h2-readiness-available.md @@ -0,0 +1,70 @@ +# H2 — readiness semantics: available, not stuck + +## Objective + +Correct fleet readiness semantics so a healthy long-idle agent is reported as `available` (good/assignable) instead of `stuck` (fault). Reserve `stuck` in the type/JSON value space for future positive block evidence. + +## Scope + +- `packages/mosaic/src/commands/fleet.ts` + - replace `idle` readiness state with `available` + - keep `stuck` in the union but stop emitting it from idle-only heuristics + - remove stuck threshold helper/env handling + - remove IDLE/STUCK alarm flags from table rendering +- `packages/mosaic/src/commands/fleet.spec.ts` + - update classifier branch/boundary tests + - assert very long idle maps to `available`, not `stuck` + - update table/JSON assertions for available with no alarm flags + - remove stuck threshold helper tests + +## Acceptance Criteria + +- `classifyReadiness()` remains pure/total/never-throw and maps: + - dead/stale/unknown unchanged + - busy/null/undefined/non-finite idle to `working` + - idle >= activity threshold to `available` + - idle < activity threshold to `working` +- No idle-derived path emits `stuck`. +- `MOSAIC_HEARTBEAT_IDLE_THRESHOLD` remains backward compatible as the working→available activity threshold. 
+- `MOSAIC_HEARTBEAT_STUCK_THRESHOLD` and helper/default are removed. +- `fleet ps` keeps the idle-seconds column header `IDLE`, renders `available` in the HB label, and does not add IDLE/STUCK warning flags. +- Local gates green: build precheck, typecheck, lint, format:check, fleet vitest. +- PR opened against `main`; no merge by worker. + +## Constraints / Assumptions + +- Source branch: `origin/main` @ `1020cfa`. +- `docs/TASKS.md` is orchestrator-owned; worker will not modify it. +- Documentation impact is captured in this scratchpad and PR description; no user/admin guide behavior changes beyond CLI readiness label semantics. + +## Plan + +1. Install dependencies with the requested PNPM environment. +2. Inspect current H1/H1b readiness implementation and tests. +3. Update classifier types/helpers/rendering. +4. Update focused tests. +5. Run build precheck + required gates. +6. Run automated code review, remediate any findings. +7. Queue guard, push, open PR. + +## Progress + +- 2026-06-24: Branch created from `origin/main` @ `1020cfa`. +- 2026-06-24: Replaced idle-derived `idle`/`stuck` outputs with `available`; retained `stuck` in the type union for future positive block evidence. +- 2026-06-24: Removed stuck threshold env/helper plumbing and IDLE/STUCK alarm flags. +- 2026-06-24: Updated classifier and table-render tests for available semantics. + +## Verification Evidence + +- `pnpm install --store-dir "$HOME/.pnpm-store"` — pass. +- `npx turbo build --filter=@mosaicstack/mosaic^...` — pass, 12/12 tasks successful. +- `pnpm typecheck` — pass, 41/41 tasks successful. +- `pnpm lint` — pass, 23/23 tasks successful. +- `pnpm format:check` — pass, all matched files use Prettier style. +- `pnpm --filter @mosaicstack/mosaic exec vitest run src/commands/fleet.spec.ts` — pass, 177 tests. +- `~/.config/mosaic/tools/codex/codex-code-review.sh --uncommitted` — approve, 0 findings (reviewed supplied diff; sandbox file-inspection limitation noted by tool). 
+ +## Risks / Blockers +- No current blocker. +- Review tool could not inspect repo files directly due to a sandbox wrapper limitation, but it reviewed the supplied diff and approved with no findings. diff --git a/packages/mosaic/src/commands/fleet.spec.ts b/packages/mosaic/src/commands/fleet.spec.ts index ad87490..be3b085 100644 --- a/packages/mosaic/src/commands/fleet.spec.ts +++ b/packages/mosaic/src/commands/fleet.spec.ts @@ -27,7 +27,6 @@ import { enableFleetUnits, FLEET_PROFILES, HEARTBEAT_IDLE_THRESHOLD_SECONDS, - HEARTBEAT_STUCK_THRESHOLD_SECONDS, generateAgentEnv, getDefaultOperatorSourceLabel, getDefaultTenantAndHost, @@ -48,7 +47,6 @@ import { resolvePresetFilename, RUNTIME_ACCEPTABLE_COMMANDS, serializeRosterToYaml, - stuckThresholdSeconds, VERIFY_DEFAULT_TIMEOUT_MS, VERIFY_POLL_INTERVAL_MS, type AgentPsRow, @@ -940,42 +938,33 @@ describe('fleet ps — heartbeat parsing', () => { describe('fleet ps — readiness thresholds', () => { const savedIdle = process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD; - const savedStuck = process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD; afterEach(() => { if (savedIdle === undefined) delete process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD; else process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = savedIdle; - if (savedStuck === undefined) delete process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD; - else process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = savedStuck; }); - it('uses default readiness thresholds when env is unset', () => { + it('uses the default activity threshold when env is unset', () => { delete process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD; - delete process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD; expect(idleThresholdSeconds()).toBe(HEARTBEAT_IDLE_THRESHOLD_SECONDS); - expect(stuckThresholdSeconds()).toBe(HEARTBEAT_STUCK_THRESHOLD_SECONDS); }); - it('honors positive integer readiness thresholds from env', () => { + it('honors a positive integer activity threshold from env', () => { process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = '120'; - 
process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = '480'; expect(idleThresholdSeconds()).toBe(120); - expect(stuckThresholdSeconds()).toBe(480); }); - it('falls back to defaults for invalid readiness thresholds', () => { + it('falls back to the default for invalid activity thresholds', () => { process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = '0'; - process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = 'not-a-number'; expect(idleThresholdSeconds()).toBe(HEARTBEAT_IDLE_THRESHOLD_SECONDS); - expect(stuckThresholdSeconds()).toBe(HEARTBEAT_STUCK_THRESHOLD_SECONDS); }); }); describe('fleet ps — readiness classification', () => { - const thresholds = { idleThresholdSeconds: 300, stuckThresholdSeconds: 900 }; + const thresholds = { idleThresholdSeconds: 300 }; it('reports dead when the pane is not alive', () => { expect( @@ -1004,7 +993,7 @@ describe('fleet ps — readiness classification', () => { ).toBe('stale'); }); - it('reports working when heartbeat status is busy, even past stuck threshold', () => { + it('reports working when heartbeat status is busy, even after the activity threshold', () => { expect( classifyReadiness( { paneAlive: true, hbHealth: 'healthy', hbStatus: 'busy', idleSeconds: 2_000 }, @@ -1013,7 +1002,7 @@ describe('fleet ps — readiness classification', () => { ).toBe('working'); }); - it('reports working when pane idle seconds are unavailable', () => { + it('reports working when pane idle seconds are null', () => { expect( classifyReadiness( { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: null }, @@ -1022,25 +1011,31 @@ describe('fleet ps — readiness classification', () => { ).toBe('working'); }); - it('reports stuck at the stuck threshold boundary', () => { + it('reports working when pane idle seconds are undefined', () => { expect( - classifyReadiness( - { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 900 }, - thresholds, - ), - ).toBe('stuck'); + classifyReadiness({ paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok' }, 
thresholds), + ).toBe('working'); }); - it('reports idle at the idle threshold boundary', () => { + it('reports working when pane idle seconds are non-finite', () => { + expect( + classifyReadiness( + { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: Number.NaN }, + thresholds, + ), + ).toBe('working'); + }); + + it('reports available at the activity threshold boundary', () => { expect( classifyReadiness( { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 300 }, thresholds, ), - ).toBe('idle'); + ).toBe('available'); }); - it('reports working below the idle threshold', () => { + it('reports working below the activity threshold', () => { expect( classifyReadiness( { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 299 }, @@ -1049,13 +1044,14 @@ describe('fleet ps — readiness classification', () => { ).toBe('working'); }); - it('checks stuck before idle when thresholds are inverted', () => { - expect( - classifyReadiness( - { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 350 }, - { idleThresholdSeconds: 900, stuckThresholdSeconds: 300 }, - ), - ).toBe('stuck'); + it('reports very long idle as available, not stuck', () => { + const readiness = classifyReadiness( + { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 100_000 }, + thresholds, + ); + + expect(readiness).toBe('available'); + expect(readiness).not.toBe('stuck'); }); }); @@ -1554,7 +1550,7 @@ describe('fleet ps — command sequences issued', () => { }); describe('fleet ps — readiness table output', () => { - it('renders readiness in HB column and flags idle/stuck rows', async () => { + it('renders available in HB column without idle/stuck alarm flags', async () => { const home = await mkdtemp(join(tmpdir(), 'mosaic-fleet-')); const rosterPath = join(home, 'fleet', 'roster.yaml'); const runDir = join(home, 'fleet', 'run'); @@ -1565,36 +1561,34 @@ describe('fleet ps — readiness table output', () => { 'version: 1', 
'transport: tmux', 'agents:', - ' - name: idle-agent', + ' - name: working-agent', ' runtime: pi', - ' - name: stuck-agent', + ' - name: available-agent', ' runtime: pi', ].join('\n'), ); const nowMs = 1_700_000_000_000; - const idleActivityEpoch = Math.floor((nowMs - 10_000) / 1000); - const stuckActivityEpoch = Math.floor((nowMs - 40_000) / 1000); + const workingActivityEpoch = Math.floor((nowMs - 2_000) / 1000); + const availableActivityEpoch = Math.floor((nowMs - 40_000) / 1000); const hbTs = new Date(nowMs - 1_000).toISOString(); - await writeFile(join(runDir, 'idle-agent.hb'), `ts=${hbTs}\npid=111\nstatus=ok\n`); - await writeFile(join(runDir, 'stuck-agent.hb'), `ts=${hbTs}\npid=222\nstatus=ok\n`); + await writeFile(join(runDir, 'working-agent.hb'), `ts=${hbTs}\npid=111\nstatus=ok\n`); + await writeFile(join(runDir, 'available-agent.hb'), `ts=${hbTs}\npid=222\nstatus=ok\n`); const savedIdle = process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD; - const savedStuck = process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD; process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = '5'; - process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = '30'; const dateNow = vi.spyOn(Date, 'now').mockReturnValue(nowMs); const runner: CommandRunner = async (command, args) => { const full = [command, ...args].join(' '); if (full.includes('list-sessions')) { - return { stdout: 'idle-agent\nstuck-agent\n', stderr: '', exitCode: 0 }; + return { stdout: 'working-agent\navailable-agent\n', stderr: '', exitCode: 0 }; } - if (full.includes('=idle-agent:0.0')) { - return { stdout: `111 pi 0 ${idleActivityEpoch}\n`, stderr: '', exitCode: 0 }; + if (full.includes('=working-agent:0.0')) { + return { stdout: `111 pi 0 ${workingActivityEpoch}\n`, stderr: '', exitCode: 0 }; } - if (full.includes('=stuck-agent:0.0')) { - return { stdout: `222 pi 0 ${stuckActivityEpoch}\n`, stderr: '', exitCode: 0 }; + if (full.includes('=available-agent:0.0')) { + return { stdout: `222 pi 0 ${availableActivityEpoch}\n`, stderr: '', exitCode: 0 
}; } if (full.includes('systemctl') && full.includes('show')) { return { @@ -1623,19 +1617,17 @@ describe('fleet ps — readiness table output', () => { dateNow.mockRestore(); if (savedIdle === undefined) delete process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD; else process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = savedIdle; - if (savedStuck === undefined) delete process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD; - else process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = savedStuck; await rm(home, { recursive: true, force: true }); } - const idleLine = lines.find((line) => line.includes('idle-agent')); - const stuckLine = lines.find((line) => line.includes('stuck-agent')); - expect(idleLine).toBeDefined(); - expect(idleLine).toContain('1s/idle'); - expect(idleLine).toMatch(/\bIDLE\b/); - expect(stuckLine).toBeDefined(); - expect(stuckLine).toContain('1s/stuck'); - expect(stuckLine).toMatch(/\bSTUCK\b/); + const workingLine = lines.find((line) => line.includes('working-agent')); + const availableLine = lines.find((line) => line.includes('available-agent')); + expect(workingLine).toBeDefined(); + expect(workingLine).toContain('1s/working'); + expect(availableLine).toBeDefined(); + expect(availableLine).toContain('1s/available'); + expect(availableLine).not.toMatch(/\bIDLE\b/); + expect(availableLine).not.toMatch(/\bSTUCK\b/); }); }); diff --git a/packages/mosaic/src/commands/fleet.ts b/packages/mosaic/src/commands/fleet.ts index 073f6c6..a43a94b 100644 --- a/packages/mosaic/src/commands/fleet.ts +++ b/packages/mosaic/src/commands/fleet.ts @@ -395,7 +395,6 @@ export function buildAgentTailCommand(agentName: string, lines: number, socketNa export const HEARTBEAT_INTERVAL_MS = 15_000; export const HEARTBEAT_IDLE_THRESHOLD_SECONDS = 300; -export const HEARTBEAT_STUCK_THRESHOLD_SECONDS = 900; /** * Heartbeat interval in ms, honoring MOSAIC_HEARTBEAT_INTERVAL (seconds) so the @@ -407,20 +406,14 @@ export function heartbeatIntervalMs(): number { return Number.isFinite(sec) && sec > 0 ? 
sec * 1000 : HEARTBEAT_INTERVAL_MS; } -/** Idle threshold in seconds, honoring MOSAIC_HEARTBEAT_IDLE_THRESHOLD. */ +/** Activity threshold in seconds, honoring MOSAIC_HEARTBEAT_IDLE_THRESHOLD. */ export function idleThresholdSeconds(): number { const sec = Number.parseInt(process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD ?? '', 10); return Number.isFinite(sec) && sec > 0 ? sec : HEARTBEAT_IDLE_THRESHOLD_SECONDS; } - -/** Stuck threshold in seconds, honoring MOSAIC_HEARTBEAT_STUCK_THRESHOLD. */ -export function stuckThresholdSeconds(): number { - const sec = Number.parseInt(process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD ?? '', 10); - return Number.isFinite(sec) && sec > 0 ? sec : HEARTBEAT_STUCK_THRESHOLD_SECONDS; -} export const HEARTBEAT_HEALTHY_MULTIPLIER = 3; -export type ReadinessState = 'working' | 'idle' | 'stuck' | 'stale' | 'dead' | 'unknown'; +export type ReadinessState = 'working' | 'available' | 'stuck' | 'stale' | 'dead' | 'unknown'; export interface ReadinessSignals { paneAlive: boolean; @@ -431,7 +424,6 @@ export interface ReadinessSignals { export interface ReadinessThresholds { idleThresholdSeconds: number; - stuckThresholdSeconds: number; } /** @@ -456,12 +448,8 @@ export function classifyReadiness( const idleThreshold = Number.isFinite(thresholds?.idleThresholdSeconds) ? Number(thresholds?.idleThresholdSeconds) : idleThresholdSeconds(); - const stuckThreshold = Number.isFinite(thresholds?.stuckThresholdSeconds) - ? Number(thresholds?.stuckThresholdSeconds) - : stuckThresholdSeconds(); - - if (idleSeconds >= stuckThreshold) return 'stuck'; - if (idleSeconds >= idleThreshold) return 'idle'; + // Follow-up: stuck pending per-agent assignment awareness: assigned task + idle past threshold => stuck. 
+ if (idleSeconds >= idleThreshold) return 'available'; return 'working'; } catch { return 'unknown'; @@ -1089,7 +1077,6 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = const rows: AgentPsRow[] = []; const readinessThresholds = { idleThresholdSeconds: idleThresholdSeconds(), - stuckThresholdSeconds: stuckThresholdSeconds(), }; // Build the set of roster agent names for quick lookup when filtering socket sessions. @@ -1264,8 +1251,6 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = if (!row.managed) flags.push('UNMANAGED'); if (row.driftFlag) flags.push('DRIFT'); if (row.bootEnableWarning) flags.push('BOOT-ENABLE'); - if (row.readiness === 'idle') flags.push('IDLE'); - if (row.readiness === 'stuck') flags.push('STUCK'); console.log( [