feat(fleet): classify agent readiness in fleet ps (#649)
This commit was merged in pull request #649.
This commit is contained in:
@@ -19,17 +19,21 @@ import {
|
||||
buildSystemdShowCommand,
|
||||
buildTmuxListPanesCommand,
|
||||
buildTmuxListSessionsCommand,
|
||||
classifyReadiness,
|
||||
classifySendResult,
|
||||
countOrchestrators,
|
||||
countEnhancers,
|
||||
detectDrift,
|
||||
enableFleetUnits,
|
||||
FLEET_PROFILES,
|
||||
HEARTBEAT_IDLE_THRESHOLD_SECONDS,
|
||||
HEARTBEAT_STUCK_THRESHOLD_SECONDS,
|
||||
generateAgentEnv,
|
||||
getDefaultOperatorSourceLabel,
|
||||
getDefaultTenantAndHost,
|
||||
getRosterAgent,
|
||||
heartbeatPath,
|
||||
idleThresholdSeconds,
|
||||
isSendAccepted,
|
||||
loadFleetRoster,
|
||||
mergeAgentEnv,
|
||||
@@ -44,6 +48,7 @@ import {
|
||||
resolvePresetFilename,
|
||||
RUNTIME_ACCEPTABLE_COMMANDS,
|
||||
serializeRosterToYaml,
|
||||
stuckThresholdSeconds,
|
||||
VERIFY_DEFAULT_TIMEOUT_MS,
|
||||
VERIFY_POLL_INTERVAL_MS,
|
||||
type AgentPsRow,
|
||||
@@ -933,6 +938,127 @@ describe('fleet ps — heartbeat parsing', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('fleet ps — readiness thresholds', () => {
  const IDLE_ENV = 'MOSAIC_HEARTBEAT_IDLE_THRESHOLD';
  const STUCK_ENV = 'MOSAIC_HEARTBEAT_STUCK_THRESHOLD';

  // Snapshot of the ambient values, taken once when the suite is collected,
  // so every test can mutate process.env freely.
  const ambientIdle = process.env[IDLE_ENV];
  const ambientStuck = process.env[STUCK_ENV];

  /** Puts an env var back to a captured value, deleting it if it was unset. */
  const restoreEnv = (key: string, captured: string | undefined): void => {
    if (captured === undefined) {
      delete process.env[key];
      return;
    }
    process.env[key] = captured;
  };

  afterEach(() => {
    restoreEnv(IDLE_ENV, ambientIdle);
    restoreEnv(STUCK_ENV, ambientStuck);
  });

  it('uses default readiness thresholds when env is unset', () => {
    delete process.env[IDLE_ENV];
    delete process.env[STUCK_ENV];

    expect(idleThresholdSeconds()).toBe(HEARTBEAT_IDLE_THRESHOLD_SECONDS);
    expect(stuckThresholdSeconds()).toBe(HEARTBEAT_STUCK_THRESHOLD_SECONDS);
  });

  it('honors positive integer readiness thresholds from env', () => {
    process.env[IDLE_ENV] = '120';
    process.env[STUCK_ENV] = '480';

    expect(idleThresholdSeconds()).toBe(120);
    expect(stuckThresholdSeconds()).toBe(480);
  });

  it('falls back to defaults for invalid readiness thresholds', () => {
    // One non-positive value and one non-numeric value.
    process.env[IDLE_ENV] = '0';
    process.env[STUCK_ENV] = 'not-a-number';

    expect(idleThresholdSeconds()).toBe(HEARTBEAT_IDLE_THRESHOLD_SECONDS);
    expect(stuckThresholdSeconds()).toBe(HEARTBEAT_STUCK_THRESHOLD_SECONDS);
  });
});
|
||||
|
||||
describe('fleet ps — readiness classification', () => {
  type ClassifyArgs = Parameters<typeof classifyReadiness>;

  /** One row of the classification table below. */
  interface ReadinessCase {
    readonly title: string;
    readonly signals: ClassifyArgs[0];
    /** Overrides the shared thresholds for this case only. */
    readonly limits?: ClassifyArgs[1];
    readonly expected: ReturnType<typeof classifyReadiness>;
  }

  // Thresholds shared by every case that does not supply its own.
  const defaultLimits = { idleThresholdSeconds: 300, stuckThresholdSeconds: 900 };

  const cases: readonly ReadinessCase[] = [
    {
      title: 'reports dead when the pane is not alive',
      signals: { paneAlive: false, hbHealth: 'healthy', hbStatus: 'busy', idleSeconds: 0 },
      expected: 'dead',
    },
    {
      title: 'reports unknown when heartbeat health is unknown',
      signals: { paneAlive: true, hbHealth: 'unknown', hbStatus: null, idleSeconds: 0 },
      expected: 'unknown',
    },
    {
      title: 'reports stale when heartbeat health is stale',
      signals: { paneAlive: true, hbHealth: 'stale', hbStatus: 'busy', idleSeconds: 1_000 },
      expected: 'stale',
    },
    {
      title: 'reports working when heartbeat status is busy, even past stuck threshold',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'busy', idleSeconds: 2_000 },
      expected: 'working',
    },
    {
      title: 'reports working when pane idle seconds are unavailable',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: null },
      expected: 'working',
    },
    {
      title: 'reports stuck at the stuck threshold boundary',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 900 },
      expected: 'stuck',
    },
    {
      title: 'reports idle at the idle threshold boundary',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 300 },
      expected: 'idle',
    },
    {
      title: 'reports working below the idle threshold',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 299 },
      expected: 'working',
    },
    {
      // idle (900) > stuck (300): 350s of inactivity crosses only the stuck limit.
      title: 'checks stuck before idle when thresholds are inverted',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 350 },
      limits: { idleThresholdSeconds: 900, stuckThresholdSeconds: 300 },
      expected: 'stuck',
    },
  ];

  // Register one test per row, in table order.
  for (const { title, signals, limits, expected } of cases) {
    it(title, () => {
      expect(classifyReadiness(signals, limits ?? defaultLimits)).toBe(expected);
    });
  }
});
|
||||
|
||||
describe('fleet ps — systemd show parsing', () => {
|
||||
it('parses ActiveState, SubState, UnitFileState from systemctl show output', () => {
|
||||
const output = 'ActiveState=active\nSubState=running\nUnitFileState=enabled\n';
|
||||
@@ -1324,8 +1450,9 @@ describe('fleet ps — JSON output shape (FR-6)', () => {
|
||||
// boot-enable warning: active + disabled
|
||||
expect(row.bootEnableWarning).toBe(true);
|
||||
|
||||
// heartbeat missing → unknown readiness preserves existing display semantics
|
||||
expect(row.heartbeat.health).toBe('unknown');
|
||||
expect(row.readiness).toBe('unknown');
|
||||
|
||||
expect(row.name).toBe('canary-pi');
|
||||
expect(row.runtime).toBe('pi');
|
||||
@@ -1387,6 +1514,92 @@ describe('fleet ps — command sequences issued', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('fleet ps — readiness table output', () => {
  /**
   * End-to-end check of `fleet ps` table rendering: two roster agents with
   * fresh heartbeats (1s old, status=ok) whose panes were last active 10s and
   * 40s before the mocked clock. With thresholds of 5s (idle) and 30s (stuck)
   * the rows are expected to show `1s/idle` + IDLE and `1s/stuck` + STUCK.
   */
  it('renders readiness in HB column and flags idle/stuck rows', async () => {
    const IDLE_ENV = 'MOSAIC_HEARTBEAT_IDLE_THRESHOLD';
    const STUCK_ENV = 'MOSAIC_HEARTBEAT_STUCK_THRESHOLD';

    /** Puts an env var back to a captured value, deleting it if it was unset. */
    const restoreEnv = (key: string, captured: string | undefined): void => {
      if (captured === undefined) delete process.env[key];
      else process.env[key] = captured;
    };

    // Fixed clock and the timestamps derived from it.
    const nowMs = 1_700_000_000_000;
    const idleActivityEpoch = Math.floor((nowMs - 10_000) / 1000);
    const stuckActivityEpoch = Math.floor((nowMs - 40_000) / 1000);
    const hbTs = new Date(nowMs - 1_000).toISOString();

    // Capture every piece of global state this test overrides *before*
    // touching any of it, so the finally block can always restore it.
    const savedIdle = process.env[IDLE_ENV];
    const savedStuck = process.env[STUCK_ENV];
    const origLog = console.log;
    const lines: string[] = [];

    const home = await mkdtemp(join(tmpdir(), 'mosaic-fleet-'));
    const dateNow = vi.spyOn(Date, 'now').mockReturnValue(nowMs);

    // Everything that can throw after the temp dir and the clock mock exist
    // runs inside the try, so a setup failure cannot leak env overrides, the
    // console.log override, the Date.now mock, or the temp directory.
    try {
      const rosterPath = join(home, 'fleet', 'roster.yaml');
      const runDir = join(home, 'fleet', 'run');
      await mkdir(runDir, { recursive: true });
      await writeFile(
        rosterPath,
        [
          'version: 1',
          'transport: tmux',
          'agents:',
          // `runtime` is indented under its list item so each agent parses
          // as a single mapping with both keys.
          '  - name: idle-agent',
          '    runtime: pi',
          '  - name: stuck-agent',
          '    runtime: pi',
        ].join('\n'),
      );

      await writeFile(join(runDir, 'idle-agent.hb'), `ts=${hbTs}\npid=111\nstatus=ok\n`);
      await writeFile(join(runDir, 'stuck-agent.hb'), `ts=${hbTs}\npid=222\nstatus=ok\n`);

      process.env[IDLE_ENV] = '5';
      process.env[STUCK_ENV] = '30';

      // Fake command runner keyed on substrings of the joined command line.
      // NOTE(review): the pane rows look like "<pid> <command> <dead> <activity
      // epoch>" — confirm against buildTmuxListPanesCommand's format string.
      const runner: CommandRunner = async (command, args) => {
        const full = [command, ...args].join(' ');
        if (full.includes('list-sessions')) {
          return { stdout: 'idle-agent\nstuck-agent\n', stderr: '', exitCode: 0 };
        }
        if (full.includes('=idle-agent:0.0')) {
          return { stdout: `111 pi 0 ${idleActivityEpoch}\n`, stderr: '', exitCode: 0 };
        }
        if (full.includes('=stuck-agent:0.0')) {
          return { stdout: `222 pi 0 ${stuckActivityEpoch}\n`, stderr: '', exitCode: 0 };
        }
        if (full.includes('systemctl') && full.includes('show')) {
          return {
            stdout: 'ActiveState=active\nSubState=running\nUnitFileState=enabled\n',
            stderr: '',
            exitCode: 0,
          };
        }
        return { stdout: '', stderr: '', exitCode: 0 };
      };

      // Capture table output instead of printing it.
      console.log = (msg: string) => {
        lines.push(msg);
      };

      const program = new Command();
      program.exitOverride();
      registerFleetCommand(program, { runner, mosaicHome: home });

      await program.parseAsync(['node', 'mosaic', 'fleet', 'ps']);
    } finally {
      console.log = origLog;
      dateNow.mockRestore();
      restoreEnv(IDLE_ENV, savedIdle);
      restoreEnv(STUCK_ENV, savedStuck);
      await rm(home, { recursive: true, force: true });
    }

    const idleLine = lines.find((line) => line.includes('idle-agent'));
    const stuckLine = lines.find((line) => line.includes('stuck-agent'));

    expect(idleLine).toBeDefined();
    expect(idleLine).toContain('1s/idle');
    expect(idleLine).toMatch(/\bIDLE\b/);

    expect(stuckLine).toBeDefined();
    expect(stuckLine).toContain('1s/stuck');
    expect(stuckLine).toMatch(/\bSTUCK\b/);
  });
});
|
||||
|
||||
describe('buildTmuxListSessionsCommand', () => {
|
||||
it('builds exact list-sessions command with session_name format', () => {
|
||||
expect(buildTmuxListSessionsCommand('mosaic-fleet')).toEqual([
|
||||
@@ -1514,6 +1727,7 @@ describe('fleet ps — unmanaged socket sessions', () => {
|
||||
|
||||
// driftFlag must be false for unmanaged (no roster runtime to compare)
|
||||
expect(unmanagedRow.driftFlag).toBe(false);
|
||||
expect(unmanagedRow.readiness).toBe('unknown');
|
||||
});
|
||||
|
||||
it('shows UNMANAGED flag in table output for unmanaged sessions', async () => {
|
||||
|
||||
Reference in New Issue
Block a user