feat(fleet): classify agent readiness in fleet ps (#649)
All checks were successful
ci/woodpecker/push/publish Pipeline was successful
ci/woodpecker/push/ci Pipeline was successful

This commit was merged in pull request #649.
This commit is contained in:
2026-06-24 05:55:47 +00:00
parent e3adc6a1bc
commit d887555852
3 changed files with 377 additions and 4 deletions

View File

@@ -394,6 +394,8 @@ export function buildAgentTailCommand(agentName: string, lines: number, socketNa
// ---------------------------------------------------------------------------
export const HEARTBEAT_INTERVAL_MS = 15_000;
export const HEARTBEAT_IDLE_THRESHOLD_SECONDS = 300;
export const HEARTBEAT_STUCK_THRESHOLD_SECONDS = 900;
/**
* Heartbeat interval in ms, honoring MOSAIC_HEARTBEAT_INTERVAL (seconds) so the
@@ -404,8 +406,68 @@ export function heartbeatIntervalMs(): number {
const sec = Number.parseInt(process.env.MOSAIC_HEARTBEAT_INTERVAL ?? '', 10);
return Number.isFinite(sec) && sec > 0 ? sec * 1000 : HEARTBEAT_INTERVAL_MS;
}
/** Idle threshold in seconds, honoring MOSAIC_HEARTBEAT_IDLE_THRESHOLD. */
export function idleThresholdSeconds(): number {
const sec = Number.parseInt(process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD ?? '', 10);
return Number.isFinite(sec) && sec > 0 ? sec : HEARTBEAT_IDLE_THRESHOLD_SECONDS;
}
/** Stuck threshold in seconds, honoring MOSAIC_HEARTBEAT_STUCK_THRESHOLD. */
export function stuckThresholdSeconds(): number {
const sec = Number.parseInt(process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD ?? '', 10);
return Number.isFinite(sec) && sec > 0 ? sec : HEARTBEAT_STUCK_THRESHOLD_SECONDS;
}
export const HEARTBEAT_HEALTHY_MULTIPLIER = 3;
export type ReadinessState = 'working' | 'idle' | 'stuck' | 'stale' | 'dead' | 'unknown';
export interface ReadinessSignals {
paneAlive: boolean;
hbHealth: 'healthy' | 'stale' | 'unknown';
hbStatus: 'ok' | 'busy' | null;
idleSeconds: number | null;
}
export interface ReadinessThresholds {
idleThresholdSeconds: number;
stuckThresholdSeconds: number;
}
/**
* Classify whether an agent is progressing based on already-parsed heartbeat/tmux signals.
* Best-effort and runtime-agnostic: it never probes, never throws, and preserves existing
* unknown/stale behavior when heartbeat data is absent or old.
*/
export function classifyReadiness(
signals: Partial<ReadinessSignals> | null | undefined,
thresholds: Partial<ReadinessThresholds> | null | undefined = {},
): ReadinessState {
try {
if (signals?.paneAlive !== true) return 'dead';
if (signals.hbHealth === 'unknown' || signals.hbHealth === undefined) return 'unknown';
if (signals.hbHealth === 'stale') return 'stale';
if (signals.hbStatus === 'busy') return 'working';
if (signals.idleSeconds === null || signals.idleSeconds === undefined) return 'working';
const idleSeconds = Number.isFinite(signals.idleSeconds) ? signals.idleSeconds : null;
if (idleSeconds === null) return 'working';
const idleThreshold = Number.isFinite(thresholds?.idleThresholdSeconds)
? Number(thresholds?.idleThresholdSeconds)
: idleThresholdSeconds();
const stuckThreshold = Number.isFinite(thresholds?.stuckThresholdSeconds)
? Number(thresholds?.stuckThresholdSeconds)
: stuckThresholdSeconds();
if (idleSeconds >= stuckThreshold) return 'stuck';
if (idleSeconds >= idleThreshold) return 'idle';
return 'working';
} catch {
return 'unknown';
}
}
export interface HeartbeatInfo {
ts: Date | null;
pid: number | null;
@@ -429,6 +491,7 @@ export interface AgentPsRow {
paneCommand: string | null;
idleSeconds: number | null;
heartbeat: HeartbeatInfo;
readiness: ReadinessState;
/** roster runtime !== actual pane command */
driftFlag: boolean;
/** active but UnitFileState=disabled */
@@ -1022,6 +1085,10 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
const nowMs = Date.now();
const rows: AgentPsRow[] = [];
const readinessThresholds = {
idleThresholdSeconds: idleThresholdSeconds(),
stuckThresholdSeconds: stuckThresholdSeconds(),
};
// Build the set of roster agent names for quick lookup when filtering socket sessions.
const rosterAgentNames = new Set(roster.agents.map((a) => a.name));
@@ -1052,6 +1119,17 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
const bootEnableWarning =
sysInfo.ActiveState === 'active' && sysInfo.UnitFileState === 'disabled';
const paneAlive = !paneInfo.dead;
const readiness = classifyReadiness(
{
paneAlive,
hbHealth: hb.health,
hbStatus: hb.status,
idleSeconds: paneInfo.idleSeconds,
},
readinessThresholds,
);
rows.push({
name: agent.name,
tenant_id,
@@ -1059,11 +1137,12 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
runtime: agent.runtime,
systemdActive: sysInfo.ActiveState,
systemdEnabled: sysInfo.UnitFileState,
paneAlive: !paneInfo.dead,
paneAlive,
panePid: paneInfo.pid,
paneCommand: paneInfo.command,
idleSeconds: paneInfo.idleSeconds,
heartbeat: hb,
readiness,
driftFlag,
bootEnableWarning,
managed: true,
@@ -1110,6 +1189,17 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
const bootEnableWarning =
sysInfo.ActiveState === 'active' && sysInfo.UnitFileState === 'disabled';
const paneAlive = !paneInfo.dead;
const readiness = classifyReadiness(
{
paneAlive,
hbHealth: hb.health,
hbStatus: hb.status,
idleSeconds: paneInfo.idleSeconds,
},
readinessThresholds,
);
rows.push({
name: sessionName,
tenant_id,
@@ -1118,11 +1208,12 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
runtime: 'unknown',
systemdActive: sysInfo.ActiveState,
systemdEnabled: sysInfo.UnitFileState,
paneAlive: !paneInfo.dead,
paneAlive,
panePid: paneInfo.pid,
paneCommand: paneInfo.command,
idleSeconds: paneInfo.idleSeconds,
heartbeat: hb,
readiness,
// No roster runtime to compare — drift is not meaningful for unmanaged sessions
driftFlag: false,
bootEnableWarning,
@@ -1164,13 +1255,15 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
const idle = row.idleSeconds !== null ? `${row.idleSeconds}s` : '-';
const hbAge =
row.heartbeat.ageMs !== null
? `${Math.round(row.heartbeat.ageMs / 1000)}s/${row.heartbeat.health}`
? `${Math.round(row.heartbeat.ageMs / 1000)}s/${row.readiness}`
: `unknown`;
const model = row.heartbeat.model ?? '-';
const flags: string[] = [];
if (!row.managed) flags.push('UNMANAGED');
if (row.driftFlag) flags.push('DRIFT');
if (row.bootEnableWarning) flags.push('BOOT-ENABLE');
if (row.readiness === 'idle') flags.push('IDLE');
if (row.readiness === 'stuck') flags.push('STUCK');
console.log(
[