feat(fleet): auto-enable units on install + drift recognizes wrapped runtimes (#583)
Some checks failed
ci/woodpecker/push/ci Pipeline failed
ci/woodpecker/push/publish Pipeline was successful

This commit was merged in pull request #583.
This commit is contained in:
2026-06-21 20:02:19 +00:00
parent c2c0b5fe8d
commit afcbbb302f
2 changed files with 240 additions and 17 deletions

View File

@@ -210,6 +210,93 @@ export function buildFleetServiceCommand(action: FleetServiceAction, agentName?:
return ['systemctl', '--user', action, service];
}
/**
 * Build the argv for `systemctl --user enable <unit>`.
 *
 * The install auto-enable step uses this so that a unit is persisted
 * across reboots.
 *
 * @param unit - Name of the user unit to enable (e.g. `foo.service`).
 * @returns The command as an argv array, program name first.
 */
export function buildSystemdEnableCommand(unit: string): string[] {
  const argv: string[] = ['systemctl', '--user', 'enable'];
  argv.push(unit);
  return argv;
}
/**
 * Build the argv for `loginctl enable-linger <user>`.
 *
 * Linger lets a user's systemd services keep running after that user
 * logs out.
 *
 * @param user - Login name to enable linger for.
 * @returns The command as an argv array, program name first.
 */
export function buildEnableLingerCommand(user: string): string[] {
  const program = 'loginctl';
  const verb = 'enable-linger';
  return [program, verb, user];
}
/**
 * Enable fleet units for boot-survival after install.
 *
 * Steps, in order:
 *   1. `systemctl --user enable mosaic-tmux-holder.service`
 *   2. `systemctl --user enable mosaic-agent@<name>.service` for every roster agent
 *   3. best-effort `loginctl enable-linger <current user>`
 *
 * The whole step is non-fatal: a non-zero exit from any command only prints a
 * warning/hint on stderr, and an unexpected exception (for example the runner
 * throwing) is caught and reported as a warning. This function never rejects
 * because of a failed command.
 *
 * @param runner - Executes a command and resolves with its exit code and output.
 * @param roster - Fleet roster; one agent unit is enabled per `roster.agents` entry.
 * @param opts - `enable === false` (the `--no-enable` flag) skips the whole step.
 */
export async function enableFleetUnits(
  runner: CommandRunner,
  roster: FleetRoster,
  opts: { enable?: boolean },
): Promise<void> {
  if (opts.enable === false) {
    return;
  }

  // Enables a single unit; on failure writes the per-unit warning and reports false.
  const tryEnable = async (unit: string): Promise<boolean> => {
    const result = await runner(...splitCommand(buildSystemdEnableCommand(unit)));
    if (result.exitCode === 0) {
      return true;
    }
    process.stderr.write(
      `Warning: could not enable ${unit}: ${result.stderr || result.stdout || 'non-zero exit'}\n`,
    );
    return false;
  };

  try {
    let succeeded = 0;
    let failed = 0;
    const tally = (ok: boolean): void => {
      if (ok) {
        succeeded++;
      } else {
        failed++;
      }
    };

    // The tmux holder is enabled first, then the per-agent template instances.
    tally(await tryEnable('mosaic-tmux-holder.service'));
    for (const agent of roster.agents) {
      tally(await tryEnable(`mosaic-agent@${agent.name}.service`));
    }

    if (succeeded > 0) {
      console.log(`Enabled ${succeeded} unit(s) for boot-survival.`);
    }
    if (failed > 0) {
      process.stderr.write(
        `Warning: ${failed} unit(s) could not be enabled (systemctl unavailable?). Run manually if needed.\n`,
      );
    }

    // Best-effort linger. userInfo() can throw, so fall back to the environment.
    let username: string | undefined;
    try {
      username = userInfo().username;
    } catch {
      // `||` (not `??`) so an empty USER falls through to LOGNAME.
      username = process.env['USER'] || process.env['LOGNAME'];
    }

    if (!username) {
      // Do not invoke loginctl with a made-up user name: it would fail and the
      // resulting hint would tell the operator to run a command that cannot work.
      process.stderr.write(
        `Hint: could not determine the current user; run 'loginctl enable-linger <user>' as root to survive logout.\n`,
      );
      return;
    }

    const lingerResult = await runner(...splitCommand(buildEnableLingerCommand(username)));
    if (lingerResult.exitCode !== 0) {
      process.stderr.write(
        `Hint: run 'loginctl enable-linger ${username}' as root to survive logout.\n`,
      );
    }
  } catch (err) {
    process.stderr.write(
      `Warning: auto-enable step failed unexpectedly: ${err instanceof Error ? err.message : String(err)}\n`,
    );
  }
}
export function buildAgentSendCommand(
paths: FleetPaths,
agentName: string,
@@ -437,32 +524,41 @@ export function parseTmuxListPanes(
return { pid, command, dead, idleSeconds };
}
/**
 * Maps each known runtime to the set of acceptable pane commands.
 * A pane running any of these commands for the given runtime is NOT considered drifted.
 * Runtimes launched via `mosaic yolo` wrap in node, so 'node' is acceptable for most.
 * The dogfood runtime accepts python3/python (the canary-pi dogfood stub).
 */
export const RUNTIME_ACCEPTABLE_COMMANDS: Record<string, readonly string[]> = {
  claude: ['claude', 'node'],
  codex: ['codex', 'node'],
  opencode: ['opencode', 'node'],
  pi: ['pi', 'node'],
  dogfood: ['python3', 'python'],
};

/**
 * Determine if there is a runtime drift: the roster declares one runtime but
 * the pane is running a command that is not acceptable for that runtime.
 *
 * Acceptable commands per runtime come from RUNTIME_ACCEPTABLE_COMMANDS:
 *   claude   → claude, node   (node covers the mosaic yolo wrapper)
 *   codex    → codex, node
 *   opencode → opencode, node
 *   pi       → pi, node       (python3 still flags drift for the canary-pi dogfood stub)
 *   dogfood  → python3, python
 *
 * Example: a pane running python3 for an agent whose roster runtime is "pi"
 * is a drift. Because 'node' is accepted for every wrapped runtime, a pane
 * showing 'node' cannot be told apart between those runtimes here.
 *
 * @param rosterRuntime - Runtime name declared in the roster.
 * @param paneCommand - Command reported for the pane, or null when unknown.
 * @returns true only when the runtime is known and the pane command is not in
 *   its acceptable list; false for a missing/empty pane command or an
 *   unrecognized runtime.
 */
export function detectDrift(rosterRuntime: string, paneCommand: string | null): boolean {
  // No pane command (null or empty) means there is nothing to compare against.
  if (!paneCommand) return false;

  // Own-property check: a runtime string such as "constructor" or "__proto__"
  // must be treated as unknown instead of resolving to an Object.prototype member.
  if (!Object.prototype.hasOwnProperty.call(RUNTIME_ACCEPTABLE_COMMANDS, rosterRuntime)) {
    return false;
  }

  const acceptable = RUNTIME_ACCEPTABLE_COMMANDS[rosterRuntime];
  if (!acceptable) return false;
  return !acceptable.includes(paneCommand);
}
/**
@@ -706,12 +802,22 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
cmd
.command('install')
.description('Install local fleet tools and user systemd units')
.action(async () => installFleet(cmd, frameworkRoot));
.option('--no-enable', 'Skip enabling units for boot-survival')
.action(async (opts: { enable?: boolean }) => {
await installFleet(cmd, frameworkRoot);
const roster = await loadRosterForCommand(cmd);
await enableFleetUnits(runner, roster, opts);
});
cmd
.command('install-systemd')
.description('Install local fleet tools and user systemd units')
.action(async () => installFleet(cmd, frameworkRoot));
.option('--no-enable', 'Skip enabling units for boot-survival')
.action(async (opts: { enable?: boolean }) => {
await installFleet(cmd, frameworkRoot);
const roster = await loadRosterForCommand(cmd);
await enableFleetUnits(runner, roster, opts);
});
for (const action of ['start', 'stop', 'restart'] as const) {
cmd