diff --git a/docs/TASKS.md b/docs/TASKS.md index e9dda35..d2a935c 100644 --- a/docs/TASKS.md +++ b/docs/TASKS.md @@ -58,3 +58,7 @@ Active workstream is **W1 — Federation v1**. Workers should: ## F3-m3 — mosaic update re-seeds framework + relaunches agents (#609) — feat/f3-m3-update-reseed - Status: implemented + tested. Closes R13: `mosaic update` now re-seeds the framework (data-safe MOSAIC_SYNC_ONLY) after the CLI install so shipped launcher/runtime changes activate; `--relaunch` restarts rostered agents; `--no-reseed` opts out. Detail: scratchpads/f3-m3-update-reseed.md. + +## Fleet-polish bundle — boot-survival symmetry (#611) — feat/fleet-polish-bundle + +- Status: implemented + tested. disable-on-remove (boot-resurrection bug, TDD) + add-enable + init-R5 hard guarantee. 4 new + 147 existing fleet tests green. Detail: scratchpads/fleet-polish-bundle.md. diff --git a/docs/scratchpads/fleet-polish-bundle.md b/docs/scratchpads/fleet-polish-bundle.md new file mode 100644 index 0000000..b76e4f9 --- /dev/null +++ b/docs/scratchpads/fleet-polish-bundle.md @@ -0,0 +1,20 @@ +# Fleet-polish bundle — boot-survival symmetry (#611) + +- **Issue:** #611 · **Branch:** `feat/fleet-polish-bundle` · From the Lead's Codex symmetry-gap finding. + +## Three fixes + +1. **disable-on-remove (BUG, TDD).** `fleet remove` stopped + deleted roster/env/heartbeat but never ran + `systemctl --user disable mosaic-agent@NAME.service` → a removed-but-enabled unit could resurrect on + reboot pointing at deleted config. Fix: `buildSystemdDisableCommand` + disable in `remove` + (best-effort, skipped when `--keep-files` is set). +2. **add-enable.** `fleet add` now enables the new agent's unit for boot-survival (best-effort, + independent of --start) — symmetry with disable-on-remove. +3. **init-R5 guarantee.** `fleet init --write` now FAILS HARD when a non-minimal profile doesn't yield + exactly one orchestrator (was a soft warning). `minimal` (sanctioned no-orchestrator) still allowed. 
+ +## Verification + +- 4 new tests (disable builder; remove-invokes-disable; add-invokes-enable; init general → exactly 1 + orchestrator) + 147 existing fleet tests green (151 total). tsc/eslint/prettier clean. +- TDD on the disable bug per contract. diff --git a/packages/mosaic/src/commands/fleet.spec.ts b/packages/mosaic/src/commands/fleet.spec.ts index f0c3262..9714c77 100644 --- a/packages/mosaic/src/commands/fleet.spec.ts +++ b/packages/mosaic/src/commands/fleet.spec.ts @@ -14,6 +14,7 @@ import { buildEnableLingerCommand, buildFleetServiceCommand, buildSystemdEnableCommand, + buildSystemdDisableCommand, buildSystemdShowCommand, buildTmuxListPanesCommand, buildTmuxListSessionsCommand, @@ -983,6 +984,127 @@ describe('fleet ps — drift detection', () => { }); }); +describe('fleet-polish bundle — boot-survival symmetry', () => { + async function rosterHome(agents: string): Promise { + const home = await tempDir(); + await mkdir(join(home, 'fleet'), { recursive: true }); + await writeFile(join(home, 'fleet', 'roster.yaml'), agents); + return home; + } + + it('buildSystemdDisableCommand returns the systemctl --user disable array', () => { + expect(buildSystemdDisableCommand('mosaic-agent@coder0.service')).toEqual([ + 'systemctl', + '--user', + 'disable', + 'mosaic-agent@coder0.service', + ]); + }); + + it('fleet remove DISABLES the unit so a removed agent cannot resurrect on boot', async () => { + const home = await rosterHome( + [ + 'version: 1', + 'transport: tmux', + 'agents:', + ' - name: orchestrator', + ' runtime: pi', + ' class: orchestrator', + ' - name: coder0', + ' runtime: codex', + ' class: worker', + ].join('\n') + '\n', + ); + const calls: string[][] = []; + const runner: CommandRunner = async (command, args) => { + calls.push([command, ...args]); + return { stdout: '', stderr: '', exitCode: 0 }; + }; + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { runner, mosaicHome: home }); + try { + await 
program.parseAsync(['node', 'mosaic', 'fleet', 'remove', 'coder0']); + expect(calls).toContainEqual([ + 'systemctl', + '--user', + 'disable', + 'mosaic-agent@coder0.service', + ]); + // stop must still happen too + expect(calls).toContainEqual(['systemctl', '--user', 'stop', 'mosaic-agent@coder0.service']); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + + it('fleet add ENABLES the new agent unit for boot-survival', async () => { + const home = await rosterHome( + ['version: 1', 'transport: tmux', 'agents:', ' - name: coder0', ' runtime: codex'].join( + '\n', + ) + '\n', + ); + const calls: string[][] = []; + const runner: CommandRunner = async (command, args) => { + calls.push([command, ...args]); + return { stdout: '', stderr: '', exitCode: 0 }; + }; + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { runner, mosaicHome: home }); + try { + await program.parseAsync([ + 'node', + 'mosaic', + 'fleet', + 'add', + 'coder1', + '--runtime', + 'codex', + '--class', + 'worker', + '--no-start', + ]); + expect(calls).toContainEqual([ + 'systemctl', + '--user', + 'enable', + 'mosaic-agent@coder1.service', + ]); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + + it('fleet init --write fails hard when a non-minimal profile lacks exactly one orchestrator', async () => { + // The general profile must yield exactly one orchestrator; the guarantee is + // enforced (not just warned). We assert the happy path writes cleanly. 
+ const home = await tempDir(); + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { + runner: async () => ({ stdout: '', stderr: '', exitCode: 0 }), + mosaicHome: home, + }); + try { + await program.parseAsync([ + 'node', + 'mosaic', + 'fleet', + 'init', + '--profile', + 'general', + '--write', + ]); + const written = await readFile(join(home, 'fleet', 'roster.yaml'), 'utf8'); + const orchestrators = (written.match(/class:\s*orchestrator/g) ?? []).length; + expect(orchestrators).toBe(1); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); +}); + describe('fleet install — auto-enable units for boot-survival', () => { it('buildSystemdEnableCommand and buildEnableLingerCommand return correct command arrays', () => { expect(buildSystemdEnableCommand('mosaic-tmux-holder.service')).toEqual([ diff --git a/packages/mosaic/src/commands/fleet.ts b/packages/mosaic/src/commands/fleet.ts index e51ef0b..cbba0cb 100644 --- a/packages/mosaic/src/commands/fleet.ts +++ b/packages/mosaic/src/commands/fleet.ts @@ -227,6 +227,15 @@ export function buildSystemdEnableCommand(unit: string): string[] { return ['systemctl', '--user', 'enable', unit]; } +/** + * Returns the systemctl --user disable command for a given unit. + * Used by `fleet remove` so a removed agent's enabled unit cannot resurrect on + * boot pointing at deleted config (boot-survival symmetry with enable-on-add). + */ +export function buildSystemdDisableCommand(unit: string): string[] { + return ['systemctl', '--user', 'disable', unit]; +} + /** * Returns the loginctl enable-linger command for a given user. * Linger allows user systemd services to survive logout. @@ -872,15 +881,19 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = await mkdir(dirname(destination), { recursive: true }); await writeFile(destination, content); - // Validate: exactly one orchestrator required (R5) — friendly summary on success. 
+ // Guarantee R5: exactly one orchestrator for every profile except the + // sanctioned no-orchestrator `minimal` preset. A mismatch means a + // corrupted/edited preset — fail hard rather than write a malformed fleet. const written = await loadFleetRoster(destination); const orchCount = countOrchestrators(written); - if (orchCount !== 1) { - process.stderr.write( - `Warning: fleet roster at ${destination} has ${orchCount} orchestrator agent(s) (expected exactly 1).\n`, - ); + if (profile === 'minimal') { console.log( - `Initialized ${profile} fleet: ${written.agents.length} agent(s). Next: mosaic fleet install`, + `Initialized ${profile} fleet: ${written.agents.length} agent(s) (no orchestrator). Next: mosaic fleet install`, + ); + } else if (orchCount !== 1) { + throw new Error( + `Fleet init failed: the "${profile}" roster has ${orchCount} orchestrator agent(s), ` + + `expected exactly 1 (R5). The preset may be corrupted — re-install the framework.`, ); } else { const workerCount = written.agents.length - 1; @@ -1218,6 +1231,24 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = console.log(`Added ${name} (${opts.runtime}/${opts.class}) to the fleet.`); + // Enable the unit for boot-survival (non-fatal) — symmetry with + // disable-on-remove. Independent of --start so a queued agent still + // survives a reboot once its unit exists. + try { + const enableResult = await runner( + ...splitCommand(buildSystemdEnableCommand(`mosaic-agent@${name}.service`)), + ); + if (enableResult.exitCode !== 0) { + process.stderr.write( + `Warning: could not enable mosaic-agent@${name}.service: ${enableResult.stderr || enableResult.stdout || 'non-zero exit'}\n`, + ); + } + } catch (err) { + process.stderr.write( + `Warning: enable command failed for ${name}: ${err instanceof Error ? 
err.message : String(err)}\n`, + ); + } + if (opts.start !== false) { await runChecked(runner, buildFleetServiceCommand('start', name)); console.log(`Started mosaic-agent@${name}.service.`); @@ -1254,6 +1285,26 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = ); } + // Disable the unit (non-fatal) so an enabled instance cannot resurrect on + // boot pointing at the now-deleted config — boot-survival symmetry with + // enable-on-add. Skipped only when --keep-files keeps the config in place. + if (!opts.keepFiles) { + try { + const disableResult = await runner( + ...splitCommand(buildSystemdDisableCommand(`mosaic-agent@${name}.service`)), + ); + if (disableResult.exitCode !== 0) { + process.stderr.write( + `Warning: could not disable mosaic-agent@${name}.service: ${disableResult.stderr || disableResult.stdout || 'non-zero exit'}\n`, + ); + } + } catch (err) { + process.stderr.write( + `Warning: disable command failed for ${name}: ${err instanceof Error ? err.message : String(err)}\n`, + ); + } + } + // Write updated roster await writeFile(rosterPath, serializeRosterToYaml(updatedRoster));