From 9c2e4f0b2dfdc3983538a56dd529976768b9f750 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Wed, 24 Jun 2026 17:05:36 -0500 Subject: [PATCH 1/3] fix(fleet): guard `mosaic fleet restart` against tight-loop re-entry race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `mosaic fleet restart` runs as a fresh process each invocation, issuing `systemctl --user restart` for the tmux holder and then each agent. The agent sessions live inside the holder's tmux, so restarting the holder tears them down. With no mutual exclusion, a second restart entering while the first is still mid-teardown (upgrade `--relaunch`, a watchdog, or a hurried operator) interleaves: agents relaunch against a half-torn-down holder, fail, and tight-loop. Add a cross-process teardown-settle guard: a lock file under `<mosaicHome>/fleet/run/restart.lock` acquired with O_CREAT|O_EXCL. A re-entrant restart waits (bounded, injectable sleep) for the in-flight restart to release the lock before relaunching, breaks a stale lock left by a crashed owner, and after a max wait breaks the lock to avoid a permanent deadlock. Both full-fleet and single-agent restart paths are guarded; start/stop/status are unchanged. Regression test reproduces the race: with an in-flight lock held, the restart must wait before issuing any systemctl command — fails on the unguarded code path, passes with the guard. Adds stale-lock-break and lock-release coverage. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/mosaic/src/commands/fleet.spec.ts | 140 ++++++++++++++++++++ packages/mosaic/src/commands/fleet.ts | 142 ++++++++++++++++++++- 2 files changed, 281 insertions(+), 1 deletion(-) diff --git a/packages/mosaic/src/commands/fleet.spec.ts b/packages/mosaic/src/commands/fleet.spec.ts index b1bf5ea..9545170 100644 --- a/packages/mosaic/src/commands/fleet.spec.ts +++ b/packages/mosaic/src/commands/fleet.spec.ts @@ -45,6 +45,8 @@ import { removeAgentFromRoster, resolveFleetPaths, resolvePresetFilename, + restartLockPath, + RESTART_LOCK_STALE_MS, RUNTIME_ACCEPTABLE_COMMANDS, serializeRosterToYaml, VERIFY_DEFAULT_TIMEOUT_MS, @@ -678,6 +680,144 @@ describe('fleet command construction', () => { } }); + it('waits for an in-flight restart to clear before relaunching (re-entry guard)', async () => { + const home = await tempDir(); + const rosterPath = join(home, 'fleet', 'roster.yaml'); + await mkdir(join(home, 'fleet'), { recursive: true }); + await writeFile( + rosterPath, + ['version: 1', 'transport: tmux', 'agents:', ' - name: coder0', ' runtime: codex'].join( + '\n', + ), + ); + + // Simulate another `mosaic fleet restart` process mid-teardown: a fresh lock + // (recent timestamp, so it is NOT treated as stale) already held. + const lockPath = restartLockPath(home); + await mkdir(dirname(lockPath), { recursive: true }); + await writeFile(lockPath, `4242\n${Date.now()}\n`); + + const events: string[] = []; + const runner: CommandRunner = async (command, args) => { + events.push(`run:${args[args.length - 1]}`); + return { stdout: '', stderr: '', exitCode: 0 }; + }; + // The injected sleep stands in for time passing while we wait; the in-flight + // restart "finishes" (releases its lock) after the first poll. 
+ let sleeps = 0; + const sleepFn: SleepFn = async () => { + sleeps += 1; + events.push(`sleep:${sleeps}`); + await rm(lockPath, { force: true }); + }; + + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { runner, sleepFn, mosaicHome: home }); + + try { + await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']); + + // It must have waited at least once before issuing any systemctl restart. + expect(sleeps).toBeGreaterThan(0); + const firstSleep = events.findIndex((e) => e.startsWith('sleep:')); + const firstRun = events.findIndex((e) => e.startsWith('run:')); + expect(firstSleep).toBeGreaterThanOrEqual(0); + expect(firstRun).toBeGreaterThan(firstSleep); + + // And it still performs the full restart once the lock clears. + expect(events).toContain('run:mosaic-tmux-holder.service'); + expect(events).toContain('run:mosaic-agent@coder0.service'); + + // The lock is released after the restart completes. + await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' }); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + + it('breaks a stale restart lock and proceeds without waiting', async () => { + const home = await tempDir(); + const rosterPath = join(home, 'fleet', 'roster.yaml'); + await mkdir(join(home, 'fleet'), { recursive: true }); + await writeFile( + rosterPath, + ['version: 1', 'transport: tmux', 'agents:', ' - name: coder0', ' runtime: codex'].join( + '\n', + ), + ); + + // A lock left behind by a crashed owner: timestamp older than the stale window. 
+ const lockPath = restartLockPath(home); + await mkdir(dirname(lockPath), { recursive: true }); + await writeFile(lockPath, `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n`); + + const calls: string[][] = []; + const runner: CommandRunner = async (command, args) => { + calls.push([command, ...args]); + return { stdout: '', stderr: '', exitCode: 0 }; + }; + const sleepFn = vi.fn(async () => {}); + + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { runner, sleepFn, mosaicHome: home }); + + try { + await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']); + + // Stale lock is broken immediately — no waiting. + expect(sleepFn).not.toHaveBeenCalled(); + expect(calls).toEqual([ + ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'], + ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'], + ]); + // The stale lock is gone once the restart completes. + await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' }); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + + it('releases the restart lock so a subsequent restart is not blocked', async () => { + const home = await tempDir(); + const rosterPath = join(home, 'fleet', 'roster.yaml'); + await mkdir(join(home, 'fleet'), { recursive: true }); + await writeFile( + rosterPath, + ['version: 1', 'transport: tmux', 'agents:', ' - name: coder0', ' runtime: codex'].join( + '\n', + ), + ); + + const calls: string[][] = []; + const runner: CommandRunner = async (command, args) => { + calls.push([command, ...args]); + return { stdout: '', stderr: '', exitCode: 0 }; + }; + const sleepFn = vi.fn(async () => {}); + + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { runner, sleepFn, mosaicHome: home }); + + try { + await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']); + await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']); + + // Two 
sequential restarts both run fully and neither has to wait. + expect(sleepFn).not.toHaveBeenCalled(); + expect(calls).toEqual([ + ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'], + ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'], + ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'], + ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'], + ]); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => { const home = await tempDir(); const rosterPath = join(home, 'fleet', 'roster.yaml'); diff --git a/packages/mosaic/src/commands/fleet.ts b/packages/mosaic/src/commands/fleet.ts index 3149114..d343640 100644 --- a/packages/mosaic/src/commands/fleet.ts +++ b/packages/mosaic/src/commands/fleet.ts @@ -1,5 +1,14 @@ import { constants } from 'node:fs'; -import { access, chmod, copyFile, mkdir, readFile, unlink, writeFile } from 'node:fs/promises'; +import { + access, + chmod, + copyFile, + mkdir, + open, + readFile, + unlink, + writeFile, +} from 'node:fs/promises'; import { homedir, hostname, userInfo } from 'node:os'; import { dirname, join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -533,6 +542,108 @@ export function buildFleetServiceCommand(action: FleetServiceAction, agentName?: return ['systemctl', '--user', action, service]; } +/** Poll interval (ms) while waiting for an in-flight restart's lock to clear. */ +export const RESTART_LOCK_POLL_INTERVAL_MS = 250; +/** + * Maximum time (ms) a re-entrant restart waits for the in-flight restart to + * finish before it breaks the lock and proceeds anyway. A bound is required so + * a crashed holder of the lock can never deadlock the fleet permanently. 
+ */ +export const RESTART_LOCK_MAX_WAIT_MS = 30_000; +/** + * Age (ms) past which a restart lock is treated as stale (its owner died + * without releasing it) and is broken immediately rather than waited on. + */ +export const RESTART_LOCK_STALE_MS = 60_000; + +/** + * Resolves the path of the cross-process restart lock for a given Mosaic home. + * Kept strictly under `/fleet/run` (not the heartbeat env override) + * so the lock is scoped to the same fleet the restart acts on. + */ +export function restartLockPath(mosaicHome: string): string { + return join(mosaicHome, 'fleet', 'run', 'restart.lock'); +} + +/** A held restart lock; `release()` removes the lock file (idempotent). */ +interface RestartGuard { + release(): Promise; +} + +/** + * Returns true when an existing lock file is stale: older than + * RESTART_LOCK_STALE_MS, or unreadable/unparseable (a corrupt or partially + * written lock left by a crashed owner). A vanished lock (ENOENT) is not stale — + * the next acquire attempt will simply succeed. + */ +async function isRestartLockStale(lockPath: string, now: number): Promise { + let raw: string; + try { + raw = await readFile(lockPath, 'utf8'); + } catch (err) { + if ((err as NodeJS.ErrnoException).code === 'ENOENT') { + return false; + } + return true; + } + const stampLine = raw.split('\n')[1] ?? ''; + const stamp = Number.parseInt(stampLine.trim(), 10); + if (!Number.isFinite(stamp)) { + return true; + } + return now - stamp >= RESTART_LOCK_STALE_MS; +} + +/** + * Acquire the fleet restart lock, serializing concurrent `mosaic fleet restart` + * invocations across processes. Each restart tears the tmux holder (and the + * agent sessions inside it) down and back up; without this guard a re-entrant + * restart relaunches agents against a half-torn-down holder, which fails and + * tight-loops. 
A re-entrant caller waits for the in-flight restart to release + * the lock (clean shutdown settled) before proceeding, breaks a stale lock left + * by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to + * avoid a permanent deadlock. + */ +async function acquireRestartLock(mosaicHome: string, sleepFn: SleepFn): Promise { + const lockPath = restartLockPath(mosaicHome); + await mkdir(dirname(lockPath), { recursive: true }); + const release = async (): Promise => { + try { + await unlink(lockPath); + } catch { + // Already gone (broken as stale by another waiter, or never written) — fine. + } + }; + const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS; + for (;;) { + try { + const handle = await open(lockPath, 'wx'); + await handle.writeFile(`${process.pid}\n${Date.now()}\n`); + await handle.close(); + return { release }; + } catch (err) { + if ((err as NodeJS.ErrnoException).code !== 'EEXIST') { + throw err; + } + // A restart is already in flight (or its lock was left behind). + if (await isRestartLockStale(lockPath, Date.now())) { + process.stderr.write('Breaking stale fleet restart lock and proceeding.\n'); + await release(); + continue; + } + if (Date.now() >= deadline) { + process.stderr.write( + `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` + + 'restart; breaking the lock and proceeding.\n', + ); + await release(); + continue; + } + await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS); + } + } +} + /** * Returns the systemctl --user enable command for a given unit. * Used by the install auto-enable step to persist units across reboots. @@ -1172,6 +1283,7 @@ export function isSendAccepted(capturedOutput: string): SendVerifyResult { export function registerFleetCommand(program: Command, deps: FleetCommandDeps = {}): Command { const runner = deps.runner ?? runCommand; + const sleepFn = deps.sleepFn ?? defaultSleep; const paths = resolveFleetPaths(deps.mosaicHome); const frameworkRoot = deps.frameworkRoot ?? 
resolveFrameworkRoot(); @@ -1285,9 +1397,22 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = .command(`${action} [agent]`) .description(`${action} the fleet holder or one agent`) .action(async (agent?: string) => { + const commandOpts = cmd.opts<{ mosaicHome: string; roster?: string }>(); + const activePaths = resolveFleetPaths(commandOpts.mosaicHome); const roster = await loadRosterForCommand(cmd); if (agent) { getRosterAgent(roster, agent); + // Single-agent restart is guarded too: it can race a full restart that + // is tearing the shared holder down. + if (action === 'restart') { + const guard = await acquireRestartLock(activePaths.mosaicHome, sleepFn); + try { + await runChecked(runner, buildFleetServiceCommand(action, agent)); + } finally { + await guard.release(); + } + return; + } await runChecked(runner, buildFleetServiceCommand(action, agent)); return; } @@ -1298,6 +1423,21 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps = ); return; } + if (action === 'restart') { + // Serialize the holder+agents teardown/relaunch behind the restart lock + // so a re-entrant restart waits for clean shutdown before relaunching, + // instead of racing a half-torn-down holder into a tight loop. 
+ const guard = await acquireRestartLock(activePaths.mosaicHome, sleepFn); + try { + await runChecked(runner, buildFleetServiceCommand(action)); + for (const rosterAgent of roster.agents) { + await runChecked(runner, buildFleetServiceCommand(action, rosterAgent.name)); + } + } finally { + await guard.release(); + } + return; + } await runChecked(runner, buildFleetServiceCommand(action)); for (const rosterAgent of roster.agents) { await runChecked(runner, buildFleetServiceCommand(action, rosterAgent.name)); -- 2.49.1 From 43ad813e0d4bb0c648b09b8288a48dc1681135a0 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Wed, 24 Jun 2026 17:19:31 -0500 Subject: [PATCH 2/3] fix(fleet): make restart-lock release/break ownership-safe (review #680) Addresses the reviewer's blocker (comment 15915): release() unconditionally unlinked restart.lock, so after a stale/max-wait break an OLD owner could delete a NEWER owner's lock, letting a third restart interleave and defeating the guard. - Each acquire writes a unique owner token (randomUUID) into the lock file. - release() only unlinks while that token is still on disk; once another caller has broken and re-owned the lock, the timed-out original owner's release() is a no-op and leaves the new owner's lock intact. - Breaking a stale/timed-out lock now takes ownership atomically via write-temp + rename (atomic replace) instead of a blind unlink-then-recreate; a breaker that loses a concurrent takeover reads back a foreign token and keeps waiting rather than assuming ownership. Regression test (does not let a timed-out owner drop a lock another restart broke and re-owned) reproduces the three-restart interleave: R1 hangs (stale), R2 breaks + re-owns, R1.release() must NOT drop R2's lock. Fails on the old blind-unlink path (ENOENT), passes now. Also adds explicit single-agent restart-path guard coverage (review should-fix). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/mosaic/src/commands/fleet.spec.ts | 88 ++++++++++++++++++++ packages/mosaic/src/commands/fleet.ts | 97 ++++++++++++++++++---- 2 files changed, 170 insertions(+), 15 deletions(-) diff --git a/packages/mosaic/src/commands/fleet.spec.ts b/packages/mosaic/src/commands/fleet.spec.ts index 9545170..28bf7b0 100644 --- a/packages/mosaic/src/commands/fleet.spec.ts +++ b/packages/mosaic/src/commands/fleet.spec.ts @@ -4,6 +4,7 @@ import { dirname, join, resolve } from 'node:path'; import { Command } from 'commander'; import { afterEach, describe, expect, it, vi } from 'vitest'; import { + acquireRestartLock, addAgentToRoster, buildAgentSendCommand, buildAgentWatchAttachCommand, @@ -818,6 +819,93 @@ describe('fleet command construction', () => { } }); + it('guards the single-agent restart path behind the in-flight restart lock', async () => { + const home = await tempDir(); + const rosterPath = join(home, 'fleet', 'roster.yaml'); + await mkdir(join(home, 'fleet'), { recursive: true }); + await writeFile( + rosterPath, + ['version: 1', 'transport: tmux', 'agents:', ' - name: coder0', ' runtime: codex'].join( + '\n', + ), + ); + + // A full restart is mid-flight (lock held); a single-agent restart re-enters. 
+ const lockPath = restartLockPath(home); + await mkdir(dirname(lockPath), { recursive: true }); + await writeFile(lockPath, `4242\n${Date.now()}\n`); + + const events: string[] = []; + const runner: CommandRunner = async (command, args) => { + events.push(`run:${args[args.length - 1]}`); + return { stdout: '', stderr: '', exitCode: 0 }; + }; + let sleeps = 0; + const sleepFn: SleepFn = async () => { + sleeps += 1; + events.push(`sleep:${sleeps}`); + await rm(lockPath, { force: true }); + }; + + const program = new Command(); + program.exitOverride(); + registerFleetCommand(program, { runner, sleepFn, mosaicHome: home }); + + try { + await program.parseAsync(['node', 'mosaic', 'fleet', 'restart', 'coder0']); + + // The single-agent restart waits for the in-flight restart before acting. + expect(sleeps).toBeGreaterThan(0); + const firstSleep = events.findIndex((e) => e.startsWith('sleep:')); + const firstRun = events.findIndex((e) => e.startsWith('run:')); + expect(firstSleep).toBeGreaterThanOrEqual(0); + expect(firstRun).toBeGreaterThan(firstSleep); + // Only the named agent is restarted; the holder is untouched. + expect(events).toContain('run:mosaic-agent@coder0.service'); + expect(events).not.toContain('run:mosaic-tmux-holder.service'); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + + it('does not let a timed-out owner drop a lock another restart broke and re-owned', async () => { + const home = await tempDir(); + const runDir = join(home, 'fleet', 'run'); + await mkdir(runDir, { recursive: true }); + const lockPath = restartLockPath(home); + const tokenOf = async (): Promise => { + const raw = await readFile(lockPath, 'utf8'); + return raw.split('\n')[2]?.trim() ?? ''; + }; + const sleepFn = vi.fn(async () => {}); + + // R1 acquires the lock and begins a restart that then hangs. 
+ const r1 = await acquireRestartLock(home, sleepFn); + const tokenR1 = await tokenOf(); + expect(tokenR1).not.toBe(''); + + // The hung R1 leaves a stale lock: rewrite its timestamp into the past while + // preserving R1's token — exactly the on-disk state a stuck owner leaves. + await writeFile(lockPath, `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n${tokenR1}\n`); + + // R2 re-enters, sees the stale lock, and atomically takes ownership. + const r2 = await acquireRestartLock(home, sleepFn); + const tokenR2 = await tokenOf(); + expect(tokenR2).not.toBe(tokenR1); + expect(sleepFn).not.toHaveBeenCalled(); + + // R1 finally finishes and releases. It must NOT delete R2's lock — otherwise + // a third restart (R3) could acquire and interleave with R2 still running. + await r1.release(); + expect(await tokenOf()).toBe(tokenR2); + + // R2 releases cleanly and the lock is gone. + await r2.release(); + await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' }); + + await rm(home, { recursive: true, force: true }); + }); + it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => { const home = await tempDir(); const rosterPath = join(home, 'fleet', 'roster.yaml'); diff --git a/packages/mosaic/src/commands/fleet.ts b/packages/mosaic/src/commands/fleet.ts index d343640..08260a6 100644 --- a/packages/mosaic/src/commands/fleet.ts +++ b/packages/mosaic/src/commands/fleet.ts @@ -6,9 +6,11 @@ import { mkdir, open, readFile, + rename, unlink, writeFile, } from 'node:fs/promises'; +import { randomUUID } from 'node:crypto'; import { homedir, hostname, userInfo } from 'node:os'; import { dirname, join, resolve } from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -565,11 +567,32 @@ export function restartLockPath(mosaicHome: string): string { return join(mosaicHome, 'fleet', 'run', 'restart.lock'); } -/** A held restart lock; `release()` removes the lock file (idempotent). 
*/ +/** A held restart lock; `release()` removes the lock file iff we still own it. */ interface RestartGuard { release(): Promise; } +/** Lock-file contents: pid (informational), timestamp, and a unique owner token. */ +function formatRestartLockContent(token: string): string { + return `${process.pid}\n${Date.now()}\n${token}\n`; +} + +/** + * Reads the owner token (line 3) from a lock file, or null if the file is + * missing/unreadable/tokenless. The token is what makes release and break + * ownership-safe: a process only ever acts on a lock whose token matches its own. + */ +async function readRestartLockToken(lockPath: string): Promise { + let raw: string; + try { + raw = await readFile(lockPath, 'utf8'); + } catch { + return null; + } + const token = raw.split('\n')[2]?.trim(); + return token ? token : null; +} + /** * Returns true when an existing lock file is stale: older than * RESTART_LOCK_STALE_MS, or unreadable/unparseable (a corrupt or partially @@ -594,6 +617,31 @@ async function isRestartLockStale(lockPath: string, now: number): Promise= RESTART_LOCK_STALE_MS; } +/** + * Atomically take over an existing (stale or timed-out) lock WITHOUT blind + * unlinking it: write our own token to a temp file and `rename()` it over the + * lock. rename is atomic, so it replaces the prior owner's content in one step + * rather than the unsafe unlink-then-recreate (which a third restart could slip + * between). Returns true only if our token is the one on disk afterwards — if a + * concurrent breaker raced and won, we read back their token and return false so + * the caller keeps waiting instead of assuming ownership. 
+ */ +async function breakAndOwnRestartLock( + lockPath: string, + token: string, + content: string, +): Promise { + const tmpPath = `${lockPath}.${token}`; + await writeFile(tmpPath, content); + try { + await rename(tmpPath, lockPath); + } catch (err) { + await unlink(tmpPath).catch(() => {}); + throw err; + } + return (await readRestartLockToken(lockPath)) === token; +} + /** * Acquire the fleet restart lock, serializing concurrent `mosaic fleet restart` * invocations across processes. Each restart tears the tmux holder (and the @@ -603,22 +651,38 @@ async function isRestartLockStale(lockPath: string, now: number): Promise { +export async function acquireRestartLock( + mosaicHome: string, + sleepFn: SleepFn, +): Promise { + const token = randomUUID(); const lockPath = restartLockPath(mosaicHome); await mkdir(dirname(lockPath), { recursive: true }); const release = async (): Promise => { + // Ownership-safe: only remove the lock if it is still ours. If another + // caller broke and re-owned it (after a stale/timeout break), the token no + // longer matches and we must leave their lock intact. + if ((await readRestartLockToken(lockPath)) !== token) { + return; + } try { await unlink(lockPath); } catch { - // Already gone (broken as stale by another waiter, or never written) — fine. + // Raced away between the token check and unlink — nothing more to do. } }; const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS; for (;;) { try { const handle = await open(lockPath, 'wx'); - await handle.writeFile(`${process.pid}\n${Date.now()}\n`); + await handle.writeFile(formatRestartLockContent(token)); await handle.close(); return { release }; } catch (err) { @@ -626,17 +690,20 @@ async function acquireRestartLock(mosaicHome: string, sleepFn: SleepFn): Promise throw err; } // A restart is already in flight (or its lock was left behind). 
- if (await isRestartLockStale(lockPath, Date.now())) { - process.stderr.write('Breaking stale fleet restart lock and proceeding.\n'); - await release(); - continue; - } - if (Date.now() >= deadline) { - process.stderr.write( - `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` + - 'restart; breaking the lock and proceeding.\n', - ); - await release(); + const stale = await isRestartLockStale(lockPath, Date.now()); + const timedOut = Date.now() >= deadline; + if (stale || timedOut) { + if (await breakAndOwnRestartLock(lockPath, token, formatRestartLockContent(token))) { + process.stderr.write( + stale + ? 'Breaking stale fleet restart lock and proceeding.\n' + : `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` + + 'restart; breaking the lock and proceeding.\n', + ); + return { release }; + } + // A concurrent breaker won the takeover; back off and re-evaluate. + await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS); continue; } await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS); -- 2.49.1 From 786762587d435856382a758607cc7894bf68c52f Mon Sep 17 00:00:00 2001 From: Jarvis Date: Wed, 24 Jun 2026 20:26:39 -0500 Subject: [PATCH 3/3] fix(fleet): serialize restart-lock transitions to close concurrent-breaker race (review #680) Stale/max-wait takeover was not safe against concurrent breakers: two breakers could both judge the lock stale and both proceed, re-introducing the tight-loop. POSIX/Node has no content- or inode-conditional unlink or rename, so "judge stale, then replace" can never be atomic with pure path ops. Serialize ALL lock transitions (acquire, release, takeover) under one short-lived registry mutex held only across a few fs ops, never across the restart itself. This makes check-then-mutate atomic, so exactly one breaker can take over a stale lock while the others wait and re-evaluate. 
The mutex itself uses mtime-based staleness (open('wx') creates an empty inode before the token is written; a content check would reap a lock that is still being acquired). The mutex populates-or-cleans-up on write failure so a half-created mutex never leaks. Regression coverage at two widths: a 2-breaker barrier test (exactly one takes over, the other waits) and the existing 3-breaker test (maxActive===1, distinct tokens, final lock released). Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/mosaic/src/commands/fleet.spec.ts | 133 +++++++++++ packages/mosaic/src/commands/fleet.ts | 254 +++++++++++++++------ 2 files changed, 321 insertions(+), 66 deletions(-) diff --git a/packages/mosaic/src/commands/fleet.spec.ts b/packages/mosaic/src/commands/fleet.spec.ts index 28bf7b0..641c060 100644 --- a/packages/mosaic/src/commands/fleet.spec.ts +++ b/packages/mosaic/src/commands/fleet.spec.ts @@ -906,6 +906,139 @@ describe('fleet command construction', () => { await rm(home, { recursive: true, force: true }); }); + it('lets only one of several concurrent breakers proceed past a stale lock', async () => { + const home = await tempDir(); + const lockPath = restartLockPath(home); + await mkdir(dirname(lockPath), { recursive: true }); + + // A stale lock left by a crashed owner: every concurrent re-entrant restart + // will judge it stale and try to break it at the same instant. Breaking must + // NOT grant ownership — only the atomic re-create may — so exactly one + // contender can ever hold the lock at a time. (The v2 fix wrote our own token + // during the break and read it back, so two breakers each saw their own token + // and BOTH proceeded; this guards that regression.) + await writeFile( + lockPath, + `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`, + ); + + // Yielding sleep so a waiting contender lets the current owner finish and + // release before it re-contends, instead of spinning the microtask queue. 
+ const sleepFn: SleepFn = async () => { + await new Promise((res) => setTimeout(res, 0)); + }; + + let active = 0; + let maxActive = 0; + const tokens: string[] = []; + const tokenOf = async (): Promise => { + const raw = await readFile(lockPath, 'utf8'); + return raw.split('\n')[2]?.trim() ?? ''; + }; + + // One "restart" = acquire the lock, do work in the critical section, release. + const restartOnce = async (): Promise => { + const guard = await acquireRestartLock(home, sleepFn); + active += 1; + maxActive = Math.max(maxActive, active); + // Record the token we own while we hold it, then yield to interleave with + // any other contender that might (wrongly) believe it owns the lock too. + tokens.push(await tokenOf()); + await new Promise((res) => setTimeout(res, 0)); + active -= 1; + await guard.release(); + }; + + try { + // Three breakers race the single stale lock simultaneously. + await Promise.all([restartOnce(), restartOnce(), restartOnce()]); + + // Mutual exclusion held: never two owners at once despite concurrent breaks. + expect(maxActive).toBe(1); + // Each acquire owned with its own distinct token — no two ever shared it. + expect(new Set(tokens).size).toBe(3); + // The lock is fully released at the end. + await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' }); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + + it('lets exactly one of two breakers take over a stale lock while the other waits', async () => { + const home = await tempDir(); + const lockPath = restartLockPath(home); + await mkdir(dirname(lockPath), { recursive: true }); + + // A single stale lock both contenders will judge stale at the same instant. + // Every transition runs under the registry mutex, so only one may take the + // lock over; the other must observe a now-fresh owner and WAIT/re-evaluate + // rather than also taking over. 
(A content-blind clobber let both believe + // they owned it — this asserts the mutex-gated CAS takeover instead.) + await writeFile( + lockPath, + `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`, + ); + + // Barrier the winner holds against until the loser has observed the lock + // fresh and waited at least once — forcing the exact interleaving where one + // proceeds while the other waits, deterministically rather than by timing. + let resolveLoserWaited: () => void = () => {}; + const loserWaited = new Promise((res) => { + resolveLoserWaited = res; + }); + let sleeps = 0; + const sleepFn: SleepFn = async () => { + sleeps += 1; + resolveLoserWaited(); + await new Promise((res) => setTimeout(res, 0)); + }; + + let active = 0; + let maxActive = 0; + const tokens: string[] = []; + const tokenOf = async (): Promise => { + const raw = await readFile(lockPath, 'utf8'); + return raw.split('\n')[2]?.trim() ?? ''; + }; + + let firstOwner = true; + const restartOnce = async (): Promise => { + const guard = await acquireRestartLock(home, sleepFn); + active += 1; + maxActive = Math.max(maxActive, active); + tokens.push(await tokenOf()); + if (firstOwner) { + // Winner: keep holding the lock until the loser has waited once, so the + // loser is guaranteed to see a FRESH owner (not the stale one) and back + // off — proving it could not also take over. + firstOwner = false; + await loserWaited; + } else { + await new Promise((res) => setTimeout(res, 0)); + } + active -= 1; + await guard.release(); + }; + + try { + // Exactly two breakers race the single stale lock. + await Promise.all([restartOnce(), restartOnce()]); + + // Mutual exclusion: never two owners at once (if both took over the stale + // lock, this would be 2). + expect(maxActive).toBe(1); + // Both eventually owned, each with its own distinct token. 
+ expect(new Set(tokens).size).toBe(2); + // The loser observed the winner's fresh lock and waited — it did NOT also + // take over the stale lock. + expect(sleeps).toBeGreaterThanOrEqual(1); + // The lock is fully released at the end. + await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' }); + } finally { + await rm(home, { recursive: true, force: true }); + } + }); + it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => { const home = await tempDir(); const rosterPath = join(home, 'fleet', 'roster.yaml'); diff --git a/packages/mosaic/src/commands/fleet.ts b/packages/mosaic/src/commands/fleet.ts index 08260a6..d287db5 100644 --- a/packages/mosaic/src/commands/fleet.ts +++ b/packages/mosaic/src/commands/fleet.ts @@ -6,7 +6,7 @@ import { mkdir, open, readFile, - rename, + stat, unlink, writeFile, } from 'node:fs/promises'; @@ -594,21 +594,10 @@ async function readRestartLockToken(lockPath: string): Promise { } /** - * Returns true when an existing lock file is stale: older than - * RESTART_LOCK_STALE_MS, or unreadable/unparseable (a corrupt or partially - * written lock left by a crashed owner). A vanished lock (ENOENT) is not stale — - * the next acquire attempt will simply succeed. + * Returns true when a lock's contents are stale: older than RESTART_LOCK_STALE_MS, + * or unparseable (a corrupt or partially written lock left by a crashed owner). */ -async function isRestartLockStale(lockPath: string, now: number): Promise { - let raw: string; - try { - raw = await readFile(lockPath, 'utf8'); - } catch (err) { - if ((err as NodeJS.ErrnoException).code === 'ENOENT') { - return false; - } - return true; - } +function isRestartLockContentStale(raw: string, now: number): boolean { const stampLine = raw.split('\n')[1] ?? 
''; const stamp = Number.parseInt(stampLine.trim(), 10); if (!Number.isFinite(stamp)) { @@ -618,28 +607,139 @@ async function isRestartLockStale(lockPath: string, now: number): Promise { - const tmpPath = `${lockPath}.${token}`; - await writeFile(tmpPath, content); +function restartMutexPath(lockPath: string): string { + return `${lockPath}.mutex`; +} + +/** Brief back-off between registry-mutex acquisition attempts (held microseconds). */ +const RESTART_MUTEX_RETRY_MS = 20; + +/** + * Staleness for the internal mutex / reclaim locks, judged by the file's mtime + * rather than its CONTENT. `open(path, 'wx')` creates the inode (with a fresh + * mtime) before any token/timestamp is written into it, so a content-based check + * would momentarily see that empty file as corrupt-and-stale and could reap a + * lock another contender is still acquiring. mtime is set atomically at creation, + * so a just-created lock always reads as live; only a lock whose holder died and + * stopped touching it ages past the threshold. These locks are never held across + * the restart itself (only a couple of filesystem ops), so any mtime this old can + * belong only to a dead holder. + */ +async function isRestartLockPathStale(path: string, now: number): Promise { try { - await rename(tmpPath, lockPath); + const info = await stat(path); + return now - info.mtimeMs >= RESTART_LOCK_STALE_MS; } catch (err) { - await unlink(tmpPath).catch(() => {}); - throw err; + if ((err as NodeJS.ErrnoException).code === 'ENOENT') { + return false; // Gone, not stale — the caller will re-contend. + } + return false; // Can't stat — treat as live and back off rather than reap. + } +} + +/** Path of the reclaim lock that serializes reaping of a crashed-holder mutex. */ +function restartReclaimPath(mutexPath: string): string { + return `${mutexPath}.reclaim`; +} + +/** + * Reap a registry mutex left behind by a process that CRASHED mid-transition — + * one whose file has aged past RESTART_LOCK_STALE_MS. 
Because the mutex is held + * only for a couple of filesystem ops (no sleeps, never across the restart), a + * mutex this old can only belong to a dead holder. + * + * The reap removes the dead mutex but never CREATES/holds it — acquisition stays + * the single `open('wx')` create in {@link acquireRestartMutex}, so exactly one + * contender wins ownership no matter how the reap and acquires interleave. The + * removal is made conditional by a dedicated reclaim lock: while it is held the + * dead mutex is stable (its dead holder will never touch it, and no other + * reclaimer can race), so re-reading it and removing it only if it is STILL stale + * is a true compare — a live holder's fresh mutex is never removed. This closes + * the reclaim race a content-blind rename-and-restore left open (a third + * contender slipping into the gap while a fresh mutex was moved aside). + */ +async function reclaimStaleRestartMutex(mutexPath: string): Promise { + const reclaimPath = restartReclaimPath(mutexPath); + let handle: Awaited>; + try { + handle = await open(reclaimPath, 'wx'); + } catch (err) { + if ((err as NodeJS.ErrnoException).code !== 'EEXIST') { + throw err; + } + // Someone is already reclaiming. If their reclaim lock is itself stale by + // mtime, its holder crashed mid-reap (the lock spans only a stat + unlink, + // microseconds) — clear it so a later pass can retry. Otherwise a live + // reclaimer has it; back off. Either way we do not reap the mutex this pass. + if (await isRestartLockPathStale(reclaimPath, Date.now())) { + await unlink(reclaimPath).catch(() => {}); + } + return; + } + try { + // Re-check the mutex UNDER the reclaim lock and remove it only if it is STILL + // stale by mtime. A live holder's mutex is fresh and is left untouched; a dead + // holder's mutex is stable here (its holder is gone and no other reclaimer can + // race us), so this re-check is authoritative. 
+ if (await isRestartLockPathStale(mutexPath, Date.now())) { + await unlink(mutexPath).catch(() => {}); + } + } finally { + await handle.close(); + await unlink(reclaimPath).catch(() => {}); + } +} + +/** + * Acquire the registry mutex, BLOCKING (with brief back-offs) until held, and + * return a token-gated release. This is the single point of mutual exclusion for + * the restart lock: acquire, release, and stale/timeout takeover all run under it, + * so "read the lock, then mutate it" is atomic — no acquirer, releaser, or breaker + * can ever interleave with another. A mutex left by a crashed holder is reclaimed + * once it ages past the stale threshold. + */ +async function acquireRestartMutex( + mutexPath: string, + token: string, +): Promise { + for (;;) { + let handle: Awaited>; + try { + handle = await open(mutexPath, 'wx'); + } catch (err) { + if ((err as NodeJS.ErrnoException).code !== 'EEXIST') { + throw err; + } + // Staleness is judged by mtime, not content, so a mutex that exists but has + // not yet had its token written (the open-before-write window) reads as live + // and is never wrongly reaped. + if (!(await isRestartLockPathStale(mutexPath, Date.now()))) { + // A live holder has it — it will be gone in microseconds. Back off briefly. + await new Promise((resolve) => setTimeout(resolve, RESTART_MUTEX_RETRY_MS)); + continue; + } + await reclaimStaleRestartMutex(mutexPath); + continue; + } + // We created the mutex. Populate it with our token; if writing fails, clean up + // our own file so we never leak an empty mutex that a peer would have to reap. 
+ try { + await handle.writeFile(formatRestartLockContent(token)); + await handle.close(); + } catch (err) { + await handle.close().catch(() => {}); + await unlink(mutexPath).catch(() => {}); + throw err; + } + return async (): Promise => { + if ((await readRestartLockToken(mutexPath)) !== token) return; + await unlink(mutexPath).catch(() => {}); + }; } - return (await readRestartLockToken(lockPath)) === token; } /** @@ -652,11 +752,16 @@ async function breakAndOwnRestartLock( * by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to * avoid a permanent deadlock. * - * Ownership is tracked by a unique per-acquire token written into the lock. - * `release()` only unlinks the lock while our token is still on disk, and a - * break takes ownership atomically — so once another caller has broken and - * re-owned the lock, neither the timed-out original owner's `release()` nor a - * stale `break` can drop the new owner's lock and let a third restart interleave. + * Correctness rests on a single invariant: EVERY transition of the lock — taking + * a free lock, taking over a stale/timed-out one, and releasing — happens under + * the registry mutex. Because the check ("is the lock free / stale / fresh?") and + * the mutation that follows it both run while the mutex is held, they are atomic: + * no other acquirer, releaser, or breaker can slip in between. That is what makes + * takeover a true compare-and-swap rather than a content-blind clobber — a normal + * `open('wx')` acquirer cannot create a fresh lock in a gap, and the original + * owner's `release()` (also mutex-gated and token-checked) cannot drop a lock a + * breaker already took over. So no interleaving lets two restarts both own the + * lock and run concurrently. 
*/ export async function acquireRestartLock( mosaicHome: string, @@ -664,50 +769,67 @@ export async function acquireRestartLock( ): Promise { const token = randomUUID(); const lockPath = restartLockPath(mosaicHome); + const mutexPath = restartMutexPath(lockPath); await mkdir(dirname(lockPath), { recursive: true }); const release = async (): Promise => { - // Ownership-safe: only remove the lock if it is still ours. If another - // caller broke and re-owned it (after a stale/timeout break), the token no - // longer matches and we must leave their lock intact. - if ((await readRestartLockToken(lockPath)) !== token) { - return; - } + // Mutex-gated and token-gated: only remove the lock if it is still ours. If + // another caller took it over (after a stale/timeout break) the token no + // longer matches and we leave their lock intact. + const releaseMutex = await acquireRestartMutex(mutexPath, token); try { - await unlink(lockPath); - } catch { - // Raced away between the token check and unlink — nothing more to do. + if ((await readRestartLockToken(lockPath)) === token) { + await unlink(lockPath).catch(() => {}); + } + } finally { + await releaseMutex(); } }; const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS; for (;;) { + let owned = false; + const releaseMutex = await acquireRestartMutex(mutexPath, token); try { - const handle = await open(lockPath, 'wx'); - await handle.writeFile(formatRestartLockContent(token)); - await handle.close(); - return { release }; - } catch (err) { - if ((err as NodeJS.ErrnoException).code !== 'EEXIST') { - throw err; + // Read and (if appropriate) mutate the lock atomically under the mutex. + let current: string | null = null; + let absent = false; + try { + current = await readFile(lockPath, 'utf8'); + } catch (readErr) { + if ((readErr as NodeJS.ErrnoException).code === 'ENOENT') { + absent = true; + } else { + current = null; // Unreadable/corrupt: treat as stale. 
+ } } - // A restart is already in flight (or its lock was left behind). - const stale = await isRestartLockStale(lockPath, Date.now()); - const timedOut = Date.now() >= deadline; - if (stale || timedOut) { - if (await breakAndOwnRestartLock(lockPath, token, formatRestartLockContent(token))) { + const now = Date.now(); + if (absent) { + // Lock is free — take it. + await writeFile(lockPath, formatRestartLockContent(token)); + owned = true; + } else { + const stale = current === null || isRestartLockContentStale(current, now); + const timedOut = now >= deadline; + if (stale || timedOut) { process.stderr.write( stale - ? 'Breaking stale fleet restart lock and proceeding.\n' + ? 'Breaking stale fleet restart lock.\n' : `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` + - 'restart; breaking the lock and proceeding.\n', + 'restart; breaking the lock.\n', ); - return { release }; + // Takeover is just an overwrite — safe because we hold the mutex, so no + // acquirer or releaser can touch the lock between our read and this write. + await writeFile(lockPath, formatRestartLockContent(token)); + owned = true; } - // A concurrent breaker won the takeover; back off and re-evaluate. - await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS); - continue; + // else: a fresh restart owns it — wait below and re-evaluate. } - await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS); + } finally { + await releaseMutex(); } + if (owned) { + return { release }; + } + await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS); } } -- 2.49.1