fix(fleet): make restart-lock release/break ownership-safe (review #680)
Some checks failed
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was canceled

Addresses the reviewer's blocker (comment 15915): release() unconditionally
unlinked restart.lock, so after a stale/max-wait break an OLD owner could
delete a NEWER owner's lock, letting a third restart interleave and defeating
the guard.

- Each acquire writes a unique owner token (randomUUID) into the lock file.
- release() only unlinks while that token is still on disk; once another caller
  has broken and re-owned the lock, the timed-out original owner's release() is
  a no-op and leaves the new owner's lock intact.
- Breaking a stale/timed-out lock now takes ownership atomically via
  write-temp + rename (atomic replace) instead of a blind unlink-then-recreate;
  a breaker that loses a concurrent takeover reads back a foreign token and
  keeps waiting rather than assuming ownership.

Regression test "does not let a timed-out owner drop a lock another restart
broke and re-owned" reproduces the three-restart interleave: R1 hangs (stale),
R2 breaks + re-owns, R1.release() must NOT drop R2's lock. It fails on the old
blind-unlink path (ENOENT) and passes now. Also adds explicit single-agent
restart-path guard coverage (review should-fix).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jarvis
2026-06-24 17:19:31 -05:00
parent 9c2e4f0b2d
commit 43ad813e0d
2 changed files with 170 additions and 15 deletions

View File

@@ -4,6 +4,7 @@ import { dirname, join, resolve } from 'node:path';
import { Command } from 'commander';
import { afterEach, describe, expect, it, vi } from 'vitest';
import {
acquireRestartLock,
addAgentToRoster,
buildAgentSendCommand,
buildAgentWatchAttachCommand,
@@ -818,6 +819,93 @@ describe('fleet command construction', () => {
}
});
it('guards the single-agent restart path behind the in-flight restart lock', async () => {
  // Fixture: an isolated mosaic home whose roster declares a single codex agent.
  const mosaicHome = await tempDir();
  const rosterPath = join(mosaicHome, 'fleet', 'roster.yaml');
  await mkdir(join(mosaicHome, 'fleet'), { recursive: true });
  const rosterLines = [
    'version: 1',
    'transport: tmux',
    'agents:',
    ' - name: coder0',
    ' runtime: codex',
  ];
  await writeFile(rosterPath, rosterLines.join('\n'));

  // Simulate a full restart that is still in flight by planting a fresh lock
  // file before the single-agent restart is invoked.
  const heldLock = restartLockPath(mosaicHome);
  await mkdir(dirname(heldLock), { recursive: true });
  await writeFile(heldLock, `4242\n${Date.now()}\n`);

  // One shared timeline records both waits and runner invocations so their
  // relative order can be asserted afterwards.
  const timeline: string[] = [];
  let sleepCount = 0;

  const runner: CommandRunner = async (_command, args) => {
    // Only the final argument is recorded for each invocation.
    timeline.push(`run:${args[args.length - 1]}`);
    return { stdout: '', stderr: '', exitCode: 0 };
  };

  const sleepFn: SleepFn = async () => {
    sleepCount += 1;
    timeline.push(`sleep:${sleepCount}`);
    // Dropping the lock during the wait lets the waiting restart proceed.
    await rm(heldLock, { force: true });
  };

  const firstIndexOf = (prefix: string): number =>
    timeline.findIndex((entry) => entry.startsWith(prefix));

  const program = new Command();
  program.exitOverride();
  registerFleetCommand(program, { runner, sleepFn, mosaicHome });

  try {
    await program.parseAsync(['node', 'mosaic', 'fleet', 'restart', 'coder0']);

    // The single-agent restart must have waited at least once, and that wait
    // must precede the first runner invocation.
    expect(sleepCount).toBeGreaterThan(0);
    const firstSleepAt = firstIndexOf('sleep:');
    const firstRunAt = firstIndexOf('run:');
    expect(firstSleepAt).toBeGreaterThanOrEqual(0);
    expect(firstRunAt).toBeGreaterThan(firstSleepAt);

    // Only the named agent's unit is acted on; the holder unit never appears.
    expect(timeline).toContain('run:mosaic-agent@coder0.service');
    expect(timeline).not.toContain('run:mosaic-tmux-holder.service');
  } finally {
    await rm(mosaicHome, { recursive: true, force: true });
  }
});
it('does not let a timed-out owner drop a lock another restart broke and re-owned', async () => {
  // Regression test for the three-restart interleave: R1 hangs and goes stale,
  // R2 breaks the lock and re-owns it, then R1's late release() must leave
  // R2's lock in place.
  const home = await tempDir();
  // Cleanup lives in `finally` so a failing assertion does not leak the temp
  // directory (matches the try/finally used by the neighbouring restart test).
  try {
    const runDir = join(home, 'fleet', 'run');
    await mkdir(runDir, { recursive: true });
    const lockPath = restartLockPath(home);

    // Reads the third line of the lock file, where this test expects the
    // owner token; yields '' when that line is absent.
    const tokenOf = async (): Promise<string> => {
      const raw = await readFile(lockPath, 'utf8');
      return raw.split('\n')[2]?.trim() ?? '';
    };
    const sleepFn = vi.fn<SleepFn>(async () => {});

    // R1 acquires the lock and begins a restart that then hangs.
    const r1 = await acquireRestartLock(home, sleepFn);
    const tokenR1 = await tokenOf();
    expect(tokenR1).not.toBe('');

    // The hung R1 leaves a stale lock: rewrite its timestamp into the past while
    // preserving R1's token — exactly the on-disk state a stuck owner leaves.
    await writeFile(
      lockPath,
      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n${tokenR1}\n`,
    );

    // R2 re-enters, sees the stale lock, and atomically takes ownership.
    const r2 = await acquireRestartLock(home, sleepFn);
    const tokenR2 = await tokenOf();
    expect(tokenR2).not.toBe(tokenR1);
    // A stale lock is taken over immediately, without waiting.
    expect(sleepFn).not.toHaveBeenCalled();

    // R1 finally finishes and releases. It must NOT delete R2's lock — otherwise
    // a third restart (R3) could acquire and interleave with R2 still running.
    await r1.release();
    expect(await tokenOf()).toBe(tokenR2);

    // R2 releases cleanly and the lock is gone.
    await r2.release();
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
const home = await tempDir();
const rosterPath = join(home, 'fleet', 'roster.yaml');