fix(fleet): guard mosaic fleet restart against tight-loop re-entry race (#680)
This commit was merged in pull request #680.
This commit is contained in:
@@ -4,6 +4,7 @@ import { dirname, join, resolve } from 'node:path';
|
||||
import { Command } from 'commander';
|
||||
import { afterEach, describe, expect, it, vi } from 'vitest';
|
||||
import {
|
||||
acquireRestartLock,
|
||||
addAgentToRoster,
|
||||
buildAgentSendCommand,
|
||||
buildAgentWatchAttachCommand,
|
||||
@@ -45,6 +46,8 @@ import {
|
||||
removeAgentFromRoster,
|
||||
resolveFleetPaths,
|
||||
resolvePresetFilename,
|
||||
restartLockPath,
|
||||
RESTART_LOCK_STALE_MS,
|
||||
RUNTIME_ACCEPTABLE_COMMANDS,
|
||||
serializeRosterToYaml,
|
||||
VERIFY_DEFAULT_TIMEOUT_MS,
|
||||
@@ -678,6 +681,364 @@ describe('fleet command construction', () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Re-entry guard: with a fresh (non-stale) restart lock already on disk, a new
// `fleet restart` must call the injected sleep at least once before it invokes
// the runner, then perform the full restart and leave no lock file behind.
it('waits for an in-flight restart to clear before relaunching (re-entry guard)', async () => {
  const home = await tempDir();
  const rosterPath = join(home, 'fleet', 'roster.yaml');
  await mkdir(join(home, 'fleet'), { recursive: true });
  await writeFile(
    rosterPath,
    ['version: 1', 'transport: tmux', 'agents:', ' - name: coder0', ' runtime: codex'].join(
      '\n',
    ),
  );

  // Simulate another `mosaic fleet restart` process mid-teardown: a fresh lock
  // (recent timestamp, so it is NOT treated as stale) already held.
  const lockPath = restartLockPath(home);
  await mkdir(dirname(lockPath), { recursive: true });
  await writeFile(lockPath, `4242\n${Date.now()}\n`);

  // Ordered log of sleeps and runner invocations, used to assert sequencing.
  const events: string[] = [];
  const runner: CommandRunner = async (_command, args) => {
    // Only the last argument (the unit name) is recorded.
    events.push(`run:${args[args.length - 1]}`);
    return { stdout: '', stderr: '', exitCode: 0 };
  };

  // The injected sleep stands in for time passing while we wait; the in-flight
  // restart "finishes" (releases its lock) after the first poll.
  let sleeps = 0;
  const sleepFn: SleepFn = async () => {
    sleeps += 1;
    events.push(`sleep:${sleeps}`);
    // Release the simulated foreign lock exactly once. Deleting on every sleep
    // could also remove a lock the command under test acquired afterwards,
    // which would let the final "lock is released" assertion pass vacuously.
    if (sleeps === 1) {
      await rm(lockPath, { force: true });
    }
  };

  const program = new Command();
  program.exitOverride();
  registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

  try {
    await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);

    // It must have waited at least once before issuing any systemctl restart.
    expect(sleeps).toBeGreaterThan(0);
    const firstSleep = events.findIndex((e) => e.startsWith('sleep:'));
    const firstRun = events.findIndex((e) => e.startsWith('run:'));
    expect(firstSleep).toBeGreaterThanOrEqual(0);
    expect(firstRun).toBeGreaterThan(firstSleep);

    // And it still performs the full restart once the lock clears.
    expect(events).toContain('run:mosaic-tmux-holder.service');
    expect(events).toContain('run:mosaic-agent@coder0.service');

    // The lock is released after the restart completes.
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
  } finally {
    // Always remove the temp home, even when an assertion above fails.
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
// A lock whose timestamp is older than RESTART_LOCK_STALE_MS must not delay the
// restart: the injected sleep is never called, both units are restarted in
// order, and no lock file remains afterwards.
it('breaks a stale restart lock and proceeds without waiting', async () => {
  const home = await tempDir();
  const fleetDir = join(home, 'fleet');
  await mkdir(fleetDir, { recursive: true });
  const rosterLines = [
    'version: 1',
    'transport: tmux',
    'agents:',
    ' - name: coder0',
    ' runtime: codex',
  ];
  await writeFile(join(home, 'fleet', 'roster.yaml'), rosterLines.join('\n'));

  // Plant the leftover lock of a crashed owner, dated just past the stale window.
  const lockPath = restartLockPath(home);
  await mkdir(dirname(lockPath), { recursive: true });
  await writeFile(lockPath, `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n`);

  // Capture every command line handed to the runner, argv-style.
  const invocations: string[][] = [];
  const runner: CommandRunner = async (command, args) => {
    invocations.push([command, ...args]);
    return { stdout: '', stderr: '', exitCode: 0 };
  };
  const sleepFn = vi.fn<SleepFn>(async () => {});

  const program = new Command();
  program.exitOverride();
  registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

  try {
    await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);

    // No polling happened: the stale lock was broken straight away.
    expect(sleepFn).not.toHaveBeenCalled();

    const expectedInvocations = [
      ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
      ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
    ];
    expect(invocations).toEqual(expectedInvocations);

    // Once the restart has finished the lock file no longer exists.
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
// Two back-to-back restarts through the same program: if the first one failed to
// release its lock, the second would have to sleep. Asserts no sleep occurred and
// that both restarts issued the full holder + agent command sequence.
it('releases the restart lock so a subsequent restart is not blocked', async () => {
  const home = await tempDir();
  const fleetDir = join(home, 'fleet');
  await mkdir(fleetDir, { recursive: true });
  const rosterLines = [
    'version: 1',
    'transport: tmux',
    'agents:',
    ' - name: coder0',
    ' runtime: codex',
  ];
  await writeFile(join(home, 'fleet', 'roster.yaml'), rosterLines.join('\n'));

  // Capture every command line handed to the runner, argv-style.
  const invocations: string[][] = [];
  const runner: CommandRunner = async (command, args) => {
    invocations.push([command, ...args]);
    return { stdout: '', stderr: '', exitCode: 0 };
  };
  const sleepFn = vi.fn<SleepFn>(async () => {});

  const program = new Command();
  program.exitOverride();
  registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

  try {
    // Run the restart twice, strictly one after the other.
    for (let attempt = 0; attempt < 2; attempt += 1) {
      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);
    }

    // Neither restart had to wait on a lock.
    expect(sleepFn).not.toHaveBeenCalled();

    // Each restart contributes the same two commands, in the same order.
    const singleRestart = [
      ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
      ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
    ];
    expect(invocations).toEqual([...singleRestart, ...singleRestart]);
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
// `fleet restart coder0` with a fresh restart lock already on disk: the command
// must sleep before its first runner call, and must restart only the named
// agent's unit, never the tmux holder.
it('guards the single-agent restart path behind the in-flight restart lock', async () => {
  const home = await tempDir();
  const fleetDir = join(home, 'fleet');
  await mkdir(fleetDir, { recursive: true });
  const rosterLines = [
    'version: 1',
    'transport: tmux',
    'agents:',
    ' - name: coder0',
    ' runtime: codex',
  ];
  await writeFile(join(home, 'fleet', 'roster.yaml'), rosterLines.join('\n'));

  // Pretend a full restart is in progress by planting a lock with a current timestamp.
  const lockPath = restartLockPath(home);
  await mkdir(dirname(lockPath), { recursive: true });
  await writeFile(lockPath, `4242\n${Date.now()}\n`);

  // Ordered log of sleeps and runner invocations, used to assert sequencing.
  const timeline: string[] = [];
  const runner: CommandRunner = async (command, args) => {
    const unit = args[args.length - 1];
    timeline.push(`run:${unit}`);
    return { stdout: '', stderr: '', exitCode: 0 };
  };

  // Each injected sleep is logged and then clears the planted lock file.
  let sleeps = 0;
  const sleepFn: SleepFn = async () => {
    sleeps += 1;
    timeline.push(`sleep:${sleeps}`);
    await rm(lockPath, { force: true });
  };

  const program = new Command();
  program.exitOverride();
  registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

  try {
    await program.parseAsync(['node', 'mosaic', 'fleet', 'restart', 'coder0']);

    // At least one wait happened, and it preceded the first runner call.
    expect(sleeps).toBeGreaterThan(0);
    const sleepIndex = timeline.findIndex((entry) => entry.startsWith('sleep:'));
    const runIndex = timeline.findIndex((entry) => entry.startsWith('run:'));
    expect(sleepIndex).toBeGreaterThanOrEqual(0);
    expect(runIndex).toBeGreaterThan(sleepIndex);

    // The agent unit was restarted; the holder unit was left alone.
    expect(timeline).toContain('run:mosaic-agent@coder0.service');
    expect(timeline).not.toContain('run:mosaic-tmux-holder.service');
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
// Ownership check on release: after R1's lock goes stale and R2 takes it over,
// R1's late release() must leave R2's lock file (identified by its token line)
// untouched; only R2's own release() removes it.
it('does not let a timed-out owner drop a lock another restart broke and re-owned', async () => {
  const home = await tempDir();
  // Cleanup lives in `finally` so a failed assertion does not leak the temp
  // directory, matching the other restart-lock tests in this file.
  try {
    const runDir = join(home, 'fleet', 'run');
    await mkdir(runDir, { recursive: true });
    const lockPath = restartLockPath(home);
    // The owner token is read from the third line of the lock file; an absent
    // line yields ''.
    const tokenOf = async (): Promise<string> => {
      const raw = await readFile(lockPath, 'utf8');
      return raw.split('\n')[2]?.trim() ?? '';
    };
    const sleepFn = vi.fn<SleepFn>(async () => {});

    // R1 acquires the lock and begins a restart that then hangs.
    const r1 = await acquireRestartLock(home, sleepFn);
    const tokenR1 = await tokenOf();
    expect(tokenR1).not.toBe('');

    // The hung R1 leaves a stale lock: rewrite its timestamp into the past while
    // preserving R1's token — exactly the on-disk state a stuck owner leaves.
    await writeFile(
      lockPath,
      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n${tokenR1}\n`,
    );

    // R2 re-enters, sees the stale lock, and atomically takes ownership.
    const r2 = await acquireRestartLock(home, sleepFn);
    const tokenR2 = await tokenOf();
    expect(tokenR2).not.toBe(tokenR1);
    expect(sleepFn).not.toHaveBeenCalled();

    // R1 finally finishes and releases. It must NOT delete R2's lock — otherwise
    // a third restart (R3) could acquire and interleave with R2 still running.
    await r1.release();
    expect(await tokenOf()).toBe(tokenR2);

    // R2 releases cleanly and the lock is gone.
    await r2.release();
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
// Three concurrent acquirers race one stale lock. Asserts that at most one of
// them is ever inside the critical section, that each observed a different owner
// token while holding the lock, and that no lock file is left at the end.
it('lets only one of several concurrent breakers proceed past a stale lock', async () => {
  const home = await tempDir();
  const lockPath = restartLockPath(home);
  await mkdir(dirname(lockPath), { recursive: true });

  // Seed a lock that every contender will consider stale at the same moment.
  // Breaking a stale lock must not by itself confer ownership — only the atomic
  // re-create may — otherwise several breakers could each read back their own
  // token and all proceed (the regression this test pins, per the v2 fix notes).
  await writeFile(
    lockPath,
    `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
  );

  // Resolves on a zero-delay timer, i.e. yields to the event loop once.
  const yieldOnce = (): Promise<unknown> => new Promise((resolve) => setTimeout(resolve, 0));

  // A sleep that really yields, so a waiting contender lets the current owner
  // finish and release instead of spinning on the microtask queue.
  const sleepFn: SleepFn = async () => {
    await yieldOnce();
  };

  let inFlight = 0;
  let peakInFlight = 0;
  const seenTokens: string[] = [];
  // The owner token is the third line of the lock file ('' if that line is absent).
  const readToken = async (): Promise<string> => {
    const [, , token] = (await readFile(lockPath, 'utf8')).split('\n');
    return token?.trim() ?? '';
  };

  // One simulated restart: take the lock, work inside it, give it back.
  const restartOnce = async (): Promise<void> => {
    const guard = await acquireRestartLock(home, sleepFn);
    inFlight += 1;
    if (inFlight > peakInFlight) {
      peakInFlight = inFlight;
    }
    // Note which token is on disk while we hold the lock, then yield so any
    // contender that wrongly thinks it also owns the lock can interleave.
    const heldToken = await readToken();
    seenTokens.push(heldToken);
    await yieldOnce();
    inFlight -= 1;
    await guard.release();
  };

  try {
    // Start all three contenders before awaiting any of them.
    const contenders = [restartOnce(), restartOnce(), restartOnce()];
    await Promise.all(contenders);

    // Never more than one owner at a time.
    expect(peakInFlight).toBe(1);
    // Three acquisitions, three different tokens.
    expect(new Set(seenTokens).size).toBe(3);
    // Nothing is left holding the lock.
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
// Two concurrent acquirers race one stale lock. The first to get in holds the
// lock until the injected sleep has been called at least once, which forces the
// other contender to wait on a fresh lock rather than also taking over.
it('lets exactly one of two breakers take over a stale lock while the other waits', async () => {
  const home = await tempDir();
  const lockPath = restartLockPath(home);
  await mkdir(dirname(lockPath), { recursive: true });

  // Seed one lock that both contenders will judge stale at the same moment.
  // Only one may take it over; the other has to see a now-fresh owner and
  // wait/re-evaluate. (A content-blind clobber previously let both believe they
  // owned it; this pins the mutex-gated CAS takeover described by the fix.)
  await writeFile(
    lockPath,
    `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
  );

  // Resolves on a zero-delay timer, i.e. yields to the event loop once.
  const yieldOnce = (): Promise<unknown> => new Promise((resolve) => setTimeout(resolve, 0));

  // Barrier: settled the first time anyone sleeps. The winner parks on it so the
  // "one proceeds, one waits" interleaving is deterministic rather than timed.
  let resolveLoserWaited: () => void = () => {};
  const loserWaited = new Promise<void>((resolve) => {
    resolveLoserWaited = resolve;
  });
  let sleeps = 0;
  const sleepFn: SleepFn = async () => {
    sleeps += 1;
    resolveLoserWaited();
    await yieldOnce();
  };

  let inFlight = 0;
  let peakInFlight = 0;
  const seenTokens: string[] = [];
  // The owner token is the third line of the lock file ('' if that line is absent).
  const readToken = async (): Promise<string> => {
    const [, , token] = (await readFile(lockPath, 'utf8')).split('\n');
    return token?.trim() ?? '';
  };

  let firstOwner = true;
  const restartOnce = async (): Promise<void> => {
    const guard = await acquireRestartLock(home, sleepFn);
    inFlight += 1;
    if (inFlight > peakInFlight) {
      peakInFlight = inFlight;
    }
    const heldToken = await readToken();
    seenTokens.push(heldToken);

    // Whoever gets here first is the winner; everyone after is not.
    const isWinner = firstOwner;
    firstOwner = false;
    if (isWinner) {
      // Keep holding until the loser has slept once, guaranteeing the loser saw
      // a FRESH owner and backed off instead of also taking over.
      await loserWaited;
    } else {
      await yieldOnce();
    }

    inFlight -= 1;
    await guard.release();
  };

  try {
    // Start both contenders before awaiting either.
    const contenders = [restartOnce(), restartOnce()];
    await Promise.all(contenders);

    // Never two owners at once (a double takeover would make this 2).
    expect(peakInFlight).toBe(1);
    // Both did eventually own the lock, each under a different token.
    expect(new Set(seenTokens).size).toBe(2);
    // The loser waited on the winner's fresh lock at least once.
    expect(sleeps).toBeGreaterThanOrEqual(1);
    // Nothing is left holding the lock.
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
  } finally {
    await rm(home, { recursive: true, force: true });
  }
});
|
||||
|
||||
it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
|
||||
const home = await tempDir();
|
||||
const rosterPath = join(home, 'fleet', 'roster.yaml');
|
||||
|
||||
Reference in New Issue
Block a user