fix(fleet): serialize restart-lock transitions to close concurrent-breaker race (review #680)
Stale/max-wait takeover was not safe against concurrent breakers: two
breakers could both judge the lock stale and both proceed, re-introducing
the tight loop. Neither POSIX nor Node offers a content- or inode-conditional
unlink or rename, so "judge stale, then replace" can never be made atomic
using path operations alone.
Serialize ALL lock transitions (acquire, release, takeover) under one
short-lived registry mutex held only across a few fs ops, never across the
restart itself. This makes check-then-mutate atomic, so exactly one breaker
can take over a stale lock while the others wait and re-evaluate.
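The mutex itself uses mtime-based staleness: open('wx') creates an empty
inode before the token is written, so a content check would reap a mutex that
is still being acquired. A mutex whose mtime ages past the stale threshold is
reaped under a separate reclaim lock that re-checks staleness before
unlinking. If writing the token fails, the acquirer closes and unlinks its own
mutex file, so a half-created mutex never leaks.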
Regression coverage at two widths: a 2-breaker barrier test (exactly one
takes over, the other waits) and the existing 3-breaker test (maxActive===1,
distinct tokens, final lock released).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -906,6 +906,139 @@ describe('fleet command construction', () => {
|
|||||||
await rm(home, { recursive: true, force: true });
|
await rm(home, { recursive: true, force: true });
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('lets only one of several concurrent breakers proceed past a stale lock', async () => {
|
||||||
|
const home = await tempDir();
|
||||||
|
const lockPath = restartLockPath(home);
|
||||||
|
await mkdir(dirname(lockPath), { recursive: true });
|
||||||
|
|
||||||
|
// A stale lock left by a crashed owner: every concurrent re-entrant restart
|
||||||
|
// will judge it stale and try to break it at the same instant. Breaking must
|
||||||
|
// NOT grant ownership — only the atomic re-create may — so exactly one
|
||||||
|
// contender can ever hold the lock at a time. (The v2 fix wrote our own token
|
||||||
|
// during the break and read it back, so two breakers each saw their own token
|
||||||
|
// and BOTH proceeded; this guards that regression.)
|
||||||
|
await writeFile(
|
||||||
|
lockPath,
|
||||||
|
`4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Yielding sleep so a waiting contender lets the current owner finish and
|
||||||
|
// release before it re-contends, instead of spinning the microtask queue.
|
||||||
|
const sleepFn: SleepFn = async () => {
|
||||||
|
await new Promise((res) => setTimeout(res, 0));
|
||||||
|
};
|
||||||
|
|
||||||
|
let active = 0;
|
||||||
|
let maxActive = 0;
|
||||||
|
const tokens: string[] = [];
|
||||||
|
const tokenOf = async (): Promise<string> => {
|
||||||
|
const raw = await readFile(lockPath, 'utf8');
|
||||||
|
return raw.split('\n')[2]?.trim() ?? '';
|
||||||
|
};
|
||||||
|
|
||||||
|
// One "restart" = acquire the lock, do work in the critical section, release.
|
||||||
|
const restartOnce = async (): Promise<void> => {
|
||||||
|
const guard = await acquireRestartLock(home, sleepFn);
|
||||||
|
active += 1;
|
||||||
|
maxActive = Math.max(maxActive, active);
|
||||||
|
// Record the token we own while we hold it, then yield to interleave with
|
||||||
|
// any other contender that might (wrongly) believe it owns the lock too.
|
||||||
|
tokens.push(await tokenOf());
|
||||||
|
await new Promise((res) => setTimeout(res, 0));
|
||||||
|
active -= 1;
|
||||||
|
await guard.release();
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Three breakers race the single stale lock simultaneously.
|
||||||
|
await Promise.all([restartOnce(), restartOnce(), restartOnce()]);
|
||||||
|
|
||||||
|
// Mutual exclusion held: never two owners at once despite concurrent breaks.
|
||||||
|
expect(maxActive).toBe(1);
|
||||||
|
// Each acquire owned with its own distinct token — no two ever shared it.
|
||||||
|
expect(new Set(tokens).size).toBe(3);
|
||||||
|
// The lock is fully released at the end.
|
||||||
|
await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
|
||||||
|
} finally {
|
||||||
|
await rm(home, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('lets exactly one of two breakers take over a stale lock while the other waits', async () => {
|
||||||
|
const home = await tempDir();
|
||||||
|
const lockPath = restartLockPath(home);
|
||||||
|
await mkdir(dirname(lockPath), { recursive: true });
|
||||||
|
|
||||||
|
// A single stale lock both contenders will judge stale at the same instant.
|
||||||
|
// Every transition runs under the registry mutex, so only one may take the
|
||||||
|
// lock over; the other must observe a now-fresh owner and WAIT/re-evaluate
|
||||||
|
// rather than also taking over. (A content-blind clobber let both believe
|
||||||
|
// they owned it — this asserts the mutex-gated CAS takeover instead.)
|
||||||
|
await writeFile(
|
||||||
|
lockPath,
|
||||||
|
`4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Barrier the winner holds against until the loser has observed the lock
|
||||||
|
// fresh and waited at least once — forcing the exact interleaving where one
|
||||||
|
// proceeds while the other waits, deterministically rather than by timing.
|
||||||
|
let resolveLoserWaited: () => void = () => {};
|
||||||
|
const loserWaited = new Promise<void>((res) => {
|
||||||
|
resolveLoserWaited = res;
|
||||||
|
});
|
||||||
|
let sleeps = 0;
|
||||||
|
const sleepFn: SleepFn = async () => {
|
||||||
|
sleeps += 1;
|
||||||
|
resolveLoserWaited();
|
||||||
|
await new Promise((res) => setTimeout(res, 0));
|
||||||
|
};
|
||||||
|
|
||||||
|
let active = 0;
|
||||||
|
let maxActive = 0;
|
||||||
|
const tokens: string[] = [];
|
||||||
|
const tokenOf = async (): Promise<string> => {
|
||||||
|
const raw = await readFile(lockPath, 'utf8');
|
||||||
|
return raw.split('\n')[2]?.trim() ?? '';
|
||||||
|
};
|
||||||
|
|
||||||
|
let firstOwner = true;
|
||||||
|
const restartOnce = async (): Promise<void> => {
|
||||||
|
const guard = await acquireRestartLock(home, sleepFn);
|
||||||
|
active += 1;
|
||||||
|
maxActive = Math.max(maxActive, active);
|
||||||
|
tokens.push(await tokenOf());
|
||||||
|
if (firstOwner) {
|
||||||
|
// Winner: keep holding the lock until the loser has waited once, so the
|
||||||
|
// loser is guaranteed to see a FRESH owner (not the stale one) and back
|
||||||
|
// off — proving it could not also take over.
|
||||||
|
firstOwner = false;
|
||||||
|
await loserWaited;
|
||||||
|
} else {
|
||||||
|
await new Promise((res) => setTimeout(res, 0));
|
||||||
|
}
|
||||||
|
active -= 1;
|
||||||
|
await guard.release();
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Exactly two breakers race the single stale lock.
|
||||||
|
await Promise.all([restartOnce(), restartOnce()]);
|
||||||
|
|
||||||
|
// Mutual exclusion: never two owners at once (if both took over the stale
|
||||||
|
// lock, this would be 2).
|
||||||
|
expect(maxActive).toBe(1);
|
||||||
|
// Both eventually owned, each with its own distinct token.
|
||||||
|
expect(new Set(tokens).size).toBe(2);
|
||||||
|
// The loser observed the winner's fresh lock and waited — it did NOT also
|
||||||
|
// take over the stale lock.
|
||||||
|
expect(sleeps).toBeGreaterThanOrEqual(1);
|
||||||
|
// The lock is fully released at the end.
|
||||||
|
await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
|
||||||
|
} finally {
|
||||||
|
await rm(home, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
|
it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
|
||||||
const home = await tempDir();
|
const home = await tempDir();
|
||||||
const rosterPath = join(home, 'fleet', 'roster.yaml');
|
const rosterPath = join(home, 'fleet', 'roster.yaml');
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import {
|
|||||||
mkdir,
|
mkdir,
|
||||||
open,
|
open,
|
||||||
readFile,
|
readFile,
|
||||||
rename,
|
stat,
|
||||||
unlink,
|
unlink,
|
||||||
writeFile,
|
writeFile,
|
||||||
} from 'node:fs/promises';
|
} from 'node:fs/promises';
|
||||||
@@ -594,21 +594,10 @@ async function readRestartLockToken(lockPath: string): Promise<string | null> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true when an existing lock file is stale: older than
|
* Returns true when a lock's contents are stale: older than RESTART_LOCK_STALE_MS,
|
||||||
* RESTART_LOCK_STALE_MS, or unreadable/unparseable (a corrupt or partially
|
* or unparseable (a corrupt or partially written lock left by a crashed owner).
|
||||||
* written lock left by a crashed owner). A vanished lock (ENOENT) is not stale —
|
|
||||||
* the next acquire attempt will simply succeed.
|
|
||||||
*/
|
*/
|
||||||
async function isRestartLockStale(lockPath: string, now: number): Promise<boolean> {
|
function isRestartLockContentStale(raw: string, now: number): boolean {
|
||||||
let raw: string;
|
|
||||||
try {
|
|
||||||
raw = await readFile(lockPath, 'utf8');
|
|
||||||
} catch (err) {
|
|
||||||
if ((err as NodeJS.ErrnoException).code === 'ENOENT') {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
const stampLine = raw.split('\n')[1] ?? '';
|
const stampLine = raw.split('\n')[1] ?? '';
|
||||||
const stamp = Number.parseInt(stampLine.trim(), 10);
|
const stamp = Number.parseInt(stampLine.trim(), 10);
|
||||||
if (!Number.isFinite(stamp)) {
|
if (!Number.isFinite(stamp)) {
|
||||||
@@ -618,28 +607,139 @@ async function isRestartLockStale(lockPath: string, now: number): Promise<boolea
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Atomically take over an existing (stale or timed-out) lock WITHOUT blind
|
* Path of the short-lived registry mutex that guards EVERY transition of the
|
||||||
* unlinking it: write our own token to a temp file and `rename()` it over the
|
* restart lock (acquire, release, takeover). Held only across a few filesystem
|
||||||
* lock. rename is atomic, so it replaces the prior owner's content in one step
|
* ops — never across the restart itself — so contention clears in microseconds.
|
||||||
* rather than the unsafe unlink-then-recreate (which a third restart could slip
|
|
||||||
* between). Returns true only if our token is the one on disk afterwards — if a
|
|
||||||
* concurrent breaker raced and won, we read back their token and return false so
|
|
||||||
* the caller keeps waiting instead of assuming ownership.
|
|
||||||
*/
|
*/
|
||||||
async function breakAndOwnRestartLock(
|
function restartMutexPath(lockPath: string): string {
|
||||||
lockPath: string,
|
return `${lockPath}.mutex`;
|
||||||
token: string,
|
}
|
||||||
content: string,
|
|
||||||
): Promise<boolean> {
|
/** Back-off (ms) between registry-mutex acquisition attempts; a holder keeps it for only a few fs ops. */
|
||||||
const tmpPath = `${lockPath}.${token}`;
|
const RESTART_MUTEX_RETRY_MS = 20;
|
||||||
await writeFile(tmpPath, content);
|
|
||||||
|
/**
|
||||||
|
* Staleness for the internal mutex / reclaim locks, judged by the file's mtime
|
||||||
|
* rather than its CONTENT. `open(path, 'wx')` creates the inode (with a fresh
|
||||||
|
* mtime) before any token/timestamp is written into it, so a content-based check
|
||||||
|
* would momentarily see that empty file as corrupt-and-stale and could reap a
|
||||||
|
* lock another contender is still acquiring. mtime is set atomically at creation,
|
||||||
|
* so a just-created lock always reads as live; only a lock whose holder died and
|
||||||
|
* stopped touching it ages past the threshold. These locks are never held across
|
||||||
|
* the restart itself (only a couple of filesystem ops), so any mtime this old can
|
||||||
|
* belong only to a dead holder.
|
||||||
|
*/
|
||||||
|
async function isRestartLockPathStale(path: string, now: number): Promise<boolean> {
|
||||||
try {
|
try {
|
||||||
await rename(tmpPath, lockPath);
|
const info = await stat(path);
|
||||||
|
return now - info.mtimeMs >= RESTART_LOCK_STALE_MS;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
await unlink(tmpPath).catch(() => {});
|
if ((err as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
|
return false; // Gone, not stale — the caller will re-contend.
|
||||||
|
}
|
||||||
|
return false; // Can't stat — treat as live and back off rather than reap.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Path of the reclaim lock that serializes reaping of a crashed-holder mutex. */
|
||||||
|
function restartReclaimPath(mutexPath: string): string {
|
||||||
|
return `${mutexPath}.reclaim`;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reap a registry mutex left behind by a process that CRASHED mid-transition —
|
||||||
|
* one whose file has aged past RESTART_LOCK_STALE_MS. Because the mutex is held
|
||||||
|
* only for a couple of filesystem ops (no sleeps, never across the restart), a
|
||||||
|
* mutex this old can only belong to a dead holder.
|
||||||
|
*
|
||||||
|
* The reap removes the dead mutex but never CREATES/holds it — acquisition stays
|
||||||
|
* the single `open('wx')` create in {@link acquireRestartMutex}, so exactly one
|
||||||
|
* contender wins ownership no matter how the reap and acquires interleave. The
|
||||||
|
* removal is made conditional by a dedicated reclaim lock: while it is held the
|
||||||
|
* dead mutex is stable (its dead holder will never touch it, and no other
|
||||||
|
* reclaimer can race), so re-reading it and removing it only if it is STILL stale
|
||||||
|
* is a true compare — a live holder's fresh mutex is never removed. This closes
|
||||||
|
* the reclaim race a content-blind rename-and-restore left open (a third
|
||||||
|
* contender slipping into the gap while a fresh mutex was moved aside).
|
||||||
|
*/
|
||||||
|
async function reclaimStaleRestartMutex(mutexPath: string): Promise<void> {
|
||||||
|
const reclaimPath = restartReclaimPath(mutexPath);
|
||||||
|
let handle: Awaited<ReturnType<typeof open>>;
|
||||||
|
try {
|
||||||
|
handle = await open(reclaimPath, 'wx');
|
||||||
|
} catch (err) {
|
||||||
|
if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
|
||||||
throw err;
|
throw err;
|
||||||
}
|
}
|
||||||
return (await readRestartLockToken(lockPath)) === token;
|
// Someone is already reclaiming. If their reclaim lock is itself stale by
|
||||||
|
// mtime, its holder crashed mid-reap (the lock spans only a stat + unlink,
|
||||||
|
// microseconds) — clear it so a later pass can retry. Otherwise a live
|
||||||
|
// reclaimer has it; back off. Either way we do not reap the mutex this pass.
|
||||||
|
if (await isRestartLockPathStale(reclaimPath, Date.now())) {
|
||||||
|
await unlink(reclaimPath).catch(() => {});
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
// Re-check the mutex UNDER the reclaim lock and remove it only if it is STILL
|
||||||
|
// stale by mtime. A live holder's mutex is fresh and is left untouched; a dead
|
||||||
|
// holder's mutex is stable here (its holder is gone and no other reclaimer can
|
||||||
|
// race us), so this re-check is authoritative.
|
||||||
|
if (await isRestartLockPathStale(mutexPath, Date.now())) {
|
||||||
|
await unlink(mutexPath).catch(() => {});
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
await handle.close();
|
||||||
|
await unlink(reclaimPath).catch(() => {});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Acquire the registry mutex, BLOCKING (with brief back-offs) until held, and
|
||||||
|
* return a token-gated release. This is the single point of mutual exclusion for
|
||||||
|
* the restart lock: acquire, release, and stale/timeout takeover all run under it,
|
||||||
|
* so "read the lock, then mutate it" is atomic — no acquirer, releaser, or breaker
|
||||||
|
* can ever interleave with another. A mutex left by a crashed holder is reclaimed
|
||||||
|
* once it ages past the stale threshold.
|
||||||
|
*/
|
||||||
|
async function acquireRestartMutex(
|
||||||
|
mutexPath: string,
|
||||||
|
token: string,
|
||||||
|
): Promise<RestartGuard['release']> {
|
||||||
|
for (;;) {
|
||||||
|
let handle: Awaited<ReturnType<typeof open>>;
|
||||||
|
try {
|
||||||
|
handle = await open(mutexPath, 'wx');
|
||||||
|
} catch (err) {
|
||||||
|
if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
// Staleness is judged by mtime, not content, so a mutex that exists but has
|
||||||
|
// not yet had its token written (the open-before-write window) reads as live
|
||||||
|
// and is never wrongly reaped.
|
||||||
|
if (!(await isRestartLockPathStale(mutexPath, Date.now()))) {
|
||||||
|
// A live holder has it — it will be gone in microseconds. Back off briefly.
|
||||||
|
await new Promise((resolve) => setTimeout(resolve, RESTART_MUTEX_RETRY_MS));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
await reclaimStaleRestartMutex(mutexPath);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// We created the mutex. Populate it with our token; if writing fails, clean up
|
||||||
|
// our own file so we never leak an empty mutex that a peer would have to reap.
|
||||||
|
try {
|
||||||
|
await handle.writeFile(formatRestartLockContent(token));
|
||||||
|
await handle.close();
|
||||||
|
} catch (err) {
|
||||||
|
await handle.close().catch(() => {});
|
||||||
|
await unlink(mutexPath).catch(() => {});
|
||||||
|
throw err;
|
||||||
|
}
|
||||||
|
return async (): Promise<void> => {
|
||||||
|
if ((await readRestartLockToken(mutexPath)) !== token) return;
|
||||||
|
await unlink(mutexPath).catch(() => {});
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -652,11 +752,16 @@ async function breakAndOwnRestartLock(
|
|||||||
* by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to
|
* by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to
|
||||||
* avoid a permanent deadlock.
|
* avoid a permanent deadlock.
|
||||||
*
|
*
|
||||||
* Ownership is tracked by a unique per-acquire token written into the lock.
|
* Correctness rests on a single invariant: EVERY transition of the lock — taking
|
||||||
* `release()` only unlinks the lock while our token is still on disk, and a
|
* a free lock, taking over a stale/timed-out one, and releasing — happens under
|
||||||
* break takes ownership atomically — so once another caller has broken and
|
* the registry mutex. Because the check ("is the lock free / stale / fresh?") and
|
||||||
* re-owned the lock, neither the timed-out original owner's `release()` nor a
|
* the mutation that follows it both run while the mutex is held, they are atomic:
|
||||||
* stale `break` can drop the new owner's lock and let a third restart interleave.
|
* no other acquirer, releaser, or breaker can slip in between. That is what makes
|
||||||
|
* takeover a true compare-and-swap rather than a content-blind clobber — a normal
|
||||||
|
* `open('wx')` acquirer cannot create a fresh lock in a gap, and the original
|
||||||
|
* owner's `release()` (also mutex-gated and token-checked) cannot drop a lock a
|
||||||
|
* breaker already took over. So no interleaving lets two restarts both own the
|
||||||
|
* lock and run concurrently.
|
||||||
*/
|
*/
|
||||||
export async function acquireRestartLock(
|
export async function acquireRestartLock(
|
||||||
mosaicHome: string,
|
mosaicHome: string,
|
||||||
@@ -664,50 +769,67 @@ export async function acquireRestartLock(
|
|||||||
): Promise<RestartGuard> {
|
): Promise<RestartGuard> {
|
||||||
const token = randomUUID();
|
const token = randomUUID();
|
||||||
const lockPath = restartLockPath(mosaicHome);
|
const lockPath = restartLockPath(mosaicHome);
|
||||||
|
const mutexPath = restartMutexPath(lockPath);
|
||||||
await mkdir(dirname(lockPath), { recursive: true });
|
await mkdir(dirname(lockPath), { recursive: true });
|
||||||
const release = async (): Promise<void> => {
|
const release = async (): Promise<void> => {
|
||||||
// Ownership-safe: only remove the lock if it is still ours. If another
|
// Mutex-gated and token-gated: only remove the lock if it is still ours. If
|
||||||
// caller broke and re-owned it (after a stale/timeout break), the token no
|
// another caller took it over (after a stale/timeout break) the token no
|
||||||
// longer matches and we must leave their lock intact.
|
// longer matches and we leave their lock intact.
|
||||||
if ((await readRestartLockToken(lockPath)) !== token) {
|
const releaseMutex = await acquireRestartMutex(mutexPath, token);
|
||||||
return;
|
|
||||||
}
|
|
||||||
try {
|
try {
|
||||||
await unlink(lockPath);
|
if ((await readRestartLockToken(lockPath)) === token) {
|
||||||
} catch {
|
await unlink(lockPath).catch(() => {});
|
||||||
// Raced away between the token check and unlink — nothing more to do.
|
}
|
||||||
|
} finally {
|
||||||
|
await releaseMutex();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS;
|
const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
let owned = false;
|
||||||
|
const releaseMutex = await acquireRestartMutex(mutexPath, token);
|
||||||
try {
|
try {
|
||||||
const handle = await open(lockPath, 'wx');
|
// Read and (if appropriate) mutate the lock atomically under the mutex.
|
||||||
await handle.writeFile(formatRestartLockContent(token));
|
let current: string | null = null;
|
||||||
await handle.close();
|
let absent = false;
|
||||||
return { release };
|
try {
|
||||||
} catch (err) {
|
current = await readFile(lockPath, 'utf8');
|
||||||
if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
|
} catch (readErr) {
|
||||||
throw err;
|
if ((readErr as NodeJS.ErrnoException).code === 'ENOENT') {
|
||||||
|
absent = true;
|
||||||
|
} else {
|
||||||
|
current = null; // Unreadable/corrupt: treat as stale.
|
||||||
}
|
}
|
||||||
// A restart is already in flight (or its lock was left behind).
|
}
|
||||||
const stale = await isRestartLockStale(lockPath, Date.now());
|
const now = Date.now();
|
||||||
const timedOut = Date.now() >= deadline;
|
if (absent) {
|
||||||
|
// Lock is free — take it.
|
||||||
|
await writeFile(lockPath, formatRestartLockContent(token));
|
||||||
|
owned = true;
|
||||||
|
} else {
|
||||||
|
const stale = current === null || isRestartLockContentStale(current, now);
|
||||||
|
const timedOut = now >= deadline;
|
||||||
if (stale || timedOut) {
|
if (stale || timedOut) {
|
||||||
if (await breakAndOwnRestartLock(lockPath, token, formatRestartLockContent(token))) {
|
|
||||||
process.stderr.write(
|
process.stderr.write(
|
||||||
stale
|
stale
|
||||||
? 'Breaking stale fleet restart lock and proceeding.\n'
|
? 'Breaking stale fleet restart lock.\n'
|
||||||
: `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` +
|
: `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` +
|
||||||
'restart; breaking the lock and proceeding.\n',
|
'restart; breaking the lock.\n',
|
||||||
);
|
);
|
||||||
|
// Takeover is just an overwrite — safe because we hold the mutex, so no
|
||||||
|
// acquirer or releaser can touch the lock between our read and this write.
|
||||||
|
await writeFile(lockPath, formatRestartLockContent(token));
|
||||||
|
owned = true;
|
||||||
|
}
|
||||||
|
// else: a fresh restart owns it — wait below and re-evaluate.
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
await releaseMutex();
|
||||||
|
}
|
||||||
|
if (owned) {
|
||||||
return { release };
|
return { release };
|
||||||
}
|
}
|
||||||
// A concurrent breaker won the takeover; back off and re-evaluate.
|
|
||||||
await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
|
await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
|
||||||
continue;
|
|
||||||
}
|
|
||||||
await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user