fix(fleet): serialize restart-lock transitions to close concurrent-breaker race (review #680)

Stale/max-wait takeover was not safe against concurrent breakers: two breakers could both judge the lock stale and both proceed, re-introducing the tight loop. Neither POSIX nor Node offers a content- or inode-conditional unlink or rename, so "judge stale, then replace" can never be atomic with pure path ops. Serialize ALL lock transitions (acquire, release, takeover) under one short-lived registry mutex that is held only across a few fs ops, never across the restart itself. This makes check-then-mutate atomic, so exactly one breaker can take over a stale lock while the others wait and re-evaluate. The mutex itself uses mtime-based staleness (open('wx') creates an empty inode before the token is written, so a content check could reap a mutex that is still being acquired). Mutex acquisition either writes its token or, if that write fails, removes the file it just created, so a half-created mutex never leaks. Regression coverage at two widths: a 2-breaker barrier test (exactly one takes over, the other waits) and the existing 3-breaker test (maxActive===1, distinct tokens, final lock released). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 20:26:39 -05:00
2 changed files with 321 additions and 66 deletions
--- a/packages/mosaic/src/commands/fleet.spec.ts
+++ b/packages/mosaic/src/commands/fleet.spec.ts
@@ -906,6 +906,139 @@ describe('fleet command construction', () => {
    await rm(home, { recursive: true, force: true });
  });

+  it('lets only one of several concurrent breakers proceed past a stale lock', async () => {
+    const home = await tempDir();
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+
+    // A stale lock left by a crashed owner: every concurrent re-entrant restart
+    // will judge it stale and try to break it at the same instant. Breaking must
+    // NOT grant ownership — only the atomic re-create may — so exactly one
+    // contender can ever hold the lock at a time. (The v2 fix wrote our own token
+    // during the break and read it back, so two breakers each saw their own token
+    // and BOTH proceeded; this guards that regression.)
+    await writeFile(
+      lockPath,
+      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
+    );
+
+    // Yielding sleep so a waiting contender lets the current owner finish and
+    // release before it re-contends, instead of spinning the microtask queue.
+    const sleepFn: SleepFn = async () => {
+      await new Promise((res) => setTimeout(res, 0));
+    };
+
+    let active = 0;
+    let maxActive = 0;
+    const tokens: string[] = [];
+    const tokenOf = async (): Promise<string> => {
+      const raw = await readFile(lockPath, 'utf8');
+      return raw.split('\n')[2]?.trim() ?? '';
+    };
+
+    // One "restart" = acquire the lock, do work in the critical section, release.
+    const restartOnce = async (): Promise<void> => {
+      const guard = await acquireRestartLock(home, sleepFn);
+      active += 1;
+      maxActive = Math.max(maxActive, active);
+      // Record the token we own while we hold it, then yield to interleave with
+      // any other contender that might (wrongly) believe it owns the lock too.
+      tokens.push(await tokenOf());
+      await new Promise((res) => setTimeout(res, 0));
+      active -= 1;
+      await guard.release();
+    };
+
+    try {
+      // Three breakers race the single stale lock simultaneously.
+      await Promise.all([restartOnce(), restartOnce(), restartOnce()]);
+
+      // Mutual exclusion held: never two owners at once despite concurrent breaks.
+      expect(maxActive).toBe(1);
+      // Each acquire owned with its own distinct token — no two ever shared it.
+      expect(new Set(tokens).size).toBe(3);
+      // The lock is fully released at the end.
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('lets exactly one of two breakers take over a stale lock while the other waits', async () => {
+    const home = await tempDir();
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+
+    // A single stale lock both contenders will judge stale at the same instant.
+    // Every transition runs under the registry mutex, so only one may take the
+    // lock over; the other must observe a now-fresh owner and WAIT/re-evaluate
+    // rather than also taking over. (A content-blind clobber let both believe
+    // they owned it — this asserts the mutex-gated CAS takeover instead.)
+    await writeFile(
+      lockPath,
+      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
+    );
+
+    // Barrier the winner holds against until the loser has observed the lock
+    // fresh and waited at least once — forcing the exact interleaving where one
+    // proceeds while the other waits, deterministically rather than by timing.
+    let resolveLoserWaited: () => void = () => {};
+    const loserWaited = new Promise<void>((res) => {
+      resolveLoserWaited = res;
+    });
+    let sleeps = 0;
+    const sleepFn: SleepFn = async () => {
+      sleeps += 1;
+      resolveLoserWaited();
+      await new Promise((res) => setTimeout(res, 0));
+    };
+
+    let active = 0;
+    let maxActive = 0;
+    const tokens: string[] = [];
+    const tokenOf = async (): Promise<string> => {
+      const raw = await readFile(lockPath, 'utf8');
+      return raw.split('\n')[2]?.trim() ?? '';
+    };
+
+    let firstOwner = true;
+    const restartOnce = async (): Promise<void> => {
+      const guard = await acquireRestartLock(home, sleepFn);
+      active += 1;
+      maxActive = Math.max(maxActive, active);
+      tokens.push(await tokenOf());
+      if (firstOwner) {
+        // Winner: keep holding the lock until the loser has waited once, so the
+        // loser is guaranteed to see a FRESH owner (not the stale one) and back
+        // off — proving it could not also take over.
+        firstOwner = false;
+        await loserWaited;
+      } else {
+        await new Promise((res) => setTimeout(res, 0));
+      }
+      active -= 1;
+      await guard.release();
+    };
+
+    try {
+      // Exactly two breakers race the single stale lock.
+      await Promise.all([restartOnce(), restartOnce()]);
+
+      // Mutual exclusion: never two owners at once (if both took over the stale
+      // lock, this would be 2).
+      expect(maxActive).toBe(1);
+      // Both eventually owned, each with its own distinct token.
+      expect(new Set(tokens).size).toBe(2);
+      // The loser observed the winner's fresh lock and waited — it did NOT also
+      // take over the stale lock.
+      expect(sleeps).toBeGreaterThanOrEqual(1);
+      // The lock is fully released at the end.
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
  it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
    const home = await tempDir();
    const rosterPath = join(home, 'fleet', 'roster.yaml');
--- a/packages/mosaic/src/commands/fleet.ts
+++ b/packages/mosaic/src/commands/fleet.ts
@@ -6,7 +6,7 @@ import {
  mkdir,
  open,
  readFile,
-  rename,
+  stat,
  unlink,
  writeFile,
 } from 'node:fs/promises';
@@ -594,21 +594,10 @@ async function readRestartLockToken(lockPath: string): Promise<string | null> {
 }

 /**
- * Returns true when an existing lock file is stale: older than
- * RESTART_LOCK_STALE_MS, or unreadable/unparseable (a corrupt or partially
- * written lock left by a crashed owner). A vanished lock (ENOENT) is not stale —
- * the next acquire attempt will simply succeed.
+ * Returns true when a lock's contents are stale: older than RESTART_LOCK_STALE_MS,
+ * or unparseable (a corrupt or partially written lock left by a crashed owner).
 */
-async function isRestartLockStale(lockPath: string, now: number): Promise<boolean> {
-  let raw: string;
-  try {
-    raw = await readFile(lockPath, 'utf8');
-  } catch (err) {
-    if ((err as NodeJS.ErrnoException).code === 'ENOENT') {
-      return false;
-    }
-    return true;
-  }
+function isRestartLockContentStale(raw: string, now: number): boolean {
  const stampLine = raw.split('\n')[1] ?? '';
  const stamp = Number.parseInt(stampLine.trim(), 10);
  if (!Number.isFinite(stamp)) {
@@ -618,28 +607,139 @@ async function isRestartLockStale(lockPath: string, now: number): Promise<boolea
 }

 /**
- * Atomically take over an existing (stale or timed-out) lock WITHOUT blind
- * unlinking it: write our own token to a temp file and `rename()` it over the
- * lock. rename is atomic, so it replaces the prior owner's content in one step
- * rather than the unsafe unlink-then-recreate (which a third restart could slip
- * between). Returns true only if our token is the one on disk afterwards — if a
- * concurrent breaker raced and won, we read back their token and return false so
- * the caller keeps waiting instead of assuming ownership.
+ * Path of the short-lived registry mutex that guards EVERY transition of the
+ * restart lock (acquire, release, takeover). Held only across a few filesystem
+ * ops — never across the restart itself — so contention clears in microseconds.
 */
-async function breakAndOwnRestartLock(
-  lockPath: string,
-  token: string,
-  content: string,
-): Promise<boolean> {
-  const tmpPath = `${lockPath}.${token}`;
-  await writeFile(tmpPath, content);
+function restartMutexPath(lockPath: string): string {
+  return `${lockPath}.mutex`;
+}
+
+/** Brief back-off between registry-mutex acquisition attempts (held microseconds). */
+const RESTART_MUTEX_RETRY_MS = 20;
+
+/**
+ * Staleness for the internal mutex / reclaim locks, judged by the file's mtime
+ * rather than its CONTENT. `open(path, 'wx')` creates the inode (with a fresh
+ * mtime) before any token/timestamp is written into it, so a content-based check
+ * would momentarily see that empty file as corrupt-and-stale and could reap a
+ * lock another contender is still acquiring. mtime is set atomically at creation,
+ * so a just-created lock always reads as live; only a lock whose holder died and
+ * stopped touching it ages past the threshold. These locks are never held across
+ * the restart itself (only a couple of filesystem ops), so any mtime this old can
+ * belong only to a dead holder.
+ */
+async function isRestartLockPathStale(path: string, now: number): Promise<boolean> {
  try {
-    await rename(tmpPath, lockPath);
+    const info = await stat(path);
+    return now - info.mtimeMs >= RESTART_LOCK_STALE_MS;
  } catch (err) {
-    await unlink(tmpPath).catch(() => {});
-    throw err;
+    if ((err as NodeJS.ErrnoException).code === 'ENOENT') {
+      return false; // Gone, not stale — the caller will re-contend.
+    }
+    return false; // Can't stat — treat as live and back off rather than reap.
+  }
+}
+
+/** Path of the reclaim lock that serializes reaping of a crashed-holder mutex. */
+function restartReclaimPath(mutexPath: string): string {
+  return `${mutexPath}.reclaim`;
+}
+
+/**
+ * Reap a registry mutex left behind by a process that CRASHED mid-transition —
+ * one whose file has aged past RESTART_LOCK_STALE_MS. Because the mutex is held
+ * only for a couple of filesystem ops (no sleeps, never across the restart), a
+ * mutex this old can only belong to a dead holder.
+ *
+ * The reap removes the dead mutex but never CREATES/holds it — acquisition stays
+ * the single `open('wx')` create in {@link acquireRestartMutex}, so exactly one
+ * contender wins ownership no matter how the reap and acquires interleave. The
+ * removal is made conditional by a dedicated reclaim lock: while it is held the
+ * dead mutex is stable (its dead holder will never touch it, and no other
+ * reclaimer can race), so re-reading it and removing it only if it is STILL stale
+ * is a true compare — a live holder's fresh mutex is never removed. This closes
+ * the reclaim race a content-blind rename-and-restore left open (a third
+ * contender slipping into the gap while a fresh mutex was moved aside).
+ */
+async function reclaimStaleRestartMutex(mutexPath: string): Promise<void> {
+  const reclaimPath = restartReclaimPath(mutexPath);
+  let handle: Awaited<ReturnType<typeof open>>;
+  try {
+    handle = await open(reclaimPath, 'wx');
+  } catch (err) {
+    if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
+      throw err;
+    }
+    // Someone is already reclaiming. If their reclaim lock is itself stale by
+    // mtime, its holder crashed mid-reap (the lock spans only a stat + unlink,
+    // microseconds) — clear it so a later pass can retry. Otherwise a live
+    // reclaimer has it; back off. Either way we do not reap the mutex this pass.
+    if (await isRestartLockPathStale(reclaimPath, Date.now())) {
+      await unlink(reclaimPath).catch(() => {});
+    }
+    return;
+  }
+  try {
+    // Re-check the mutex UNDER the reclaim lock and remove it only if it is STILL
+    // stale by mtime. A live holder's mutex is fresh and is left untouched; a dead
+    // holder's mutex is stable here (its holder is gone and no other reclaimer can
+    // race us), so this re-check is authoritative.
+    if (await isRestartLockPathStale(mutexPath, Date.now())) {
+      await unlink(mutexPath).catch(() => {});
+    }
+  } finally {
+    await handle.close();
+    await unlink(reclaimPath).catch(() => {});
+  }
+}
+
+/**
+ * Acquire the registry mutex, BLOCKING (with brief back-offs) until held, and
+ * return a token-gated release. This is the single point of mutual exclusion for
+ * the restart lock: acquire, release, and stale/timeout takeover all run under it,
+ * so "read the lock, then mutate it" is atomic — no acquirer, releaser, or breaker
+ * can ever interleave with another. A mutex left by a crashed holder is reclaimed
+ * once it ages past the stale threshold.
+ */
+async function acquireRestartMutex(
+  mutexPath: string,
+  token: string,
+): Promise<RestartGuard['release']> {
+  for (;;) {
+    let handle: Awaited<ReturnType<typeof open>>;
+    try {
+      handle = await open(mutexPath, 'wx');
+    } catch (err) {
+      if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
+        throw err;
+      }
+      // Staleness is judged by mtime, not content, so a mutex that exists but has
+      // not yet had its token written (the open-before-write window) reads as live
+      // and is never wrongly reaped.
+      if (!(await isRestartLockPathStale(mutexPath, Date.now()))) {
+        // A live holder has it — it will be gone in microseconds. Back off briefly.
+        await new Promise((resolve) => setTimeout(resolve, RESTART_MUTEX_RETRY_MS));
+        continue;
+      }
+      await reclaimStaleRestartMutex(mutexPath);
+      continue;
+    }
+    // We created the mutex. Populate it with our token; if writing fails, clean up
+    // our own file so we never leak an empty mutex that a peer would have to reap.
+    try {
+      await handle.writeFile(formatRestartLockContent(token));
+      await handle.close();
+    } catch (err) {
+      await handle.close().catch(() => {});
+      await unlink(mutexPath).catch(() => {});
+      throw err;
+    }
+    return async (): Promise<void> => {
+      if ((await readRestartLockToken(mutexPath)) !== token) return;
+      await unlink(mutexPath).catch(() => {});
+    };
  }
-  return (await readRestartLockToken(lockPath)) === token;
 }

 /**
@@ -652,11 +752,16 @@ async function breakAndOwnRestartLock(
 * by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to
 * avoid a permanent deadlock.
 *
- * Ownership is tracked by a unique per-acquire token written into the lock.
- * `release()` only unlinks the lock while our token is still on disk, and a
- * break takes ownership atomically — so once another caller has broken and
- * re-owned the lock, neither the timed-out original owner's `release()` nor a
- * stale `break` can drop the new owner's lock and let a third restart interleave.
+ * Correctness rests on a single invariant: EVERY transition of the lock — taking
+ * a free lock, taking over a stale/timed-out one, and releasing — happens under
+ * the registry mutex. Because the check ("is the lock free / stale / fresh?") and
+ * the mutation that follows it both run while the mutex is held, they are atomic:
+ * no other acquirer, releaser, or breaker can slip in between. That is what makes
+ * takeover a true compare-and-swap rather than a content-blind clobber — a normal
+ * `open('wx')` acquirer cannot create a fresh lock in a gap, and the original
+ * owner's `release()` (also mutex-gated and token-checked) cannot drop a lock a
+ * breaker already took over. So no interleaving lets two restarts both own the
+ * lock and run concurrently.
 */
 export async function acquireRestartLock(
  mosaicHome: string,
@@ -664,50 +769,67 @@ export async function acquireRestartLock(
 ): Promise<RestartGuard> {
  const token = randomUUID();
  const lockPath = restartLockPath(mosaicHome);
+  const mutexPath = restartMutexPath(lockPath);
  await mkdir(dirname(lockPath), { recursive: true });
  const release = async (): Promise<void> => {
-    // Ownership-safe: only remove the lock if it is still ours. If another
-    // caller broke and re-owned it (after a stale/timeout break), the token no
-    // longer matches and we must leave their lock intact.
-    if ((await readRestartLockToken(lockPath)) !== token) {
-      return;
-    }
+    // Mutex-gated and token-gated: only remove the lock if it is still ours. If
+    // another caller took it over (after a stale/timeout break) the token no
+    // longer matches and we leave their lock intact.
+    const releaseMutex = await acquireRestartMutex(mutexPath, token);
    try {
-      await unlink(lockPath);
-    } catch {
-      // Raced away between the token check and unlink — nothing more to do.
+      if ((await readRestartLockToken(lockPath)) === token) {
+        await unlink(lockPath).catch(() => {});
+      }
+    } finally {
+      await releaseMutex();
    }
  };
  const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS;
  for (;;) {
+    let owned = false;
+    const releaseMutex = await acquireRestartMutex(mutexPath, token);
    try {
-      const handle = await open(lockPath, 'wx');
-      await handle.writeFile(formatRestartLockContent(token));
-      await handle.close();
-      return { release };
-    } catch (err) {
-      if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
-        throw err;
+      // Read and (if appropriate) mutate the lock atomically under the mutex.
+      let current: string | null = null;
+      let absent = false;
+      try {
+        current = await readFile(lockPath, 'utf8');
+      } catch (readErr) {
+        if ((readErr as NodeJS.ErrnoException).code === 'ENOENT') {
+          absent = true;
+        } else {
+          current = null; // Unreadable/corrupt: treat as stale.
+        }
      }
-      // A restart is already in flight (or its lock was left behind).
-      const stale = await isRestartLockStale(lockPath, Date.now());
-      const timedOut = Date.now() >= deadline;
-      if (stale || timedOut) {
-        if (await breakAndOwnRestartLock(lockPath, token, formatRestartLockContent(token))) {
+      const now = Date.now();
+      if (absent) {
+        // Lock is free — take it.
+        await writeFile(lockPath, formatRestartLockContent(token));
+        owned = true;
+      } else {
+        const stale = current === null || isRestartLockContentStale(current, now);
+        const timedOut = now >= deadline;
+        if (stale || timedOut) {
          process.stderr.write(
            stale
-              ? 'Breaking stale fleet restart lock and proceeding.\n'
+              ? 'Breaking stale fleet restart lock.\n'
              : `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` +
-                  'restart; breaking the lock and proceeding.\n',
+                  'restart; breaking the lock.\n',
          );
-          return { release };
+          // Takeover is just an overwrite — safe because we hold the mutex, so no
+          // acquirer or releaser can touch the lock between our read and this write.
+          await writeFile(lockPath, formatRestartLockContent(token));
+          owned = true;
        }
-        // A concurrent breaker won the takeover; back off and re-evaluate.
-        await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
-        continue;
+        // else: a fresh restart owns it — wait below and re-evaluate.
      }
-      await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
+    } finally {
+      await releaseMutex();
    }
+    if (owned) {
+      return { release };
+    }
+    await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
  }
 }