feat(#462): add federation list verb

2026-06-24 18:54:47 -05:00
8 changed files with 74 additions and 1097 deletions
--- a/apps/gateway/src/federation/server/verbs/tests/list-query.service.spec.ts
+++ b/apps/gateway/src/federation/server/verbs/tests/list-query.service.spec.ts
@@ -1,10 +1,8 @@
 import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest';
 import {
  createPgliteDb,
  insights,
  missionTasks,
  missions,
  preferences,
  projects,
  runPgliteMigrations,
  teams,
@@ -27,21 +25,13 @@ const TASK_FILTER: FederationScopeQueryFilter = {
 const SUBJECT_USER_ID = 'fed-m3-05-subject';
 const OTHER_USER_ID = 'fed-m3-05-other';
 const TEAM_ID = '05000000-0000-4000-8000-000000000001';
 const UNAUTHORIZED_TEAM_ID = '05000000-0000-4000-8000-000000000002';
 const PERSONAL_PROJECT_ID = '05000000-0000-4000-8000-000000000101';
 const TEAM_PROJECT_ID = '05000000-0000-4000-8000-000000000102';
 const UNAUTHORIZED_PROJECT_ID = '05000000-0000-4000-8000-000000000103';
 const PERSONAL_MISSION_ID = '05000000-0000-4000-8000-000000000201';
 const TEAM_MISSION_ID = '05000000-0000-4000-8000-000000000202';
 const UNAUTHORIZED_MISSION_ID = '05000000-0000-4000-8000-000000000203';
 const SUBJECT_TEAM_NOTE_ID = '05000000-0000-4000-8000-000000000301';
 const OTHER_TEAM_NOTE_ID = '05000000-0000-4000-8000-000000000302';
 const SUBJECT_PERSONAL_NOTE_ID = '05000000-0000-4000-8000-000000000303';
 const SUBJECT_UNAUTHORIZED_NOTE_ID = '05000000-0000-4000-8000-000000000304';
 const INSIGHT_ONE_ID = '05000000-0000-4000-8000-000000000401';
 const INSIGHT_TWO_ID = '05000000-0000-4000-8000-000000000402';
 const PREFERENCE_ONE_ID = '05000000-0000-4000-8000-000000000501';
 const PREFERENCE_TWO_ID = '05000000-0000-4000-8000-000000000502';
 let dbHandle: DbHandle | undefined;
@@ -76,22 +66,13 @@ async function seedNotesFixture() {
    },
  ]);
-  await dbHandle.db.insert(teams).values([
+  await dbHandle.db.insert(teams).values({
-    {
+    id: TEAM_ID,
-      id: TEAM_ID,
+    name: 'FED-M3-05 Team',
-      name: 'FED-M3-05 Team',
+    slug: 'fed-m3-05-team',
-      slug: 'fed-m3-05-team',
+    ownerId: SUBJECT_USER_ID,
-      ownerId: SUBJECT_USER_ID,
+    managerId: SUBJECT_USER_ID,
-      managerId: SUBJECT_USER_ID,
+  });
    },
    {
      id: UNAUTHORIZED_TEAM_ID,
      name: 'FED-M3-05 Unauthorized Team',
      slug: 'fed-m3-05-unauthorized-team',
      ownerId: OTHER_USER_ID,
      managerId: OTHER_USER_ID,
    },
  ]);
  await dbHandle.db.insert(projects).values([
    {
@@ -106,12 +87,6 @@ async function seedNotesFixture() {
      teamId: TEAM_ID,
      ownerType: 'team',
    },
    {
      id: UNAUTHORIZED_PROJECT_ID,
      name: 'FED-M3-05 Unauthorized Project',
      teamId: UNAUTHORIZED_TEAM_ID,
      ownerType: 'team',
    },
  ]);
  await dbHandle.db.insert(missions).values([
@@ -127,12 +102,6 @@ async function seedNotesFixture() {
      projectId: TEAM_PROJECT_ID,
      userId: SUBJECT_USER_ID,
    },
    {
      id: UNAUTHORIZED_MISSION_ID,
      name: 'FED-M3-05 Unauthorized Mission',
      projectId: UNAUTHORIZED_PROJECT_ID,
      userId: SUBJECT_USER_ID,
    },
  ]);
  await dbHandle.db.insert(missionTasks).values([
@@ -160,53 +129,6 @@ async function seedNotesFixture() {
      createdAt: new Date('2026-06-24T01:00:00.000Z'),
      updatedAt: new Date('2026-06-24T01:00:00.000Z'),
    },
    {
      id: SUBJECT_UNAUTHORIZED_NOTE_ID,
      missionId: UNAUTHORIZED_MISSION_ID,
      userId: SUBJECT_USER_ID,
      notes: 'subject note outside grant-visible missions',
      createdAt: new Date('2026-06-24T04:00:00.000Z'),
      updatedAt: new Date('2026-06-24T04:00:00.000Z'),
    },
  ]);
  const memoryCreatedAt = new Date('2026-06-24T05:00:00.000Z');
  await dbHandle.db.insert(insights).values([
    {
      id: INSIGHT_ONE_ID,
      userId: SUBJECT_USER_ID,
      content: 'first insight',
      source: 'agent',
      createdAt: memoryCreatedAt,
      updatedAt: memoryCreatedAt,
    },
    {
      id: INSIGHT_TWO_ID,
      userId: SUBJECT_USER_ID,
      content: 'second insight',
      source: 'agent',
      createdAt: memoryCreatedAt,
      updatedAt: memoryCreatedAt,
    },
  ]);
  await dbHandle.db.insert(preferences).values([
    {
      id: PREFERENCE_ONE_ID,
      userId: SUBJECT_USER_ID,
      key: 'fed-m3-05-pref-1',
      value: { enabled: true },
      createdAt: memoryCreatedAt,
      updatedAt: memoryCreatedAt,
    },
    {
      id: PREFERENCE_TWO_ID,
      userId: SUBJECT_USER_ID,
      key: 'fed-m3-05-pref-2',
      value: { enabled: false },
      createdAt: memoryCreatedAt,
      updatedAt: memoryCreatedAt,
    },
  ]);
 }
@@ -321,31 +243,6 @@ describe('FederationListQueryService', () => {
    );
  });
  it('throws when a truncated page cannot encode a resumable cursor', async () => {
    // Two stubbed rows against a limit of 1 make the page truncated; their
    // createdAt values are plain strings rather than Date instances.
    const unencodableRows = ['2', '1'].map((id) => ({ id, createdAt: 'not-a-date' }));
    const listService = makeService();
    stubRows(listService, unencodableRows);

    const pendingList = listService.list({ filter: { ...TASK_FILTER, limit: 1 } });

    await expect(pendingList).rejects.toThrow('Federation list cursor cannot be encoded');
  });
  it('throws on unsupported resources instead of crashing pagination', async () => {
    // The cast forces an arbitrary string into the filter's resource type so the
    // service is exercised with a value it has no branch for.
    const bogusResource = 'unknown-resource' as FederationScopeQueryFilter['resource'];
    const listService = makeService();

    const pendingList = listService.list({
      filter: { ...TASK_FILTER, resource: bogusResource },
    });

    await expect(pendingList).rejects.toThrow('Unsupported federation list resource');
  });
  it('does not leak another user mission task notes through team-scoped note reads', async () => {
    const service = makeDbService();
@@ -381,48 +278,4 @@ describe('FederationListQueryService', () => {
    expect(result.items.map((item) => item['id'])).not.toContain(SUBJECT_PERSONAL_NOTE_ID);
  });
  it('does not return subject notes from missions outside the grant-visible project set', async () => {
    const dbService = makeDbService();
    const { items } = await dbService.list({
      filter: {
        resource: 'notes',
        subjectUserId: SUBJECT_USER_ID,
        includePersonal: true,
        teamIds: [TEAM_ID],
        limit: 10,
        maxRowsPerQuery: 10,
      },
    });
    const returnedIds = items.map((row) => row['id']);

    // The subject's personal note and team note are expected in the result...
    for (const visibleId of [SUBJECT_PERSONAL_NOTE_ID, SUBJECT_TEAM_NOTE_ID]) {
      expect(returnedIds).toContain(visibleId);
    }
    // ...while the note attached to the unauthorized mission, and the note
    // seeded under the "other" identifier, must be absent.
    for (const hiddenId of [SUBJECT_UNAUTHORIZED_NOTE_ID, OTHER_TEAM_NOTE_ID]) {
      expect(returnedIds).not.toContain(hiddenId);
    }
  });
  it('paginates memory deterministically across insights and preferences', async () => {
    // A page size of 2 over the four seeded memory rows (two insights, two
    // preferences) requires exactly one follow-up page.
    const memoryFilter: FederationScopeQueryFilter = {
      resource: 'memory',
      subjectUserId: SUBJECT_USER_ID,
      includePersonal: true,
      teamIds: [],
      limit: 2,
      maxRowsPerQuery: 2,
    };
    const dbService = makeDbService();

    const pageOne = await dbService.list({ filter: memoryFilter });
    const pageTwo = await dbService.list({ filter: memoryFilter, cursor: pageOne.nextCursor });

    const pageOneIds = pageOne.items.map((row) => row['id']);
    const pageTwoIds = pageTwo.items.map((row) => row['id']);
    const seenIds = pageOneIds.concat(pageTwoIds);

    expect(pageOne).toMatchObject({ truncated: true, nextCursor: expect.any(String) });
    expect(pageOneIds).toEqual([INSIGHT_TWO_ID, INSIGHT_ONE_ID]);
    expect(pageTwoIds).toEqual([PREFERENCE_TWO_ID, PREFERENCE_ONE_ID]);
    // No id may be served twice across the two pages.
    expect(new Set(seenIds).size).toBe(seenIds.length);
  });
 });
--- a/apps/gateway/src/federation/server/verbs/tests/list.controller.spec.ts
+++ b/apps/gateway/src/federation/server/verbs/tests/list.controller.spec.ts
@@ -120,24 +120,6 @@ describe('ListController', () => {
    });
  });
  it('returns a federation error envelope when auth guard context is missing', async () => {
    const { controller, scope, query } = makeController();
    // An empty object stands in for a request that carries no federationContext.
    const requestWithoutContext = {} as unknown as FastifyRequest;
    const expectedRejection = {
      response: {
        error: {
          code: 'unauthorized',
          message: 'Federation context missing',
        },
      },
      status: 401,
    };

    const pendingList = controller.list('tasks', requestWithoutContext, {});

    await expect(pendingList).rejects.toMatchObject(expectedRejection);
    // The rejection must happen before scope evaluation or any query runs.
    expect(scope.evaluateAccess).not.toHaveBeenCalled();
    expect(query.list).not.toHaveBeenCalled();
  });
  it('returns a federation error envelope when scope evaluation denies access', async () => {
    const { controller, query } = makeController({
      scopeResult: {
--- a/apps/gateway/src/federation/server/verbs/list-query.service.ts
+++ b/apps/gateway/src/federation/server/verbs/list-query.service.ts
@@ -44,29 +44,23 @@ export interface FederationListQueryResult<T extends object = Record<string, unk
  readonly truncated: boolean;
 }
-type CursorSource = 'insights' | 'preferences';
+type RowObject = Record<string, unknown>;
 const CURSOR_SOURCE = Symbol('federationCursorSource');
 type RowObject = Record<string, unknown> & { readonly [CURSOR_SOURCE]?: CursorSource };
 interface KeysetCursor {
  readonly createdAt: Date;
  readonly id: string;
  readonly source?: CursorSource;
 }
-function encodeCursor(row: RowObject): string {
+function encodeCursor(row: RowObject): string | undefined {
  const createdAt = row['createdAt'];
  const id = row['id'];
  if (!(createdAt instanceof Date) || typeof id !== 'string') {
-    throw new Error('Federation list cursor cannot be encoded');
+    return undefined;
  }
-  const source = row[CURSOR_SOURCE];
+  return Buffer.from(JSON.stringify({ createdAt: createdAt.toISOString(), id }), 'utf8').toString(
-  return Buffer.from(
+    'base64url',
-    JSON.stringify({ createdAt: createdAt.toISOString(), id, ...(source ? { source } : {}) }),
+  );
    'utf8',
  ).toString('base64url');
 }
 function decodeCursor(cursor: string | undefined): KeysetCursor | undefined {
@@ -80,24 +74,17 @@ function decodeCursor(cursor: string | undefined): KeysetCursor | undefined {
      throw new Error('cursor must be an object');
    }
-    const { createdAt, id, source } = parsed as {
+    const { createdAt, id } = parsed as { createdAt?: unknown; id?: unknown };
      createdAt?: unknown;
      id?: unknown;
      source?: unknown;
    };
    if (typeof createdAt !== 'string' || typeof id !== 'string' || id.length === 0) {
      throw new Error('cursor is missing createdAt or id');
    }
    if (source !== undefined && source !== 'insights' && source !== 'preferences') {
      throw new Error('cursor source is invalid');
    }
    const date = new Date(createdAt);
    if (Number.isNaN(date.getTime())) {
      throw new Error('cursor createdAt is invalid');
    }
-    return { createdAt: date, id, ...(source ? { source } : {}) };
+    return { createdAt: date, id };
  } catch {
    throw new Error('Invalid federation list cursor');
  }
@@ -115,15 +102,6 @@ function paginate<T extends RowObject>(rows: T[], limit: number): FederationList
  };
 }
 /**
  * Tags `row` in place with the cursor source it came from and returns the
  * same object.
  *
  * The tag is stored under the `CURSOR_SOURCE` symbol as a non-enumerable,
  * non-configurable property, so it does not show up when the row's own
  * enumerable keys are listed or copied.
  *
  * @param row - Row object to tag; it is mutated.
  * @param source - Cursor source value to record on the row.
  * @returns The `row` argument itself.
  */
 function markCursorSource<T extends RowObject>(row: T, source: CursorSource): T {
  const hiddenTag: PropertyDescriptor = {
    configurable: false,
    enumerable: false,
    value: source,
  };
  // Object.defineProperty hands back the object it was given.
  return Object.defineProperty(row, CURSOR_SOURCE, hiddenTag);
 }
 function sortRows(rows: RowObject[]): RowObject[] {
  return [...rows].sort((a, b) => {
    const aTime = a['createdAt'] instanceof Date ? a['createdAt'].getTime() : 0;
@@ -181,8 +159,6 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
      case 'credentials':
      case 'api_keys':
        return [];
      default:
        throw new Error(`Unsupported federation list resource: ${String(filter.resource)}`);
    }
  }
@@ -335,25 +311,22 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
    if (!filter.includePersonal) {
      return [];
    }
    if (cursor && cursor.source === undefined) {
      throw new Error('Invalid federation list cursor');
    }
-    const rows: RowObject[] = [];
+    const insightCursorClause = cursor
      ? or(
          lt(insights.createdAt, cursor.createdAt),
          and(eq(insights.createdAt, cursor.createdAt), lt(insights.id, cursor.id)),
        )
      : undefined;
    const preferenceCursorClause = cursor
      ? or(
          lt(preferences.createdAt, cursor.createdAt),
          and(eq(preferences.createdAt, cursor.createdAt), lt(preferences.id, cursor.id)),
        )
      : undefined;
-    // Memory spans two physical tables. To keep pagination deterministic and
+    const [insightRows, preferenceRows] = await Promise.all([
-    // resumable without a SQL UNION, M3 emits a fixed block order: all insights
+      this.db
    // first, then preferences. The opaque cursor records which table produced
    // the boundary row, so the next page never re-applies one table's keyset to
    // the other table (which could duplicate/skip rows at equal timestamps).
    if (cursor?.source !== 'preferences') {
      const insightCursorClause = cursor
        ? or(
            lt(insights.createdAt, cursor.createdAt),
            and(eq(insights.createdAt, cursor.createdAt), lt(insights.id, cursor.id)),
          )
        : undefined;
      const insightRows = await this.db
        .select({
          id: insights.id,
          kind: insights.source,
@@ -367,42 +340,24 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
        .from(insights)
        .where(and(eq(insights.userId, filter.subjectUserId), insightCursorClause))
        .orderBy(desc(insights.createdAt), desc(insights.id))
-        .limit(rowLimit);
+        .limit(rowLimit),
      this.db
        .select({
          id: preferences.id,
          kind: preferences.category,
          key: preferences.key,
          value: preferences.value,
          source: preferences.source,
          mutable: preferences.mutable,
          createdAt: preferences.createdAt,
          updatedAt: preferences.updatedAt,
        })
        .from(preferences)
        .where(and(eq(preferences.userId, filter.subjectUserId), preferenceCursorClause))
        .orderBy(desc(preferences.createdAt), desc(preferences.id))
        .limit(rowLimit),
    ]);
-      rows.push(...(insightRows as RowObject[]).map((row) => markCursorSource(row, 'insights')));
+    return sortRows([...(insightRows as RowObject[]), ...(preferenceRows as RowObject[])]);
    }
    const remaining = rowLimit - rows.length;
    if (remaining <= 0) {
      return rows;
    }
    const preferenceCursorClause =
      cursor?.source === 'preferences'
        ? or(
            lt(preferences.createdAt, cursor.createdAt),
            and(eq(preferences.createdAt, cursor.createdAt), lt(preferences.id, cursor.id)),
          )
        : undefined;
    const preferenceRows = await this.db
      .select({
        id: preferences.id,
        kind: preferences.category,
        key: preferences.key,
        value: preferences.value,
        source: preferences.source,
        mutable: preferences.mutable,
        createdAt: preferences.createdAt,
        updatedAt: preferences.updatedAt,
      })
      .from(preferences)
      .where(and(eq(preferences.userId, filter.subjectUserId), preferenceCursorClause))
      .orderBy(desc(preferences.createdAt), desc(preferences.id))
      .limit(remaining);
    rows.push(
      ...(preferenceRows as RowObject[]).map((row) => markCursorSource(row, 'preferences')),
    );
    return rows;
  }
 }
--- a/apps/gateway/src/federation/server/verbs/list.controller.ts
+++ b/apps/gateway/src/federation/server/verbs/list.controller.ts
@@ -24,7 +24,6 @@ import type { FastifyRequest } from 'fastify';
 import {
  FederationInvalidRequestError,
  FederationScopeViolationError,
  FederationUnauthorizedError,
  SOURCE_LOCAL,
  tagWithSource,
  type FederationListResponse,
@@ -94,10 +93,7 @@ export class ListController {
    @Body() body?: FederationListRequestBody,
  ): Promise<FederationListResponse<FederatedRow>> {
    if (!request.federationContext) {
-      throw new HttpException(
+      throw new Error('Federation context missing after auth guard');
        new FederationUnauthorizedError('Federation context missing').toEnvelope(),
        401,
      );
    }
    const requestedLimit = parseLimit(body);
--- a/docs/scratchpads/FED-M3-05-list-verb.md
+++ b/docs/scratchpads/FED-M3-05-list-verb.md
@@ -28,12 +28,10 @@ Implement `POST /api/federation/v1/list/:resource`.
  - `credentials` / `api_keys`: denied by native RBAC in M3 even if present in scope; sensitive-resource implementation is not part of FED-M3-05.
 - Cursor pagination uses an opaque base64url keyset cursor over `(createdAt, id)`; DB reads fetch at most `limit + 1` rows per resource query.
 - Reviewer isolation fix: `mission_tasks.notes` rows are always constrained by `missionTasks.userId = subjectUserId` and accessible mission IDs; team scope narrows missions but never widens to other users' mission task notes.
 - Follow-up review fix: memory listing now uses deterministic table-block pagination (`insights` first, then `preferences`) with cursor source metadata, so one table's cursor is never applied to the other.
 - Follow-up hardening: missing auth-guard context returns a structured federation `unauthorized` envelope; unsupported resources and non-encodable truncated cursors throw instead of silently crashing/truncating.
 ## Tests
- `pnpm --filter @mosaicstack/gateway test -- list.controller.spec.ts list-query.service.spec.ts` — PASS (16 tests, including PGlite regression coverage for team-scoped notes isolation, unauthorized mission notes exclusion, `includePersonal: false`, deterministic memory pagination, missing context envelope, unsupported resource, and cursor encode failure).
+- `pnpm --filter @mosaicstack/gateway test -- list.controller.spec.ts list-query.service.spec.ts` — PASS (11 tests, including PGlite regression coverage for team-scoped notes isolation and `includePersonal: false`).
 - `pnpm --filter @mosaicstack/gateway typecheck` — PASS.
 - `pnpm --filter @mosaicstack/gateway lint` — PASS.
 - `pnpm format:check` — PASS.
@@ -43,8 +41,8 @@ Implement `POST /api/federation/v1/list/:resource`.
 ## Review evidence
- `~/.config/mosaic/tools/codex/codex-code-review.sh --uncommitted` — PASS after follow-up remediation; approve, no findings.
+- `~/.config/mosaic/tools/codex/codex-code-review.sh --uncommitted` — PASS after remediation; approve, no findings.
- `~/.config/mosaic/tools/codex/codex-security-review.sh --uncommitted` — PASS after follow-up remediation; risk level none, no findings.
+- `~/.config/mosaic/tools/codex/codex-security-review.sh --uncommitted` — PASS after cursor + notes isolation remediation; risk level none, no findings.
 - Security-review note: read-path audit logging remains intentionally deferred to M4 per orchestrator clarification and FED-M3-05 scope.
 ## Risks / follow-up
--- a/packages/mosaic/src/commands/fleet.spec.ts
+++ b/packages/mosaic/src/commands/fleet.spec.ts
@@ -4,7 +4,6 @@ import { dirname, join, resolve } from 'node:path';
 import { Command } from 'commander';
 import { afterEach, describe, expect, it, vi } from 'vitest';
 import {
  acquireRestartLock,
  addAgentToRoster,
  buildAgentSendCommand,
  buildAgentWatchAttachCommand,
@@ -46,8 +45,6 @@ import {
  removeAgentFromRoster,
  resolveFleetPaths,
  resolvePresetFilename,
  restartLockPath,
  RESTART_LOCK_STALE_MS,
  RUNTIME_ACCEPTABLE_COMMANDS,
  serializeRosterToYaml,
  VERIFY_DEFAULT_TIMEOUT_MS,
@@ -681,364 +678,6 @@ describe('fleet command construction', () => {
    }
  });
  it('waits for an in-flight restart to clear before relaunching (re-entry guard)', async () => {
    const home = await tempDir();
    const fleetDir = join(home, 'fleet');
    await mkdir(fleetDir, { recursive: true });
    const rosterYaml = [
      'version: 1',
      'transport: tmux',
      'agents:',
      '  - name: coder0',
      '    runtime: codex',
    ].join('\n');
    await writeFile(join(fleetDir, 'roster.yaml'), rosterYaml);

    // Pretend a second `mosaic fleet restart` is mid-teardown: the lock file is
    // already present and stamped with the current time, so it is not stale.
    const lockPath = restartLockPath(home);
    await mkdir(dirname(lockPath), { recursive: true });
    await writeFile(lockPath, `4242\n${Date.now()}\n`);

    const timeline: string[] = [];
    let sleepCount = 0;
    const runner: CommandRunner = async (_command, args) => {
      timeline.push(`run:${args[args.length - 1]}`);
      return { stdout: '', stderr: '', exitCode: 0 };
    };
    // Each injected sleep models one poll; the competing restart "finishes"
    // (its lock file is removed) during the first one.
    const sleepFn: SleepFn = async () => {
      sleepCount += 1;
      timeline.push(`sleep:${sleepCount}`);
      await rm(lockPath, { force: true });
    };

    const program = new Command();
    program.exitOverride();
    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

    try {
      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);

      // At least one wait has to come before the first systemctl invocation.
      expect(sleepCount).toBeGreaterThan(0);
      const firstSleepAt = timeline.findIndex((entry) => entry.startsWith('sleep:'));
      const firstRunAt = timeline.findIndex((entry) => entry.startsWith('run:'));
      expect(firstSleepAt).toBeGreaterThanOrEqual(0);
      expect(firstRunAt).toBeGreaterThan(firstSleepAt);

      // Once the lock clears, the holder and the agent are both restarted.
      expect(timeline).toContain('run:mosaic-tmux-holder.service');
      expect(timeline).toContain('run:mosaic-agent@coder0.service');

      // Completing the restart leaves no lock file behind.
      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
    } finally {
      await rm(home, { recursive: true, force: true });
    }
  });
  it('breaks a stale restart lock and proceeds without waiting', async () => {
    const home = await tempDir();
    const fleetDir = join(home, 'fleet');
    await mkdir(fleetDir, { recursive: true });
    const rosterYaml = [
      'version: 1',
      'transport: tmux',
      'agents:',
      '  - name: coder0',
      '    runtime: codex',
    ].join('\n');
    await writeFile(join(fleetDir, 'roster.yaml'), rosterYaml);

    // Leftover lock from a crashed owner: its timestamp lies one second beyond
    // the stale window.
    const lockPath = restartLockPath(home);
    await mkdir(dirname(lockPath), { recursive: true });
    const staleStamp = Date.now() - RESTART_LOCK_STALE_MS - 1_000;
    await writeFile(lockPath, `4242\n${staleStamp}\n`);

    const invocations: string[][] = [];
    const runner: CommandRunner = async (command, args) => {
      invocations.push([command, ...args]);
      return { stdout: '', stderr: '', exitCode: 0 };
    };
    const sleepFn = vi.fn<SleepFn>(async () => {});

    const program = new Command();
    program.exitOverride();
    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

    try {
      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);

      // A stale lock is broken right away, so no poll ever happens.
      expect(sleepFn).not.toHaveBeenCalled();
      const expectedInvocations = [
        ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
        ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
      ];
      expect(invocations).toEqual(expectedInvocations);

      // After the restart finishes the stale lock file no longer exists.
      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
    } finally {
      await rm(home, { recursive: true, force: true });
    }
  });
  it('releases the restart lock so a subsequent restart is not blocked', async () => {
    const home = await tempDir();
    const fleetDir = join(home, 'fleet');
    await mkdir(fleetDir, { recursive: true });
    const rosterYaml = [
      'version: 1',
      'transport: tmux',
      'agents:',
      '  - name: coder0',
      '    runtime: codex',
    ].join('\n');
    await writeFile(join(fleetDir, 'roster.yaml'), rosterYaml);

    const invocations: string[][] = [];
    const runner: CommandRunner = async (command, args) => {
      invocations.push([command, ...args]);
      return { stdout: '', stderr: '', exitCode: 0 };
    };
    const sleepFn = vi.fn<SleepFn>(async () => {});

    const program = new Command();
    program.exitOverride();
    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

    const restartArgv = ['node', 'mosaic', 'fleet', 'restart'];
    const oneFullRestart = [
      ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
      ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
    ];

    try {
      await program.parseAsync(restartArgv);
      await program.parseAsync(restartArgv);

      // Back-to-back restarts each run in full, and neither one has to poll.
      expect(sleepFn).not.toHaveBeenCalled();
      expect(invocations).toEqual([...oneFullRestart, ...oneFullRestart]);
    } finally {
      await rm(home, { recursive: true, force: true });
    }
  });
  it('guards the single-agent restart path behind the in-flight restart lock', async () => {
    const home = await tempDir();
    const fleetDir = join(home, 'fleet');
    await mkdir(fleetDir, { recursive: true });
    const rosterYaml = [
      'version: 1',
      'transport: tmux',
      'agents:',
      '  - name: coder0',
      '    runtime: codex',
    ].join('\n');
    await writeFile(join(fleetDir, 'roster.yaml'), rosterYaml);

    // The lock is already held with a current timestamp, as if a full restart
    // were mid-flight when the single-agent restart is issued.
    const lockPath = restartLockPath(home);
    await mkdir(dirname(lockPath), { recursive: true });
    await writeFile(lockPath, `4242\n${Date.now()}\n`);

    const timeline: string[] = [];
    let sleepCount = 0;
    const runner: CommandRunner = async (_command, args) => {
      timeline.push(`run:${args[args.length - 1]}`);
      return { stdout: '', stderr: '', exitCode: 0 };
    };
    // The first poll removes the lock file, simulating the other restart ending.
    const sleepFn: SleepFn = async () => {
      sleepCount += 1;
      timeline.push(`sleep:${sleepCount}`);
      await rm(lockPath, { force: true });
    };

    const program = new Command();
    program.exitOverride();
    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });

    try {
      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart', 'coder0']);

      // The single-agent path also waits before it runs anything.
      expect(sleepCount).toBeGreaterThan(0);
      const firstSleepAt = timeline.findIndex((entry) => entry.startsWith('sleep:'));
      const firstRunAt = timeline.findIndex((entry) => entry.startsWith('run:'));
      expect(firstSleepAt).toBeGreaterThanOrEqual(0);
      expect(firstRunAt).toBeGreaterThan(firstSleepAt);

      // Just the named agent's unit is restarted; the holder unit is left alone.
      expect(timeline).toContain('run:mosaic-agent@coder0.service');
      expect(timeline).not.toContain('run:mosaic-tmux-holder.service');
    } finally {
      await rm(home, { recursive: true, force: true });
    }
  });
  // Ownership-safety check for release(): a guard whose lock was judged stale
  // and re-acquired by someone else must leave the new owner's lock file alone.
  // The steps below are order-sensitive (acquire, age the lock, re-acquire,
  // release old guard, release new guard).
  it('does not let a timed-out owner drop a lock another restart broke and re-owned', async () => {
    const home = await tempDir();
    const runDir = join(home, 'fleet', 'run');
    await mkdir(runDir, { recursive: true });
    const lockPath = restartLockPath(home);
    // Reads the owner token: the third newline-separated line of the lock file,
    // trimmed, or '' when that line is absent.
    const tokenOf = async (): Promise<string> => {
      const raw = await readFile(lockPath, 'utf8');
      return raw.split('\n')[2]?.trim() ?? '';
    };
    const sleepFn = vi.fn<SleepFn>(async () => {});
    // R1 acquires the lock and begins a restart that then hangs.
    const r1 = await acquireRestartLock(home, sleepFn);
    const tokenR1 = await tokenOf();
    expect(tokenR1).not.toBe('');
    // The hung R1 leaves a stale lock: rewrite its timestamp into the past while
    // preserving R1's token — exactly the on-disk state a stuck owner leaves.
    await writeFile(lockPath, `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n${tokenR1}\n`);
    // R2 re-enters, sees the stale lock, and atomically takes ownership.
    const r2 = await acquireRestartLock(home, sleepFn);
    const tokenR2 = await tokenOf();
    expect(tokenR2).not.toBe(tokenR1);
    expect(sleepFn).not.toHaveBeenCalled();
    // R1 finally finishes and releases. It must NOT delete R2's lock — otherwise
    // a third restart (R3) could acquire and interleave with R2 still running.
    await r1.release();
    expect(await tokenOf()).toBe(tokenR2);
    // R2 releases cleanly and the lock is gone.
    await r2.release();
    await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
    // NOTE(review): unlike the sibling tests there is no try/finally here, so a
    // failed assertion above skips this temp-dir cleanup.
    await rm(home, { recursive: true, force: true });
  });
  // Mutual-exclusion check: three concurrent acquirers race one stale lock and
  // the critical section must never be occupied by more than one at a time.
  it('lets only one of several concurrent breakers proceed past a stale lock', async () => {
    const home = await tempDir();
    const lockPath = restartLockPath(home);
    await mkdir(dirname(lockPath), { recursive: true });
    // A stale lock left by a crashed owner: every concurrent re-entrant restart
    // will judge it stale and try to break it at the same instant. Breaking must
    // NOT grant ownership — only the atomic re-create may — so exactly one
    // contender can ever hold the lock at a time. (The v2 fix wrote our own token
    // during the break and read it back, so two breakers each saw their own token
    // and BOTH proceeded; this guards that regression.)
    await writeFile(
      lockPath,
      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
    );
    // Yielding sleep so a waiting contender lets the current owner finish and
    // release before it re-contends, instead of spinning the microtask queue.
    const sleepFn: SleepFn = async () => {
      await new Promise((res) => setTimeout(res, 0));
    };
    // `active` counts contenders currently inside the critical section;
    // `maxActive` records the highest value it ever reached.
    let active = 0;
    let maxActive = 0;
    const tokens: string[] = [];
    // Reads the owner token: the third newline-separated line of the lock file,
    // trimmed, or '' when that line is absent.
    const tokenOf = async (): Promise<string> => {
      const raw = await readFile(lockPath, 'utf8');
      return raw.split('\n')[2]?.trim() ?? '';
    };
    // One "restart" = acquire the lock, do work in the critical section, release.
    const restartOnce = async (): Promise<void> => {
      const guard = await acquireRestartLock(home, sleepFn);
      active += 1;
      maxActive = Math.max(maxActive, active);
      // Record the token we own while we hold it, then yield to interleave with
      // any other contender that might (wrongly) believe it owns the lock too.
      tokens.push(await tokenOf());
      await new Promise((res) => setTimeout(res, 0));
      active -= 1;
      await guard.release();
    };
    try {
      // Three breakers race the single stale lock simultaneously.
      await Promise.all([restartOnce(), restartOnce(), restartOnce()]);
      // Mutual exclusion held: never two owners at once despite concurrent breaks.
      expect(maxActive).toBe(1);
      // Each acquire owned with its own distinct token — no two ever shared it.
      expect(new Set(tokens).size).toBe(3);
      // The lock is fully released at the end.
      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
    } finally {
      await rm(home, { recursive: true, force: true });
    }
  });
  // Deterministic two-contender variant of the stale-lock race: a promise
  // barrier keeps the first owner inside the critical section until the other
  // contender has gone through sleepFn at least once.
  it('lets exactly one of two breakers take over a stale lock while the other waits', async () => {
    const home = await tempDir();
    const lockPath = restartLockPath(home);
    await mkdir(dirname(lockPath), { recursive: true });
    // A single stale lock both contenders will judge stale at the same instant.
    // Every transition runs under the registry mutex, so only one may take the
    // lock over; the other must observe a now-fresh owner and WAIT/re-evaluate
    // rather than also taking over. (A content-blind clobber let both believe
    // they owned it — this asserts the mutex-gated CAS takeover instead.)
    await writeFile(
      lockPath,
      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
    );
    // Barrier the winner holds against until the loser has observed the lock
    // fresh and waited at least once — forcing the exact interleaving where one
    // proceeds while the other waits, deterministically rather than by timing.
    let resolveLoserWaited: () => void = () => {};
    const loserWaited = new Promise<void>((res) => {
      resolveLoserWaited = res;
    });
    let sleeps = 0;
    // Every sleep counts as a wait, opens the barrier (resolving an already
    // resolved promise is a no-op), then yields to the timer queue.
    const sleepFn: SleepFn = async () => {
      sleeps += 1;
      resolveLoserWaited();
      await new Promise((res) => setTimeout(res, 0));
    };
    // `active` counts contenders currently inside the critical section;
    // `maxActive` records the highest value it ever reached.
    let active = 0;
    let maxActive = 0;
    const tokens: string[] = [];
    // Reads the owner token: the third newline-separated line of the lock file,
    // trimmed, or '' when that line is absent.
    const tokenOf = async (): Promise<string> => {
      const raw = await readFile(lockPath, 'utf8');
      return raw.split('\n')[2]?.trim() ?? '';
    };
    // True until the first contender gets past acquireRestartLock.
    let firstOwner = true;
    const restartOnce = async (): Promise<void> => {
      const guard = await acquireRestartLock(home, sleepFn);
      active += 1;
      maxActive = Math.max(maxActive, active);
      tokens.push(await tokenOf());
      if (firstOwner) {
        // Winner: keep holding the lock until the loser has waited once, so the
        // loser is guaranteed to see a FRESH owner (not the stale one) and back
        // off — proving it could not also take over.
        firstOwner = false;
        await loserWaited;
      } else {
        await new Promise((res) => setTimeout(res, 0));
      }
      active -= 1;
      await guard.release();
    };
    try {
      // Exactly two breakers race the single stale lock.
      await Promise.all([restartOnce(), restartOnce()]);
      // Mutual exclusion: never two owners at once (if both took over the stale
      // lock, this would be 2).
      expect(maxActive).toBe(1);
      // Both eventually owned, each with its own distinct token.
      expect(new Set(tokens).size).toBe(2);
      // The loser observed the winner's fresh lock and waited — it did NOT also
      // take over the stale lock.
      expect(sleeps).toBeGreaterThanOrEqual(1);
      // The lock is fully released at the end.
      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
    } finally {
      await rm(home, { recursive: true, force: true });
    }
  });
  it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
    const home = await tempDir();
    const rosterPath = join(home, 'fleet', 'roster.yaml');
--- a/packages/mosaic/src/commands/fleet.ts
+++ b/packages/mosaic/src/commands/fleet.ts
@@ -1,16 +1,5 @@
 import { constants } from 'node:fs';
-import {
+import { access, chmod, copyFile, mkdir, readFile, unlink, writeFile } from 'node:fs/promises';
  access,
  chmod,
  copyFile,
  mkdir,
  open,
  readFile,
  stat,
  unlink,
  writeFile,
 } from 'node:fs/promises';
 import { randomUUID } from 'node:crypto';
 import { homedir, hostname, userInfo } from 'node:os';
 import { dirname, join, resolve } from 'node:path';
 import { fileURLToPath } from 'node:url';
@@ -544,295 +533,6 @@ export function buildFleetServiceCommand(action: FleetServiceAction, agentName?:
  return ['systemctl', '--user', action, service];
 }
 /**
 * Poll interval (ms) while waiting for an in-flight restart's lock to clear.
 * Passed to the caller-supplied sleep function between acquisition attempts.
 */
 export const RESTART_LOCK_POLL_INTERVAL_MS = 250;
 /**
 * Maximum time (ms) a re-entrant restart waits for the in-flight restart to
 * finish before it breaks the lock and proceeds anyway. A bound is required so
 * a crashed holder of the lock can never deadlock the fleet permanently.
 * The wait is measured from the moment the waiting caller starts acquiring.
 */
 export const RESTART_LOCK_MAX_WAIT_MS = 30_000;
 /**
 * Age (ms) past which a restart lock is treated as stale (its owner died
 * without releasing it) and is broken immediately rather than waited on.
 * For the restart lock the age is taken from the timestamp written inside the
 * lock file; for the internal mutex/reclaim files it is taken from file mtime.
 */
 export const RESTART_LOCK_STALE_MS = 60_000;
 /**
 * Computes where the cross-process restart lock lives for a Mosaic home.
 *
 * The lock always sits in `<mosaicHome>/fleet/run` (the heartbeat env override
 * is deliberately ignored) so it is scoped to the fleet the restart acts on.
 *
 * @param mosaicHome - Root directory of the Mosaic installation.
 * @returns Path of the `restart.lock` file.
 */
 export function restartLockPath(mosaicHome: string): string {
  const runDir = join(mosaicHome, 'fleet', 'run');
  return join(runDir, 'restart.lock');
 }
 /**
 * Handle for a restart lock the caller currently holds.
 *
 * `release()` removes the lock file only if it still carries the caller's
 * owner token; a lock that another caller has since taken over is left intact.
 */
 interface RestartGuard {
  release(): Promise<void>;
 }
 /**
 * Serializes a lock file body as three newline-terminated lines:
 * the current pid (informational), the current epoch-ms timestamp, and the
 * caller's unique owner token.
 */
 function formatRestartLockContent(token: string): string {
  const fields = [process.pid, Date.now(), token];
  return fields.map((field) => `${field}\n`).join('');
 }
 /**
 * Extracts the owner token (third line) of a lock file.
 *
 * The token is what makes release and takeover ownership-safe: a process only
 * acts on a lock whose token equals its own.
 *
 * @param lockPath - Lock file to inspect.
 * @returns The trimmed token, or null when the file cannot be read or has no
 *   non-empty third line.
 */
 async function readRestartLockToken(lockPath: string): Promise<string | null> {
  const contents = await readFile(lockPath, 'utf8').catch(() => null);
  if (contents === null) {
    return null;
  }
  const [, , tokenLine] = contents.split('\n');
  const trimmed = tokenLine?.trim();
  return trimmed === undefined || trimmed === '' ? null : trimmed;
 }
 /**
 * Decides whether a lock file body describes a stale lock.
 *
 * A lock is stale when the timestamp on its second line is at least
 * RESTART_LOCK_STALE_MS older than `now`, or when that line does not parse to a
 * finite integer (a corrupt or partially written lock left by a crashed owner).
 *
 * @param raw - Full text of the lock file.
 * @param now - Current time in epoch milliseconds.
 */
 function isRestartLockContentStale(raw: string, now: number): boolean {
  const [, stampField = ''] = raw.split('\n');
  const writtenAt = Number.parseInt(stampField.trim(), 10);
  return !Number.isFinite(writtenAt) || now - writtenAt >= RESTART_LOCK_STALE_MS;
 }
 /**
 * Derives the path of the short-lived registry mutex for a restart lock.
 *
 * The mutex guards EVERY transition of the restart lock (acquire, release,
 * takeover). It is held only across a few filesystem operations, never across
 * the restart itself.
 */
 function restartMutexPath(lockPath: string): string {
  return lockPath + '.mutex';
 }
 /**
 * Back-off (ms) between attempts to acquire the registry mutex while another
 * holder has it. Kept short because the mutex is only held across a few
 * filesystem operations.
 */
 const RESTART_MUTEX_RETRY_MS = 20;
 /**
 * Staleness test for the internal mutex / reclaim files, based on file mtime
 * instead of file CONTENT.
 *
 * `open(path, 'wx')` creates the file before any token/timestamp is written
 * into it, so a content-based check could briefly see an empty file, judge it
 * corrupt-and-stale, and reap a lock that a contender is still acquiring. A
 * freshly created file has a fresh mtime and therefore always reads as live.
 * These files are only held across a couple of filesystem operations, so an
 * mtime older than RESTART_LOCK_STALE_MS can only belong to a dead holder.
 *
 * @param path - Mutex or reclaim file to inspect.
 * @param now - Current time in epoch milliseconds.
 * @returns True only when the file exists and its mtime is at least
 *   RESTART_LOCK_STALE_MS old. A missing file (the caller simply re-contends)
 *   and a file that cannot be stat'ed (treated as live, so we back off rather
 *   than reap) both yield false.
 */
 async function isRestartLockPathStale(path: string, now: number): Promise<boolean> {
  const info = await stat(path).catch(() => null);
  if (info === null) {
    return false;
  }
  return now - info.mtimeMs >= RESTART_LOCK_STALE_MS;
 }
 /**
 * Derives the path of the reclaim lock, which serializes reaping of a registry
 * mutex left behind by a crashed holder.
 */
 function restartReclaimPath(mutexPath: string): string {
  return mutexPath + '.reclaim';
 }
 /**
 * Reap a registry mutex left behind by a process that CRASHED mid-transition —
 * one whose file has aged past RESTART_LOCK_STALE_MS. Because the mutex is held
 * only for a couple of filesystem ops (no sleeps, never across the restart), a
 * mutex this old can only belong to a dead holder.
 *
 * The reap removes the dead mutex but never CREATES/holds it — acquisition stays
 * the single `open('wx')` create in {@link acquireRestartMutex}, so exactly one
 * contender wins ownership no matter how the reap and acquires interleave. The
 * removal is made conditional by a dedicated reclaim lock: while it is held the
 * dead mutex is stable (its dead holder will never touch it, and no other
 * reclaimer can race), so re-reading it and removing it only if it is STILL stale
 * is a true compare — a live holder's fresh mutex is never removed.
 *
 * This call is best-effort: it returns without reaping when another reclaimer
 * currently holds the reclaim lock, so callers must re-contend afterwards.
 *
 * @param mutexPath - Path of the registry mutex to reap if it is stale.
 * @throws Any error other than EEXIST from creating the reclaim lock, and any
 *   error from closing its file handle.
 */
 async function reclaimStaleRestartMutex(mutexPath: string): Promise<void> {
  const reclaimPath = restartReclaimPath(mutexPath);
  let handle: Awaited<ReturnType<typeof open>>;
  try {
    handle = await open(reclaimPath, 'wx');
  } catch (err) {
    if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
      throw err;
    }
    // Someone is already reclaiming. If their reclaim lock is itself stale by
    // mtime, its holder crashed mid-reap (the lock spans only a stat + unlink)
    // — clear it so a later pass can retry. Otherwise a live reclaimer has it;
    // back off. Either way we do not reap the mutex this pass.
    if (await isRestartLockPathStale(reclaimPath, Date.now())) {
      await unlink(reclaimPath).catch(() => {});
    }
    return;
  }
  try {
    // Re-check the mutex UNDER the reclaim lock and remove it only if it is STILL
    // stale by mtime. A live holder's mutex is fresh and is left untouched; a dead
    // holder's mutex is stable here (its holder is gone and no other reclaimer can
    // race us), so this re-check is authoritative.
    if (await isRestartLockPathStale(mutexPath, Date.now())) {
      await unlink(mutexPath).catch(() => {});
    }
  } finally {
    // Always drop the reclaim lock, even if closing the handle rejects:
    // otherwise a failed close would leave the reclaim file behind and block
    // every other reclaimer until it aged past the stale threshold.
    try {
      await handle.close();
    } finally {
      await unlink(reclaimPath).catch(() => {});
    }
  }
 }
 /**
 * Acquire the registry mutex, BLOCKING (with brief back-offs) until held, and
 * return a token-gated release. This is the single point of mutual exclusion for
 * the restart lock: acquire, release, and stale/timeout takeover all run under it,
 * so "read the lock, then mutate it" is atomic — no acquirer, releaser, or breaker
 * can ever interleave with another. A mutex left by a crashed holder is reclaimed
 * once it ages past the stale threshold.
 *
 * @param mutexPath - Path of the registry mutex file.
 * @param token - Caller's unique owner token, written into the mutex file.
 * @returns A release function that removes the mutex only while it still carries
 *   `token`.
 * @throws Any error other than EEXIST from creating the mutex, and any error
 *   from writing the token into it (the half-created file is removed first).
 */
 async function acquireRestartMutex(
  mutexPath: string,
  token: string,
 ): Promise<RestartGuard['release']> {
  for (;;) {
    let handle: Awaited<ReturnType<typeof open>>;
    try {
      handle = await open(mutexPath, 'wx');
    } catch (err) {
      if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
        throw err;
      }
      // Staleness is judged by mtime, not content, so a mutex that exists but has
      // not yet had its token written (the open-before-write window) reads as live
      // and is never wrongly reaped.
      if (await isRestartLockPathStale(mutexPath, Date.now())) {
        await reclaimStaleRestartMutex(mutexPath);
      }
      // Back off before EVERY retry, including after a reclaim attempt. The
      // reclaim is best-effort: when another reclaimer holds the reclaim lock
      // (possibly a crashed one, until that lock itself ages out) the stale
      // mutex is still there, and retrying immediately would spin on filesystem
      // calls for as long as that lasts.
      await new Promise<void>((done) => setTimeout(done, RESTART_MUTEX_RETRY_MS));
      continue;
    }
    // We created the mutex. Populate it with our token; if writing fails, clean up
    // our own file so we never leak an empty mutex that a peer would have to reap.
    try {
      await handle.writeFile(formatRestartLockContent(token));
      await handle.close();
    } catch (err) {
      await handle.close().catch(() => {});
      await unlink(mutexPath).catch(() => {});
      throw err;
    }
    return async (): Promise<void> => {
      // Token-gated: never remove a mutex that is no longer ours.
      if ((await readRestartLockToken(mutexPath)) !== token) return;
      await unlink(mutexPath).catch(() => {});
    };
  }
 }
 /**
 * Acquire the fleet restart lock, serializing concurrent `mosaic fleet restart`
 * invocations across processes. Each restart tears the tmux holder (and the
 * agent sessions inside it) down and back up; without this guard a re-entrant
 * restart relaunches agents against a half-torn-down holder, which fails and
 * tight-loops. A re-entrant caller waits for the in-flight restart to release
 * the lock (clean shutdown settled) before proceeding, breaks a stale lock left
 * by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to
 * avoid a permanent deadlock.
 *
 * Correctness rests on a single invariant: EVERY transition of the lock — taking
 * a free lock, taking over a stale/timed-out one, and releasing — happens under
 * the registry mutex. Because the check ("is the lock free / stale / fresh?") and
 * the mutation that follows it both run while the mutex is held, they are atomic:
 * no other acquirer, releaser, or breaker can slip in between. That is what makes
 * takeover a true compare-and-swap rather than a content-blind clobber — a normal
 * `open('wx')` acquirer cannot create a fresh lock in a gap, and the original
 * owner's `release()` (also mutex-gated and token-checked) cannot drop a lock a
 * breaker already took over. So no interleaving lets two restarts both own the
 * lock and run concurrently.
 *
 * @param mosaicHome - Mosaic home whose `fleet/run` directory holds the lock;
 *   the directory is created if missing.
 * @param sleepFn - Called with RESTART_LOCK_POLL_INTERVAL_MS between attempts
 *   while a fresh lock is held by someone else.
 * @returns A guard whose `release()` removes the lock if this caller still owns it.
 */
 export async function acquireRestartLock(
  mosaicHome: string,
  sleepFn: SleepFn,
 ): Promise<RestartGuard> {
  // One token per acquisition; it identifies this caller in both the lock file
  // and the registry mutex.
  const token = randomUUID();
  const lockPath = restartLockPath(mosaicHome);
  const mutexPath = restartMutexPath(lockPath);
  await mkdir(dirname(lockPath), { recursive: true });
  const release = async (): Promise<void> => {
    // Mutex-gated and token-gated: only remove the lock if it is still ours. If
    // another caller took it over (after a stale/timeout break) the token no
    // longer matches and we leave their lock intact.
    const releaseMutex = await acquireRestartMutex(mutexPath, token);
    try {
      if ((await readRestartLockToken(lockPath)) === token) {
        await unlink(lockPath).catch(() => {});
      }
    } finally {
      await releaseMutex();
    }
  };
  // The timeout clock starts here, once, and is not reset between polls.
  const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS;
  for (;;) {
    let owned = false;
    const releaseMutex = await acquireRestartMutex(mutexPath, token);
    try {
      // Read and (if appropriate) mutate the lock atomically under the mutex.
      let current: string | null = null;
      let absent = false;
      try {
        current = await readFile(lockPath, 'utf8');
      } catch (readErr) {
        if ((readErr as NodeJS.ErrnoException).code === 'ENOENT') {
          absent = true;
        } else {
          current = null; // Unreadable/corrupt: treat as stale.
        }
      }
      const now = Date.now();
      if (absent) {
        // Lock is free — take it.
        await writeFile(lockPath, formatRestartLockContent(token));
        owned = true;
      } else {
        const stale = current === null || isRestartLockContentStale(current, now);
        const timedOut = now >= deadline;
        if (stale || timedOut) {
          process.stderr.write(
            stale
              ? 'Breaking stale fleet restart lock.\n'
              : `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` +
                  'restart; breaking the lock.\n',
          );
          // Takeover is just an overwrite — safe because we hold the mutex, so no
          // acquirer or releaser can touch the lock between our read and this write.
          await writeFile(lockPath, formatRestartLockContent(token));
          owned = true;
        }
        // else: a fresh restart owns it — wait below and re-evaluate.
      }
    } finally {
      // The mutex is dropped before sleeping or returning, so it is never held
      // across the poll wait or the restart itself.
      await releaseMutex();
    }
    if (owned) {
      return { release };
    }
    await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
  }
 }
 /**
 * Returns the systemctl --user enable command for a given unit.
 * Used by the install auto-enable step to persist units across reboots.
@@ -1472,7 +1172,6 @@ export function isSendAccepted(capturedOutput: string): SendVerifyResult {
 export function registerFleetCommand(program: Command, deps: FleetCommandDeps = {}): Command {
  const runner = deps.runner ?? runCommand;
  const sleepFn = deps.sleepFn ?? defaultSleep;
  const paths = resolveFleetPaths(deps.mosaicHome);
  const frameworkRoot = deps.frameworkRoot ?? resolveFrameworkRoot();
@@ -1586,22 +1285,9 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
      .command(`${action} [agent]`)
      .description(`${action} the fleet holder or one agent`)
      .action(async (agent?: string) => {
        const commandOpts = cmd.opts<{ mosaicHome: string; roster?: string }>();
        const activePaths = resolveFleetPaths(commandOpts.mosaicHome);
        const roster = await loadRosterForCommand(cmd);
        if (agent) {
          getRosterAgent(roster, agent);
          // Single-agent restart is guarded too: it can race a full restart that
          // is tearing the shared holder down.
          if (action === 'restart') {
            const guard = await acquireRestartLock(activePaths.mosaicHome, sleepFn);
            try {
              await runChecked(runner, buildFleetServiceCommand(action, agent));
            } finally {
              await guard.release();
            }
            return;
          }
          await runChecked(runner, buildFleetServiceCommand(action, agent));
          return;
        }
@@ -1612,21 +1298,6 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
          );
          return;
        }
        if (action === 'restart') {
          // Serialize the holder+agents teardown/relaunch behind the restart lock
          // so a re-entrant restart waits for clean shutdown before relaunching,
          // instead of racing a half-torn-down holder into a tight loop.
          const guard = await acquireRestartLock(activePaths.mosaicHome, sleepFn);
          try {
            await runChecked(runner, buildFleetServiceCommand(action));
            for (const rosterAgent of roster.agents) {
              await runChecked(runner, buildFleetServiceCommand(action, rosterAgent.name));
            }
          } finally {
            await guard.release();
          }
          return;
        }
        await runChecked(runner, buildFleetServiceCommand(action));
        for (const rosterAgent of roster.agents) {
          await runChecked(runner, buildFleetServiceCommand(action, rosterAgent.name));
--- a/tools/install.sh
+++ b/tools/install.sh
@@ -16,10 +16,6 @@
 #   --framework       Install/upgrade framework only (skip npm CLI)
 #   --cli             Install/upgrade npm CLI only (skip framework)
 #   --ref <branch>    Git ref for framework archive (default: main)
 #   --dev             Build CLI + gateway FROM SOURCE at --ref instead of the
 #                     registry @latest. Zero registry writes — packs local
 #                     tarballs and installs them globally. Use to test a branch
 #                     end-to-end before cutting a release.
 #   --yes             Accept all defaults; headless/non-interactive install
 #   --no-auto-launch  Skip automatic mosaic wizard + gateway install on first install
 #   --uninstall       Reverse the install: remove framework dir, CLI package, and npmrc line
@@ -31,7 +27,6 @@
 #   MOSAIC_PREFIX       — npm global prefix          (default: ~/.npm-global)
 #   MOSAIC_NO_COLOR     — disable colour             (set to 1)
 #   MOSAIC_REF          — git ref for framework      (default: main)
 #   MOSAIC_DEV          — equivalent to --dev         (set to 1)
 #   MOSAIC_ASSUME_YES   — equivalent to --yes        (set to 1)
 # ──────────────────────────────────────────────────────────────────────────────
 #
@@ -48,7 +43,6 @@ FLAG_CLI=true
 FLAG_NO_AUTO_LAUNCH=false
 FLAG_YES=false
 FLAG_UNINSTALL=false
 FLAG_DEV=false
 GIT_REF="${MOSAIC_REF:-main}"
 # MOSAIC_ASSUME_YES env var acts the same as --yes
@@ -56,18 +50,12 @@ if [[ "${MOSAIC_ASSUME_YES:-0}" == "1" ]]; then
  FLAG_YES=true
 fi
 # MOSAIC_DEV env var acts the same as --dev
 if [[ "${MOSAIC_DEV:-0}" == "1" ]]; then
  FLAG_DEV=true
 fi
 while [[ $# -gt 0 ]]; do
  case "$1" in
    --check)          FLAG_CHECK=true; shift ;;
    --framework)      FLAG_CLI=false; shift ;;
    --cli)            FLAG_FRAMEWORK=false; shift ;;
    --ref)            GIT_REF="${2:-main}"; shift 2 ;;
    --dev)            FLAG_DEV=true; shift ;;
    --yes|-y)         FLAG_YES=true; shift ;;
    --no-auto-launch) FLAG_NO_AUTO_LAUNCH=true; shift ;;
    --uninstall)      FLAG_UNINSTALL=true; shift ;;
@@ -84,17 +72,6 @@ CLI_PKG="${SCOPE}/mosaic"
 REPO_BASE="https://git.mosaicstack.dev/mosaicstack/stack"
 ARCHIVE_URL="${REPO_BASE}/archive/${GIT_REF}.tar.gz"
 # In dev (build-from-source) mode the gateway is installed globally from a
 # locally-built tarball. Tell the wizard / gateway-config stage NOT to overwrite
 # it with the registry @latest build (honored by gatewayConfigStage).
 if [[ "$FLAG_DEV" == "true" ]]; then
  export MOSAIC_GATEWAY_SKIP_NPM_INSTALL=1
 fi
 # Shared monorepo checkout (populated on demand by ensure_monorepo).
 WORK_DIR=""
 EXTRACTED_DIR=""
 # ─── uninstall path ───────────────────────────────────────────────────────────
 # Shell-level uninstall for when the CLI is broken or not available.
 # Handles: framework directory, npm CLI package, npmrc scope line.
@@ -262,99 +239,6 @@ framework_version() {
  fi
 }
 # Download + extract the monorepo archive at $GIT_REF exactly once per run.
 # Sets the script-level EXTRACTED_DIR to the repo root. Reused by both the
 # framework install (Part 1) and the dev build-from-source path (Part 2).
 #
 # Globals read:    ARCHIVE_URL, GIT_REF, TMPDIR (optional, defaults to /tmp)
 # Globals written: WORK_DIR (fresh temp dir), EXTRACTED_DIR (first directory
 #                  found directly under WORK_DIR)
 # Side effects:    installs an EXIT trap that removes WORK_DIR.
 # Exits 1 when neither curl nor wget is available, or when no directory was
 # extracted into WORK_DIR.
 ensure_monorepo() {
  # Already extracted earlier in this run — reuse it.
  if [[ -n "$EXTRACTED_DIR" ]] && [[ -d "$EXTRACTED_DIR" ]]; then
    return 0
  fi
  require_cmd tar
  WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/mosaic-install-XXXXXX")"
  # shellcheck disable=SC2317
  cleanup_work() { [[ -n "$WORK_DIR" ]] && rm -rf "$WORK_DIR"; }
  trap cleanup_work EXIT
  info "Downloading source from ${GIT_REF}…"
  # NOTE(review): each download is piped straight into tar, so a failed
  # curl/wget is only reported by the pipeline if `pipefail` is enabled
  # elsewhere in the script (not visible here) — confirm. Otherwise the
  # EXTRACTED_DIR check below is what catches an empty download.
  if command -v curl &>/dev/null; then
    curl -fsSL "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
  elif command -v wget &>/dev/null; then
    wget -qO- "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
  else
    fail "curl or wget required to download source."
    exit 1
  fi
  # Gitea archives extract to <repo-name>/ inside the work dir
  EXTRACTED_DIR="$(find "$WORK_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)"
  if [[ -z "$EXTRACTED_DIR" ]] || [[ ! -d "$EXTRACTED_DIR" ]]; then
    fail "Could not locate extracted source in archive."
    # Show what was actually extracted to help diagnose the failure.
    ls -la "$WORK_DIR" >&2
    exit 1
  fi
 }
 # Build @mosaicstack/mosaic + @mosaicstack/gateway from source and install both
 # globally from locally-packed tarballs. ZERO registry writes. Workspace deps
 # (brain/config/db/…) are pulled from the registry at the versions pinned in
 # each package.json — `pnpm pack` rewrites `workspace:*` to those versions.
 #
 # Globals read: EXTRACTED_DIR (repo root), WORK_DIR (scratch dir), PREFIX (npm
 #               global prefix). EXTRACTED_DIR and WORK_DIR are expected to be
 #               populated already (ensure_monorepo sets both).
 # Exits 1 when pnpm cannot be made available or a tarball was not produced.
 #
 # NOTE(review): every build/install step pipes its output through `sed` for
 # indentation, so the step's own exit status is only honored if `pipefail` is
 # enabled elsewhere in the script (not visible here) — confirm. The same applies
 # to the `|| warn` fallbacks on the corepack pipelines.
 install_cli_from_source() {
  local src="$EXTRACTED_DIR"
  local out_dir="$WORK_DIR/dist-tarballs"
  mkdir -p "$out_dir"
  # pnpm via corepack (ships with Node >= 16.9; required by Node >= 20 preflight).
  # Pin to the repo's packageManager version so the build matches CI. Surface
  # corepack failures so the fresh-machine case gives an actionable error
  # instead of a bare "command not found".
  if ! command -v pnpm &>/dev/null; then
    info "Activating pnpm via corepack…"
    corepack enable 2>&1 | sed 's/^/  /' || warn "corepack enable failed — pnpm may need manual install."
    corepack prepare pnpm@10.6.2 --activate 2>&1 | sed 's/^/  /' \
      || warn "corepack prepare failed — pnpm may need manual install."
  fi
  # Re-check: corepack activation above is best-effort.
  if ! command -v pnpm &>/dev/null; then
    fail "pnpm not available after corepack activation."
    echo "  Install pnpm manually (https://pnpm.io/installation) and re-run with --dev."
    exit 1
  fi
  # Each step runs in a subshell so the `cd` does not leak into the caller.
  info "Installing workspace dependencies (pnpm install)…"
  ( cd "$src" && pnpm install ) 2>&1 | sed 's/^/  /'
  info "Building CLI + gateway from source…"
  ( cd "$src" && pnpm --filter "@mosaicstack/mosaic..." --filter "@mosaicstack/gateway..." run build ) 2>&1 | sed 's/^/  /'
  info "Packing local tarballs…"
  ( cd "$src/packages/mosaic" && pnpm pack --pack-destination "$out_dir" ) 2>&1 | sed 's/^/  /'
  ( cd "$src/apps/gateway"    && pnpm pack --pack-destination "$out_dir" ) 2>&1 | sed 's/^/  /'
  local cli_tgz gw_tgz
  # Pick the newest matching tarball for each package; empty if none was packed.
  cli_tgz="$(ls -1t "$out_dir"/mosaicstack-mosaic-*.tgz 2>/dev/null | head -1)"
  gw_tgz="$(ls -1t "$out_dir"/mosaicstack-gateway-*.tgz 2>/dev/null | head -1)"
  if [[ ! -f "$cli_tgz" ]]; then
    fail "CLI tarball was not produced by pnpm pack."
    exit 1
  fi
  if [[ ! -f "$gw_tgz" ]]; then
    fail "Gateway tarball was not produced by pnpm pack."
    exit 1
  fi
  # Gateway first so it is present globally before the CLI's wizard runs (which
  # skips its own gateway install via MOSAIC_GATEWAY_SKIP_NPM_INSTALL=1).
  info "Installing gateway from source tarball (global)…"
  npm install -g "$gw_tgz" --prefix="$PREFIX" 2>&1 | sed 's/^/  /'
  info "Installing CLI from source tarball (global)…"
  npm install -g "$cli_tgz" --prefix="$PREFIX" 2>&1 | sed 's/^/  /'
  ok "Installed from source: CLI $(installed_cli_version)"
 }
 # ─── preflight ────────────────────────────────────────────────────────────────
 require_cmd node
@@ -398,8 +282,25 @@ if [[ "$FLAG_FRAMEWORK" == "true" ]]; then
      warn "Framework not installed."
    fi
  else
-    # Download repo archive and extract framework (shared with the dev build)
+    # Download repo archive and extract framework
-    ensure_monorepo
+    require_cmd tar
    WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/mosaic-install-XXXXXX")"
    cleanup_work() { rm -rf "$WORK_DIR"; }
    trap cleanup_work EXIT
    info "Downloading framework from ${GIT_REF}…"
    if command -v curl &>/dev/null; then
      curl -fsSL "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
    elif command -v wget &>/dev/null; then
      wget -qO- "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
    else
      fail "curl or wget required to download framework."
      exit 1
    fi
    # Gitea archives extract to <repo-name>/ inside the work dir
    EXTRACTED_DIR="$(find "$WORK_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)"
    FRAMEWORK_SRC="$EXTRACTED_DIR/packages/mosaic/framework"
    if [[ ! -d "$FRAMEWORK_SRC" ]]; then
@@ -455,11 +356,7 @@ if [[ "$FLAG_CLI" == "true" ]]; then
  fi
  CURRENT="$(installed_cli_version)"
-  if [[ "$FLAG_DEV" == "true" ]]; then
+  LATEST="$(latest_cli_version)"
    LATEST=""
  else
    LATEST="$(latest_cli_version)"
  fi
  if [[ -n "$CURRENT" ]]; then
    dim "  Installed: ${CLI_PKG}@${CURRENT}"
@@ -467,9 +364,7 @@ if [[ "$FLAG_CLI" == "true" ]]; then
    dim "  Installed: (none)"
  fi
-  if [[ "$FLAG_DEV" == "true" ]]; then
+  if [[ -n "$LATEST" ]]; then
    dim "  Source:    ${REPO_BASE} (ref: ${GIT_REF}, build-from-source)"
  elif [[ -n "$LATEST" ]]; then
    dim "  Latest:    ${CLI_PKG}@${LATEST}"
  else
    dim "  Latest:    (registry unreachable)"
@@ -477,9 +372,7 @@ if [[ "$FLAG_CLI" == "true" ]]; then
  echo ""
  if [[ "$FLAG_CHECK" == "true" ]]; then
-    if [[ "$FLAG_DEV" == "true" ]]; then
+    if [[ -z "$LATEST" ]]; then
      info "Dev mode: installed version is ${CURRENT:-(none)} (no registry comparison)."
    elif [[ -z "$LATEST" ]]; then
      warn "Could not reach registry."
    elif [[ -z "$CURRENT" ]]; then
      warn "Not installed."
@@ -490,16 +383,6 @@ if [[ "$FLAG_CLI" == "true" ]]; then
    else
      ok "Up to date (or ahead of registry)."
    fi
  elif [[ "$FLAG_DEV" == "true" ]]; then
    info "Dev mode — building CLI + gateway from source at ref ${GIT_REF}…"
    ensure_monorepo
    install_cli_from_source
    # PATH check for npm prefix
    if [[ ":$PATH:" != *":$PREFIX/bin:"* ]]; then
      warn "$PREFIX/bin is not on your PATH"
      dim "  Add to your shell rc:  export PATH=\"$PREFIX/bin:\$PATH\""
    fi
  else
    if [[ -z "$LATEST" ]]; then
      warn "Could not reach registry at $REGISTRY — skipping npm CLI."