feat(#462): add federation list verb (#682)

fix(fleet): guard mosaic fleet restart against tight-loop re-entry race (#680)
feat(installer): --dev flag builds CLI + gateway from source (#681)
2026-06-25 02:15:17 +00:00 · 2026-06-25 01:44:48 +00:00 · 2026-06-24 23:54:52 +00:00
8 changed files with 1266 additions and 83 deletions
--- a/apps/gateway/src/federation/server/verbs/tests/list-query.service.spec.ts
+++ b/apps/gateway/src/federation/server/verbs/tests/list-query.service.spec.ts
@@ -1,5 +1,17 @@
-import { describe, expect, it, vi } from 'vitest';
-import type { Db } from '@mosaicstack/db';
+import { afterAll, beforeAll, describe, expect, it, vi } from 'vitest';
+import {
+  createPgliteDb,
+  insights,
+  missionTasks,
+  missions,
+  preferences,
+  projects,
+  runPgliteMigrations,
+  teams,
+  users,
+  type Db,
+  type DbHandle,
+} from '@mosaicstack/db';
 import type { FederationScopeQueryFilter } from '../../scope.service.js';
 import { FederationListQueryService } from '../list-query.service.js';

@@ -12,10 +24,192 @@ const TASK_FILTER: FederationScopeQueryFilter = {
  maxRowsPerQuery: 2,
 };

+const SUBJECT_USER_ID = 'fed-m3-05-subject';
+const OTHER_USER_ID = 'fed-m3-05-other';
+const TEAM_ID = '05000000-0000-4000-8000-000000000001';
+const UNAUTHORIZED_TEAM_ID = '05000000-0000-4000-8000-000000000002';
+const PERSONAL_PROJECT_ID = '05000000-0000-4000-8000-000000000101';
+const TEAM_PROJECT_ID = '05000000-0000-4000-8000-000000000102';
+const UNAUTHORIZED_PROJECT_ID = '05000000-0000-4000-8000-000000000103';
+const PERSONAL_MISSION_ID = '05000000-0000-4000-8000-000000000201';
+const TEAM_MISSION_ID = '05000000-0000-4000-8000-000000000202';
+const UNAUTHORIZED_MISSION_ID = '05000000-0000-4000-8000-000000000203';
+const SUBJECT_TEAM_NOTE_ID = '05000000-0000-4000-8000-000000000301';
+const OTHER_TEAM_NOTE_ID = '05000000-0000-4000-8000-000000000302';
+const SUBJECT_PERSONAL_NOTE_ID = '05000000-0000-4000-8000-000000000303';
+const SUBJECT_UNAUTHORIZED_NOTE_ID = '05000000-0000-4000-8000-000000000304';
+const INSIGHT_ONE_ID = '05000000-0000-4000-8000-000000000401';
+const INSIGHT_TWO_ID = '05000000-0000-4000-8000-000000000402';
+const PREFERENCE_ONE_ID = '05000000-0000-4000-8000-000000000501';
+const PREFERENCE_TWO_ID = '05000000-0000-4000-8000-000000000502';
+
+let dbHandle: DbHandle | undefined;
+
 function makeService() {
  return new FederationListQueryService({} as Db);
 }

+function makeDbService() { // Builds the service under test against the shared PGlite handle (unlike makeService, which passes an empty stub Db).
+  if (!dbHandle) { // dbHandle is assigned in beforeAll and reset to undefined in afterAll.
+    throw new Error('test DB not initialized');
+  }
+  return new FederationListQueryService(dbHandle.db);
+}
+
+async function seedNotesFixture() { // Seeds users, teams, projects, missions, mission-task notes, insights and preferences for the DB-backed tests.
+  if (!dbHandle) { // Must run after beforeAll has created the PGlite handle.
+    throw new Error('test DB not initialized');
+  }
+
+  await dbHandle.db.insert(users).values([ // Inserted first; later rows refer to these ids (presumably via foreign keys — confirm against the schema).
+    {
+      id: SUBJECT_USER_ID, // the user every test passes as filter.subjectUserId
+      name: 'Federation Subject',
+      email: `${SUBJECT_USER_ID}@example.test`,
+      emailVerified: false,
+    },
+    {
+      id: OTHER_USER_ID, // a second user whose notes must never be returned for the subject
+      name: 'Federation Other',
+      email: `${OTHER_USER_ID}@example.test`,
+      emailVerified: false,
+    },
+  ]);
+
+  await dbHandle.db.insert(teams).values([
+    {
+      id: TEAM_ID, // the only team id the tests pass in filter.teamIds
+      name: 'FED-M3-05 Team',
+      slug: 'fed-m3-05-team',
+      ownerId: SUBJECT_USER_ID,
+      managerId: SUBJECT_USER_ID,
+    },
+    {
+      id: UNAUTHORIZED_TEAM_ID, // never listed in filter.teamIds by any test
+      name: 'FED-M3-05 Unauthorized Team',
+      slug: 'fed-m3-05-unauthorized-team',
+      ownerId: OTHER_USER_ID,
+      managerId: OTHER_USER_ID,
+    },
+  ]);
+
+  await dbHandle.db.insert(projects).values([ // One project per visibility case: personal, team, and out-of-grant team.
+    {
+      id: PERSONAL_PROJECT_ID,
+      name: 'FED-M3-05 Personal Project',
+      ownerId: SUBJECT_USER_ID,
+      ownerType: 'user',
+    },
+    {
+      id: TEAM_PROJECT_ID,
+      name: 'FED-M3-05 Team Project',
+      teamId: TEAM_ID,
+      ownerType: 'team',
+    },
+    {
+      id: UNAUTHORIZED_PROJECT_ID,
+      name: 'FED-M3-05 Unauthorized Project',
+      teamId: UNAUTHORIZED_TEAM_ID,
+      ownerType: 'team',
+    },
+  ]);
+
+  await dbHandle.db.insert(missions).values([ // One mission per project; all three carry the subject's userId.
+    {
+      id: PERSONAL_MISSION_ID,
+      name: 'FED-M3-05 Personal Mission',
+      projectId: PERSONAL_PROJECT_ID,
+      userId: SUBJECT_USER_ID,
+    },
+    {
+      id: TEAM_MISSION_ID,
+      name: 'FED-M3-05 Team Mission',
+      projectId: TEAM_PROJECT_ID,
+      userId: SUBJECT_USER_ID,
+    },
+    {
+      id: UNAUTHORIZED_MISSION_ID, // subject-owned mission, but under the project of the unlisted team
+      name: 'FED-M3-05 Unauthorized Mission',
+      projectId: UNAUTHORIZED_PROJECT_ID,
+      userId: SUBJECT_USER_ID,
+    },
+  ]);
+
+  await dbHandle.db.insert(missionTasks).values([ // The "notes" resource rows; each one exercises a different isolation rule.
+    {
+      id: SUBJECT_TEAM_NOTE_ID, // expected in team-scoped reads: subject's own note on the listed team's mission
+      missionId: TEAM_MISSION_ID,
+      userId: SUBJECT_USER_ID,
+      notes: 'subject note on team mission',
+      createdAt: new Date('2026-06-24T03:00:00.000Z'),
+      updatedAt: new Date('2026-06-24T03:00:00.000Z'),
+    },
+    {
+      id: OTHER_TEAM_NOTE_ID, // same team mission, different author: tests assert it is never returned
+      missionId: TEAM_MISSION_ID,
+      userId: OTHER_USER_ID,
+      notes: 'other user note on team mission',
+      createdAt: new Date('2026-06-24T02:00:00.000Z'),
+      updatedAt: new Date('2026-06-24T02:00:00.000Z'),
+    },
+    {
+      id: SUBJECT_PERSONAL_NOTE_ID, // expected only when includePersonal is true
+      missionId: PERSONAL_MISSION_ID,
+      userId: SUBJECT_USER_ID,
+      notes: 'subject note on personal mission',
+      createdAt: new Date('2026-06-24T01:00:00.000Z'),
+      updatedAt: new Date('2026-06-24T01:00:00.000Z'),
+    },
+    {
+      id: SUBJECT_UNAUTHORIZED_NOTE_ID, // subject's own note, but on the out-of-grant mission: tests assert it is excluded
+      missionId: UNAUTHORIZED_MISSION_ID,
+      userId: SUBJECT_USER_ID,
+      notes: 'subject note outside grant-visible missions',
+      createdAt: new Date('2026-06-24T04:00:00.000Z'),
+      updatedAt: new Date('2026-06-24T04:00:00.000Z'),
+    },
+  ]);
+
+  const memoryCreatedAt = new Date('2026-06-24T05:00:00.000Z'); // one shared timestamp, so memory ordering is decided by the id tie-break
+  await dbHandle.db.insert(insights).values([
+    {
+      id: INSIGHT_ONE_ID,
+      userId: SUBJECT_USER_ID,
+      content: 'first insight',
+      source: 'agent',
+      createdAt: memoryCreatedAt,
+      updatedAt: memoryCreatedAt,
+    },
+    {
+      id: INSIGHT_TWO_ID,
+      userId: SUBJECT_USER_ID,
+      content: 'second insight',
+      source: 'agent',
+      createdAt: memoryCreatedAt,
+      updatedAt: memoryCreatedAt,
+    },
+  ]);
+
+  await dbHandle.db.insert(preferences).values([ // Second physical table behind the "memory" resource.
+    {
+      id: PREFERENCE_ONE_ID,
+      userId: SUBJECT_USER_ID,
+      key: 'fed-m3-05-pref-1',
+      value: { enabled: true },
+      createdAt: memoryCreatedAt,
+      updatedAt: memoryCreatedAt,
+    },
+    {
+      id: PREFERENCE_TWO_ID,
+      userId: SUBJECT_USER_ID,
+      key: 'fed-m3-05-pref-2',
+      value: { enabled: false },
+      createdAt: memoryCreatedAt,
+      updatedAt: memoryCreatedAt,
+    },
+  ]);
+}
+
 function stubRows(
  service: FederationListQueryService,
  ...pages: Array<Array<Record<string, unknown>>>
@@ -37,6 +231,17 @@ function stubRows(
 }

 describe('FederationListQueryService', () => {
+  beforeAll(async () => {
+    dbHandle = createPgliteDb(`memory://fed-m3-05-list-${Date.now()}`);
+    await runPgliteMigrations(dbHandle);
+    await seedNotesFixture();
+  });
+
+  afterAll(async () => {
+    await dbHandle?.close();
+    dbHandle = undefined;
+  });
+
  it('denies sensitive resources in native RBAC for M3 list reads', async () => {
    const service = makeService();

@@ -115,4 +320,109 @@ describe('FederationListQueryService', () => {
      'Invalid federation list cursor',
    );
  });
+
+  it('throws when a truncated page cannot encode a resumable cursor', async () => {
+    const service = makeService();
+    stubRows(service, [
+      { id: '2', createdAt: 'not-a-date' },
+      { id: '1', createdAt: 'not-a-date' },
+    ]);
+
+    await expect(service.list({ filter: { ...TASK_FILTER, limit: 1 } })).rejects.toThrow(
+      'Federation list cursor cannot be encoded',
+    );
+  });
+
+  it('throws on unsupported resources instead of crashing pagination', async () => {
+    const service = makeService();
+
+    await expect(
+      service.list({
+        filter: {
+          ...TASK_FILTER,
+          resource: 'unknown-resource' as FederationScopeQueryFilter['resource'],
+        },
+      }),
+    ).rejects.toThrow('Unsupported federation list resource');
+  });
+
+  it('does not leak another user mission task notes through team-scoped note reads', async () => {
+    const service = makeDbService();
+
+    const result = await service.list({
+      filter: {
+        resource: 'notes',
+        subjectUserId: SUBJECT_USER_ID,
+        includePersonal: false,
+        teamIds: [TEAM_ID],
+        limit: 10,
+        maxRowsPerQuery: 10,
+      },
+    });
+
+    const ids = result.items.map((item) => item['id']);
+    expect(ids).toEqual([SUBJECT_TEAM_NOTE_ID]);
+    expect(ids).not.toContain(OTHER_TEAM_NOTE_ID);
+  });
+
+  it('does not return subject personal mission task notes when includePersonal is false', async () => {
+    const service = makeDbService();
+
+    const result = await service.list({
+      filter: {
+        resource: 'notes',
+        subjectUserId: SUBJECT_USER_ID,
+        includePersonal: false,
+        teamIds: [TEAM_ID],
+        limit: 10,
+        maxRowsPerQuery: 10,
+      },
+    });
+
+    expect(result.items.map((item) => item['id'])).not.toContain(SUBJECT_PERSONAL_NOTE_ID);
+  });
+
+  it('does not return subject notes from missions outside the grant-visible project set', async () => {
+    const service = makeDbService();
+
+    const result = await service.list({
+      filter: {
+        resource: 'notes',
+        subjectUserId: SUBJECT_USER_ID,
+        includePersonal: true,
+        teamIds: [TEAM_ID],
+        limit: 10,
+        maxRowsPerQuery: 10,
+      },
+    });
+
+    const ids = result.items.map((item) => item['id']);
+    expect(ids).toContain(SUBJECT_PERSONAL_NOTE_ID);
+    expect(ids).toContain(SUBJECT_TEAM_NOTE_ID);
+    expect(ids).not.toContain(SUBJECT_UNAUTHORIZED_NOTE_ID);
+    expect(ids).not.toContain(OTHER_TEAM_NOTE_ID);
+  });
+
+  it('paginates memory deterministically across insights and preferences', async () => { // Two pages of two rows each must cover all four seeded memory rows exactly once.
+    const service = makeDbService();
+    const filter: FederationScopeQueryFilter = {
+      resource: 'memory',
+      subjectUserId: SUBJECT_USER_ID,
+      includePersonal: true, // the memory query returns [] when this is false
+      teamIds: [],
+      limit: 2, // equals the number of seeded insights, so page one ends exactly at the table boundary
+      maxRowsPerQuery: 2,
+    };
+
+    const firstPage = await service.list({ filter });
+    const secondPage = await service.list({ filter, cursor: firstPage.nextCursor }); // resume from the opaque cursor of page one
+    const firstPageIds = firstPage.items.map((item) => item['id']);
+    const secondPageIds = secondPage.items.map((item) => item['id']);
+    const allIds = [...firstPageIds, ...secondPageIds];
+
+    expect(firstPage).toMatchObject({ truncated: true, nextCursor: expect.any(String) });
+    expect(firstPageIds).toEqual([INSIGHT_TWO_ID, INSIGHT_ONE_ID]); // insights block comes first; createdAt is equal, so ids sort descending
+    expect(secondPageIds).toEqual([PREFERENCE_TWO_ID, PREFERENCE_ONE_ID]); // then preferences, with the same descending-id tie-break
+    expect(new Set(allIds).size).toBe(allIds.length); // no row appears on both pages
+  });
 });
--- a/apps/gateway/src/federation/server/verbs/tests/list.controller.spec.ts
+++ b/apps/gateway/src/federation/server/verbs/tests/list.controller.spec.ts
@@ -120,6 +120,24 @@ describe('ListController', () => {
    });
  });

+  it('returns a federation error envelope when auth guard context is missing', async () => {
+    const { controller, scope, query } = makeController();
+
+    await expect(
+      controller.list('tasks', {} as unknown as FastifyRequest, {}),
+    ).rejects.toMatchObject({
+      response: {
+        error: {
+          code: 'unauthorized',
+          message: 'Federation context missing',
+        },
+      },
+      status: 401,
+    });
+    expect(scope.evaluateAccess).not.toHaveBeenCalled();
+    expect(query.list).not.toHaveBeenCalled();
+  });
+
  it('returns a federation error envelope when scope evaluation denies access', async () => {
    const { controller, query } = makeController({
      scopeResult: {
--- a/apps/gateway/src/federation/server/verbs/list-query.service.ts
+++ b/apps/gateway/src/federation/server/verbs/list-query.service.ts
@@ -44,23 +44,29 @@ export interface FederationListQueryResult<T extends object = Record<string, unk
  readonly truncated: boolean;
 }

-type RowObject = Record<string, unknown>;
+type CursorSource = 'insights' | 'preferences';
+const CURSOR_SOURCE = Symbol('federationCursorSource');
+
+type RowObject = Record<string, unknown> & { readonly [CURSOR_SOURCE]?: CursorSource };

 interface KeysetCursor {
  readonly createdAt: Date;
  readonly id: string;
+  readonly source?: CursorSource;
 }

-function encodeCursor(row: RowObject): string | undefined {
+function encodeCursor(row: RowObject): string { // Serializes a row's keyset position (createdAt, id, optional source) as base64url JSON; now throws instead of returning undefined.
  const createdAt = row['createdAt'];
  const id = row['id'];
  if (!(createdAt instanceof Date) || typeof id !== 'string') {
-    return undefined;
+    throw new Error('Federation list cursor cannot be encoded'); // createdAt must be a Date and id a string to form a cursor
  }

-  return Buffer.from(JSON.stringify({ createdAt: createdAt.toISOString(), id }), 'utf8').toString(
-    'base64url',
-  );
+  const source = row[CURSOR_SOURCE]; // tag attached by markCursorSource; absent on rows that were never marked
+  return Buffer.from(
+    JSON.stringify({ createdAt: createdAt.toISOString(), id, ...(source ? { source } : {}) }), // source key is emitted only when the row carries a tag
+    'utf8',
+  ).toString('base64url');
 }

 function decodeCursor(cursor: string | undefined): KeysetCursor | undefined {
@@ -74,17 +80,24 @@ function decodeCursor(cursor: string | undefined): KeysetCursor | undefined {
      throw new Error('cursor must be an object');
    }

-    const { createdAt, id } = parsed as { createdAt?: unknown; id?: unknown };
+    const { createdAt, id, source } = parsed as {
+      createdAt?: unknown;
+      id?: unknown;
+      source?: unknown;
+    };
    if (typeof createdAt !== 'string' || typeof id !== 'string' || id.length === 0) {
      throw new Error('cursor is missing createdAt or id');
    }
+    if (source !== undefined && source !== 'insights' && source !== 'preferences') {
+      throw new Error('cursor source is invalid');
+    }

    const date = new Date(createdAt);
    if (Number.isNaN(date.getTime())) {
      throw new Error('cursor createdAt is invalid');
    }

-    return { createdAt: date, id };
+    return { createdAt: date, id, ...(source ? { source } : {}) };
  } catch {
    throw new Error('Invalid federation list cursor');
  }
@@ -102,6 +115,15 @@ function paginate<T extends RowObject>(rows: T[], limit: number): FederationList
  };
 }

+function markCursorSource<T extends RowObject>(row: T, source: CursorSource): T { // Tags a row in place with the table it came from and returns the same object.
+  Object.defineProperty(row, CURSOR_SOURCE, { // symbol key, so the tag cannot collide with a string-named column
+    value: source,
+    enumerable: false, // keeps the tag out of object spreads and Object.keys/entries
+    configurable: false, // writable is left at its default (false), so the tag cannot be changed once set
+  });
+  return row;
+}
+
 function sortRows(rows: RowObject[]): RowObject[] {
  return [...rows].sort((a, b) => {
    const aTime = a['createdAt'] instanceof Date ? a['createdAt'].getTime() : 0;
@@ -159,6 +181,8 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
      case 'credentials':
      case 'api_keys':
        return [];
+      default:
+        throw new Error(`Unsupported federation list resource: ${String(filter.resource)}`);
    }
  }

@@ -266,20 +290,18 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
  ): Promise<RowObject[]> {
    const projectIds = await this.listAccessibleProjectIds(filter);
    const missionIds = await this.listMissionIds(projectIds);
-    const clauses = [];

-    if (filter.includePersonal) {
-      clauses.push(eq(missionTasks.userId, filter.subjectUserId));
-    }
-    if (missionIds.length > 0) {
-      clauses.push(inArray(missionTasks.missionId, missionIds));
-    }
-
-    if (clauses.length === 0) {
+    if (missionIds.length === 0) {
      return [];
    }

-    const scopeClause = clauses.length === 1 ? clauses[0] : or(...clauses);
+    // mission_tasks rows are user-scoped even when the mission belongs to a team.
+    // Team visibility can narrow the mission set, but it must never widen the
+    // query to other users' mission task notes.
+    const scopeClause = and(
+      eq(missionTasks.userId, filter.subjectUserId),
+      inArray(missionTasks.missionId, missionIds),
+    );
    const cursorClause = cursor
      ? or(
          lt(missionTasks.createdAt, cursor.createdAt),
@@ -313,22 +335,25 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
    if (!filter.includePersonal) {
      return [];
    }
+    if (cursor && cursor.source === undefined) {
+      throw new Error('Invalid federation list cursor');
+    }

+    const rows: RowObject[] = [];
+
+    // Memory spans two physical tables. To keep pagination deterministic and
+    // resumable without a SQL UNION, M3 emits a fixed block order: all insights
+    // first, then preferences. The opaque cursor records which table produced
+    // the boundary row, so the next page never re-applies one table's keyset to
+    // the other table (which could duplicate/skip rows at equal timestamps).
+    if (cursor?.source !== 'preferences') {
      const insightCursorClause = cursor
        ? or(
            lt(insights.createdAt, cursor.createdAt),
            and(eq(insights.createdAt, cursor.createdAt), lt(insights.id, cursor.id)),
          )
        : undefined;
-    const preferenceCursorClause = cursor
-      ? or(
-          lt(preferences.createdAt, cursor.createdAt),
-          and(eq(preferences.createdAt, cursor.createdAt), lt(preferences.id, cursor.id)),
-        )
-      : undefined;
-
-    const [insightRows, preferenceRows] = await Promise.all([
-      this.db
+      const insightRows = await this.db
        .select({
          id: insights.id,
          kind: insights.source,
@@ -342,8 +367,24 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
        .from(insights)
        .where(and(eq(insights.userId, filter.subjectUserId), insightCursorClause))
        .orderBy(desc(insights.createdAt), desc(insights.id))
-        .limit(rowLimit),
-      this.db
+        .limit(rowLimit);
+
+      rows.push(...(insightRows as RowObject[]).map((row) => markCursorSource(row, 'insights')));
+    }
+
+    const remaining = rowLimit - rows.length;
+    if (remaining <= 0) {
+      return rows;
+    }
+
+    const preferenceCursorClause =
+      cursor?.source === 'preferences'
+        ? or(
+            lt(preferences.createdAt, cursor.createdAt),
+            and(eq(preferences.createdAt, cursor.createdAt), lt(preferences.id, cursor.id)),
+          )
+        : undefined;
+    const preferenceRows = await this.db
      .select({
        id: preferences.id,
        kind: preferences.category,
@@ -357,9 +398,11 @@ export class FederationListQueryService implements FederationNativeRbacEvaluator
      .from(preferences)
      .where(and(eq(preferences.userId, filter.subjectUserId), preferenceCursorClause))
      .orderBy(desc(preferences.createdAt), desc(preferences.id))
-        .limit(rowLimit),
-    ]);
+      .limit(remaining);

-    return sortRows([...(insightRows as RowObject[]), ...(preferenceRows as RowObject[])]);
+    rows.push(
+      ...(preferenceRows as RowObject[]).map((row) => markCursorSource(row, 'preferences')),
+    );
+    return rows;
  }
 }
--- a/apps/gateway/src/federation/server/verbs/list.controller.ts
+++ b/apps/gateway/src/federation/server/verbs/list.controller.ts
@@ -24,6 +24,7 @@ import type { FastifyRequest } from 'fastify';
 import {
  FederationInvalidRequestError,
  FederationScopeViolationError,
+  FederationUnauthorizedError,
  SOURCE_LOCAL,
  tagWithSource,
  type FederationListResponse,
@@ -93,7 +94,10 @@ export class ListController {
    @Body() body?: FederationListRequestBody,
  ): Promise<FederationListResponse<FederatedRow>> {
    if (!request.federationContext) {
-      throw new Error('Federation context missing after auth guard');
+      throw new HttpException(
+        new FederationUnauthorizedError('Federation context missing').toEnvelope(),
+        401,
+      );
    }

    const requestedLimit = parseLimit(body);
--- a/docs/scratchpads/FED-M3-05-list-verb.md
+++ b/docs/scratchpads/FED-M3-05-list-verb.md
@@ -15,8 +15,7 @@ Implement `POST /api/federation/v1/list/:resource`.
 ## Base / branch

 - Branch: `feat/federation-m3-verb-list`
- Base: `feat/federation-m3-scope-service` (PR #672), per orchestrator, because M3-04 is not merged yet.
- Rebase target after #672 merges: `main`.
+- Base: `main` after M3-04 scope service merged via PR #672 (`c739256a`).

 ## Implementation notes

@@ -28,10 +27,13 @@ Implement `POST /api/federation/v1/list/:resource`.
  - `memory`: user-owned `insights` and `preferences` rows.
  - `credentials` / `api_keys`: denied by native RBAC in M3 even if present in scope; sensitive-resource implementation is not part of FED-M3-05.
 - Cursor pagination uses an opaque base64url keyset cursor over `(createdAt, id)`; DB reads fetch at most `limit + 1` rows per resource query.
+- Reviewer isolation fix: `mission_tasks.notes` rows are always constrained by `missionTasks.userId = subjectUserId` and accessible mission IDs; team scope narrows missions but never widens to other users' mission task notes.
+- Follow-up review fix: memory listing now uses deterministic table-block pagination (`insights` first, then `preferences`) with cursor source metadata, so one table's cursor is never applied to the other.
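+- Follow-up hardening: a missing auth-guard context now returns a structured federation `unauthorized` envelope (HTTP 401); unsupported resources throw an explicit error instead of crashing later in pagination, and a truncated page whose cursor cannot be encoded throws instead of silently returning a page with no resumable cursor.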

 ## Tests

- `pnpm --filter @mosaicstack/gateway test -- list.controller.spec.ts list-query.service.spec.ts` — PASS (9 tests).
+- `pnpm --filter @mosaicstack/gateway test -- list.controller.spec.ts list-query.service.spec.ts` — PASS (16 tests, including PGlite regression coverage for team-scoped notes isolation, unauthorized mission notes exclusion, `includePersonal: false`, deterministic memory pagination, missing context envelope, unsupported resource, and cursor encode failure).
 - `pnpm --filter @mosaicstack/gateway typecheck` — PASS.
 - `pnpm --filter @mosaicstack/gateway lint` — PASS.
 - `pnpm format:check` — PASS.
@@ -41,11 +43,10 @@ Implement `POST /api/federation/v1/list/:resource`.

 ## Review evidence

- `~/.config/mosaic/tools/codex/codex-code-review.sh --uncommitted` — PASS after remediation; approve, no findings.
- `~/.config/mosaic/tools/codex/codex-security-review.sh --uncommitted` — PASS after cursor remediation; risk level none, no findings.
+- `~/.config/mosaic/tools/codex/codex-code-review.sh --uncommitted` — PASS after follow-up remediation; approve, no findings.
+- `~/.config/mosaic/tools/codex/codex-security-review.sh --uncommitted` — PASS after follow-up remediation; risk level none, no findings.
 - Security-review note: read-path audit logging remains intentionally deferred to M4 per orchestrator clarification and FED-M3-05 scope.

 ## Risks / follow-up

- This branch intentionally includes M3-04 diff until PR #672 lands; final PR must be rebased onto main after #672 merges.
- Current branch base predates the M3-07 capabilities module registration; expect a small `FederationModule` rebase conflict once #672 and #674 are both on main.
+- Read-path audit logging remains intentionally deferred to M4.
--- a/packages/mosaic/src/commands/fleet.spec.ts
+++ b/packages/mosaic/src/commands/fleet.spec.ts
@@ -4,6 +4,7 @@ import { dirname, join, resolve } from 'node:path';
 import { Command } from 'commander';
 import { afterEach, describe, expect, it, vi } from 'vitest';
 import {
+  acquireRestartLock,
  addAgentToRoster,
  buildAgentSendCommand,
  buildAgentWatchAttachCommand,
@@ -45,6 +46,8 @@ import {
  removeAgentFromRoster,
  resolveFleetPaths,
  resolvePresetFilename,
+  restartLockPath,
+  RESTART_LOCK_STALE_MS,
  RUNTIME_ACCEPTABLE_COMMANDS,
  serializeRosterToYaml,
  VERIFY_DEFAULT_TIMEOUT_MS,
@@ -678,6 +681,364 @@ describe('fleet command construction', () => {
    }
  });

+  it('waits for an in-flight restart to clear before relaunching (re-entry guard)', async () => {
+    const home = await tempDir();
+    const rosterPath = join(home, 'fleet', 'roster.yaml');
+    await mkdir(join(home, 'fleet'), { recursive: true });
+    await writeFile(
+      rosterPath,
+      ['version: 1', 'transport: tmux', 'agents:', '  - name: coder0', '    runtime: codex'].join(
+        '\n',
+      ),
+    );
+
+    // Simulate another `mosaic fleet restart` process mid-teardown: a fresh lock
+    // (recent timestamp, so it is NOT treated as stale) already held.
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+    await writeFile(lockPath, `4242\n${Date.now()}\n`);
+
+    const events: string[] = [];
+    const runner: CommandRunner = async (command, args) => {
+      events.push(`run:${args[args.length - 1]}`);
+      return { stdout: '', stderr: '', exitCode: 0 };
+    };
+    // The injected sleep stands in for time passing while we wait; the in-flight
+    // restart "finishes" (releases its lock) after the first poll.
+    let sleeps = 0;
+    const sleepFn: SleepFn = async () => {
+      sleeps += 1;
+      events.push(`sleep:${sleeps}`);
+      await rm(lockPath, { force: true });
+    };
+
+    const program = new Command();
+    program.exitOverride();
+    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });
+
+    try {
+      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);
+
+      // It must have waited at least once before issuing any systemctl restart.
+      expect(sleeps).toBeGreaterThan(0);
+      const firstSleep = events.findIndex((e) => e.startsWith('sleep:'));
+      const firstRun = events.findIndex((e) => e.startsWith('run:'));
+      expect(firstSleep).toBeGreaterThanOrEqual(0);
+      expect(firstRun).toBeGreaterThan(firstSleep);
+
+      // And it still performs the full restart once the lock clears.
+      expect(events).toContain('run:mosaic-tmux-holder.service');
+      expect(events).toContain('run:mosaic-agent@coder0.service');
+
+      // The lock is released after the restart completes.
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('breaks a stale restart lock and proceeds without waiting', async () => {
+    const home = await tempDir();
+    const rosterPath = join(home, 'fleet', 'roster.yaml');
+    await mkdir(join(home, 'fleet'), { recursive: true });
+    await writeFile(
+      rosterPath,
+      ['version: 1', 'transport: tmux', 'agents:', '  - name: coder0', '    runtime: codex'].join(
+        '\n',
+      ),
+    );
+
+    // A lock left behind by a crashed owner: timestamp older than the stale window.
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+    await writeFile(lockPath, `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n`);
+
+    const calls: string[][] = [];
+    const runner: CommandRunner = async (command, args) => {
+      calls.push([command, ...args]);
+      return { stdout: '', stderr: '', exitCode: 0 };
+    };
+    const sleepFn = vi.fn<SleepFn>(async () => {});
+
+    const program = new Command();
+    program.exitOverride();
+    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });
+
+    try {
+      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);
+
+      // Stale lock is broken immediately — no waiting.
+      expect(sleepFn).not.toHaveBeenCalled();
+      expect(calls).toEqual([
+        ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
+        ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
+      ]);
+      // The stale lock is gone once the restart completes.
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('releases the restart lock so a subsequent restart is not blocked', async () => {
+    const home = await tempDir();
+    const rosterPath = join(home, 'fleet', 'roster.yaml');
+    await mkdir(join(home, 'fleet'), { recursive: true });
+    await writeFile(
+      rosterPath,
+      ['version: 1', 'transport: tmux', 'agents:', '  - name: coder0', '    runtime: codex'].join(
+        '\n',
+      ),
+    );
+
+    const calls: string[][] = [];
+    const runner: CommandRunner = async (command, args) => {
+      calls.push([command, ...args]);
+      return { stdout: '', stderr: '', exitCode: 0 };
+    };
+    const sleepFn = vi.fn<SleepFn>(async () => {});
+
+    const program = new Command();
+    program.exitOverride();
+    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });
+
+    try {
+      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);
+      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart']);
+
+      // Two sequential restarts both run fully and neither has to wait.
+      expect(sleepFn).not.toHaveBeenCalled();
+      expect(calls).toEqual([
+        ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
+        ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
+        ['systemctl', '--user', 'restart', 'mosaic-tmux-holder.service'],
+        ['systemctl', '--user', 'restart', 'mosaic-agent@coder0.service'],
+      ]);
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('guards the single-agent restart path behind the in-flight restart lock', async () => {
+    const home = await tempDir();
+    const rosterPath = join(home, 'fleet', 'roster.yaml');
+    await mkdir(join(home, 'fleet'), { recursive: true });
+    await writeFile(
+      rosterPath,
+      ['version: 1', 'transport: tmux', 'agents:', '  - name: coder0', '    runtime: codex'].join(
+        '\n',
+      ),
+    );
+
+    // A full restart is mid-flight (lock held); a single-agent restart re-enters.
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+    await writeFile(lockPath, `4242\n${Date.now()}\n`);
+
+    const events: string[] = [];
+    const runner: CommandRunner = async (command, args) => {
+      events.push(`run:${args[args.length - 1]}`);
+      return { stdout: '', stderr: '', exitCode: 0 };
+    };
+    let sleeps = 0;
+    const sleepFn: SleepFn = async () => {
+      sleeps += 1;
+      events.push(`sleep:${sleeps}`);
+      await rm(lockPath, { force: true });
+    };
+
+    const program = new Command();
+    program.exitOverride();
+    registerFleetCommand(program, { runner, sleepFn, mosaicHome: home });
+
+    try {
+      await program.parseAsync(['node', 'mosaic', 'fleet', 'restart', 'coder0']);
+
+      // The single-agent restart waits for the in-flight restart before acting.
+      expect(sleeps).toBeGreaterThan(0);
+      const firstSleep = events.findIndex((e) => e.startsWith('sleep:'));
+      const firstRun = events.findIndex((e) => e.startsWith('run:'));
+      expect(firstSleep).toBeGreaterThanOrEqual(0);
+      expect(firstRun).toBeGreaterThan(firstSleep);
+      // Only the named agent is restarted; the holder is untouched.
+      expect(events).toContain('run:mosaic-agent@coder0.service');
+      expect(events).not.toContain('run:mosaic-tmux-holder.service');
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('does not let a timed-out owner drop a lock another restart broke and re-owned', async () => {
+    const home = await tempDir();
+    // Run the scenario inside try/finally so the temp home is removed even when
+    // an assertion fails, matching the cleanup used by the other restart tests.
+    try {
+      const runDir = join(home, 'fleet', 'run');
+      await mkdir(runDir, { recursive: true });
+      const lockPath = restartLockPath(home);
+      // Reads the owner token, which is stored on the third line of the lock file.
+      const tokenOf = async (): Promise<string> => {
+        const raw = await readFile(lockPath, 'utf8');
+        return raw.split('\n')[2]?.trim() ?? '';
+      };
+      const sleepFn = vi.fn<SleepFn>(async () => {});
+
+      // R1 acquires the lock and begins a restart that then hangs.
+      const r1 = await acquireRestartLock(home, sleepFn);
+      const tokenR1 = await tokenOf();
+      expect(tokenR1).not.toBe('');
+
+      // The hung R1 leaves a stale lock: rewrite its timestamp into the past while
+      // preserving R1's token — exactly the on-disk state a stuck owner leaves.
+      await writeFile(
+        lockPath,
+        `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\n${tokenR1}\n`,
+      );
+
+      // R2 re-enters, sees the stale lock, and atomically takes ownership.
+      const r2 = await acquireRestartLock(home, sleepFn);
+      const tokenR2 = await tokenOf();
+      expect(tokenR2).not.toBe(tokenR1);
+      expect(sleepFn).not.toHaveBeenCalled();
+
+      // R1 finally finishes and releases. It must NOT delete R2's lock — otherwise
+      // a third restart (R3) could acquire and interleave with R2 still running.
+      await r1.release();
+      expect(await tokenOf()).toBe(tokenR2);
+
+      // R2 releases cleanly and the lock is gone.
+      await r2.release();
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('lets only one of several concurrent breakers proceed past a stale lock', async () => {
+    const home = await tempDir();
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+
+    // A stale lock left by a crashed owner: every concurrent re-entrant restart
+    // will judge it stale and try to break it at the same instant. Breaking must
+    // NOT grant ownership — only the atomic re-create may — so exactly one
+    // contender can ever hold the lock at a time. (The v2 fix wrote our own token
+    // during the break and read it back, so two breakers each saw their own token
+    // and BOTH proceeded; this guards that regression.)
+    await writeFile(
+      lockPath,
+      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
+    );
+
+    // Yielding sleep so a waiting contender lets the current owner finish and
+    // release before it re-contends, instead of spinning the microtask queue.
+    const sleepFn: SleepFn = async () => {
+      await new Promise((res) => setTimeout(res, 0));
+    };
+
+    // `active` counts contenders currently inside the critical section and
+    // `maxActive` records its peak; `tokens` collects the owner token each
+    // contender read from the lock file while it held the lock.
+    let active = 0;
+    let maxActive = 0;
+    const tokens: string[] = [];
+    // The owner token is the third newline-separated field of the lock file.
+    const tokenOf = async (): Promise<string> => {
+      const raw = await readFile(lockPath, 'utf8');
+      return raw.split('\n')[2]?.trim() ?? '';
+    };
+
+    // One "restart" = acquire the lock, do work in the critical section, release.
+    const restartOnce = async (): Promise<void> => {
+      const guard = await acquireRestartLock(home, sleepFn);
+      active += 1;
+      maxActive = Math.max(maxActive, active);
+      // Record the token we own while we hold it, then yield to interleave with
+      // any other contender that might (wrongly) believe it owns the lock too.
+      tokens.push(await tokenOf());
+      await new Promise((res) => setTimeout(res, 0));
+      active -= 1;
+      await guard.release();
+    };
+
+    try {
+      // Three breakers race the single stale lock simultaneously.
+      await Promise.all([restartOnce(), restartOnce(), restartOnce()]);
+
+      // Mutual exclusion held: never two owners at once despite concurrent breaks.
+      expect(maxActive).toBe(1);
+      // Each acquire owned with its own distinct token — no two ever shared it.
+      expect(new Set(tokens).size).toBe(3);
+      // The lock is fully released at the end.
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
+  it('lets exactly one of two breakers take over a stale lock while the other waits', async () => {
+    const home = await tempDir();
+    const lockPath = restartLockPath(home);
+    await mkdir(dirname(lockPath), { recursive: true });
+
+    // A single stale lock both contenders will judge stale at the same instant.
+    // Every transition runs under the registry mutex, so only one may take the
+    // lock over; the other must observe a now-fresh owner and WAIT/re-evaluate
+    // rather than also taking over. (A content-blind clobber let both believe
+    // they owned it — this asserts the mutex-gated CAS takeover instead.)
+    await writeFile(
+      lockPath,
+      `4242\n${Date.now() - RESTART_LOCK_STALE_MS - 1_000}\nstale-owner-token\n`,
+    );
+
+    // Barrier: the winner keeps holding the lock until the loser has observed it
+    // fresh and waited at least once — forcing the exact interleaving where one
+    // proceeds while the other waits, deterministically rather than by timing.
+    let resolveLoserWaited: () => void = () => {};
+    const loserWaited = new Promise<void>((res) => {
+      resolveLoserWaited = res;
+    });
+    let sleeps = 0;
+    const sleepFn: SleepFn = async () => {
+      sleeps += 1;
+      resolveLoserWaited();
+      await new Promise((res) => setTimeout(res, 0));
+    };
+
+    let active = 0;
+    let maxActive = 0;
+    const tokens: string[] = [];
+    const tokenOf = async (): Promise<string> => {
+      const raw = await readFile(lockPath, 'utf8');
+      return raw.split('\n')[2]?.trim() ?? '';
+    };
+
+    let firstOwner = true;
+    const restartOnce = async (): Promise<void> => {
+      const guard = await acquireRestartLock(home, sleepFn);
+      active += 1;
+      maxActive = Math.max(maxActive, active);
+      tokens.push(await tokenOf());
+      if (firstOwner) {
+        // Winner: keep holding the lock until the loser has waited once, so the
+        // loser is guaranteed to see a FRESH owner (not the stale one) and back
+        // off — proving it could not also take over.
+        // NOTE(review): this wait is unbounded — if a regression let both callers
+        // acquire without sleeping, the test would presumably time out here
+        // instead of failing the maxActive assertion; confirm that is acceptable.
+        firstOwner = false;
+        await loserWaited;
+      } else {
+        await new Promise((res) => setTimeout(res, 0));
+      }
+      active -= 1;
+      await guard.release();
+    };
+
+    try {
+      // Exactly two breakers race the single stale lock.
+      await Promise.all([restartOnce(), restartOnce()]);
+
+      // Mutual exclusion: never two owners at once (if both took over the stale
+      // lock, this would be 2).
+      expect(maxActive).toBe(1);
+      // Both eventually owned, each with its own distinct token.
+      expect(new Set(tokens).size).toBe(2);
+      // The loser observed the winner's fresh lock and waited — it did NOT also
+      // take over the stale lock.
+      expect(sleeps).toBeGreaterThanOrEqual(1);
+      // The lock is fully released at the end.
+      await expect(readFile(lockPath, 'utf8')).rejects.toMatchObject({ code: 'ENOENT' });
+    } finally {
+      await rm(home, { recursive: true, force: true });
+    }
+  });
+
  it('attempts every agent and the holder during fleet stop even when an agent stop fails', async () => {
    const home = await tempDir();
    const rosterPath = join(home, 'fleet', 'roster.yaml');
--- a/packages/mosaic/src/commands/fleet.ts
+++ b/packages/mosaic/src/commands/fleet.ts
@@ -1,5 +1,16 @@
 import { constants } from 'node:fs';
-import { access, chmod, copyFile, mkdir, readFile, unlink, writeFile } from 'node:fs/promises';
+import {
+  access,
+  chmod,
+  copyFile,
+  mkdir,
+  open,
+  readFile,
+  stat,
+  unlink,
+  writeFile,
+} from 'node:fs/promises';
+import { randomUUID } from 'node:crypto';
 import { homedir, hostname, userInfo } from 'node:os';
 import { dirname, join, resolve } from 'node:path';
 import { fileURLToPath } from 'node:url';
@@ -533,6 +544,295 @@ export function buildFleetServiceCommand(action: FleetServiceAction, agentName?:
  return ['systemctl', '--user', action, service];
 }

+/** Poll interval (ms) while waiting for an in-flight restart's lock to clear. */
+export const RESTART_LOCK_POLL_INTERVAL_MS = 250;
+/**
+ * Maximum time (ms) a re-entrant restart waits for the in-flight restart to
+ * finish before it breaks the lock and proceeds anyway. A bound is required so
+ * a crashed holder of the lock can never deadlock the fleet permanently.
+ * The wait is measured from the start of the acquire call.
+ */
+export const RESTART_LOCK_MAX_WAIT_MS = 30_000;
+/**
+ * Age (ms) past which a restart lock is treated as stale (its owner died
+ * without releasing it) and is broken immediately rather than waited on.
+ * The restart lock's age is measured from the timestamp stored in the lock
+ * file; the internal mutex and reclaim locks reuse this threshold against
+ * their file mtime instead.
+ */
+export const RESTART_LOCK_STALE_MS = 60_000;
+
+/**
+ * Resolves the path of the cross-process restart lock for a given Mosaic home.
+ * Kept strictly under `<mosaicHome>/fleet/run` (not the heartbeat env override)
+ * so the lock is scoped to the same fleet the restart acts on.
+ *
+ * @param mosaicHome - Mosaic home directory the lock is scoped to.
+ * @returns `<mosaicHome>/fleet/run/restart.lock`. Pure path computation: neither
+ *   the file nor its parent directories are created here.
+ */
+export function restartLockPath(mosaicHome: string): string {
+  return join(mosaicHome, 'fleet', 'run', 'restart.lock');
+}
+
+/**
+ * A held restart lock; `release()` removes the lock file iff we still own it.
+ * Returned by `acquireRestartLock`; the restart command handlers in this file
+ * call `release()` from a `finally` block so the lock is dropped even when the
+ * restart throws.
+ */
+interface RestartGuard {
+  release(): Promise<void>;
+}
+
+/**
+ * Lock-file contents: pid (informational), timestamp, and a unique owner token.
+ * The three fields are written one per line, each newline-terminated; the
+ * timestamp is `Date.now()` (epoch milliseconds) at the moment of formatting.
+ * Used for both the restart lock and the registry mutex.
+ *
+ * @param token - Owner token to embed as the third line.
+ * @returns The text to write into the lock or mutex file.
+ */
+function formatRestartLockContent(token: string): string {
+  return `${process.pid}\n${Date.now()}\n${token}\n`;
+}
+
+/**
+ * Reads the owner token (line 3) from a lock file, or null if the file is
+ * missing/unreadable/tokenless. The token is what makes release and break
+ * ownership-safe: a process only ever acts on a lock whose token matches its own.
+ *
+ * @param lockPath - Path of the lock (or mutex) file to inspect.
+ * @returns The trimmed third line, or null when the read fails for any reason
+ *   or the third line is absent or empty after trimming.
+ */
+async function readRestartLockToken(lockPath: string): Promise<string | null> {
+  let raw: string;
+  try {
+    raw = await readFile(lockPath, 'utf8');
+  } catch {
+    return null;
+  }
+  const token = raw.split('\n')[2]?.trim();
+  return token ? token : null;
+}
+
+/**
+ * Decides whether lock-file text describes a stale lock. The second line is
+ * parsed as a base-10 epoch-millisecond timestamp; the lock is stale when that
+ * timestamp cannot be parsed (a corrupt or partially written lock left by a
+ * crashed owner) or is at least RESTART_LOCK_STALE_MS older than `now`.
+ *
+ * @param raw - Full text of the lock file.
+ * @param now - Current time in epoch milliseconds.
+ * @returns True when the lock should be treated as stale.
+ */
+function isRestartLockContentStale(raw: string, now: number): boolean {
+  const [, stampLine = ''] = raw.split('\n');
+  const writtenAt = Number.parseInt(stampLine.trim(), 10);
+  // An unparseable stamp is NaN, which is never finite, so it short-circuits to stale.
+  return !Number.isFinite(writtenAt) || now - writtenAt >= RESTART_LOCK_STALE_MS;
+}
+
+/**
+ * Path of the short-lived registry mutex that guards EVERY transition of the
+ * restart lock (acquire, release, takeover). Held only across a few filesystem
+ * ops — never across the restart itself — so contention clears in microseconds.
+ *
+ * @param lockPath - Path of the restart lock the mutex protects.
+ * @returns `lockPath` with a `.mutex` suffix appended.
+ */
+function restartMutexPath(lockPath: string): string {
+  return `${lockPath}.mutex`;
+}
+
+/**
+ * Brief back-off (ms) between registry-mutex acquisition attempts. Kept short
+ * because the mutex is only held across a few filesystem ops (held microseconds).
+ */
+const RESTART_MUTEX_RETRY_MS = 20;
+
+/**
+ * mtime-based staleness check used for the internal mutex and reclaim locks.
+ *
+ * Content cannot be trusted for these files: `open(path, 'wx')` creates the
+ * inode (with a fresh mtime) before any token/timestamp is written into it, so
+ * a content check would briefly see the empty file as corrupt-and-stale and
+ * could reap a lock another contender is still acquiring. The mtime is set at
+ * creation, so a just-created lock always reads as live. These locks are held
+ * for only a couple of filesystem ops and never across the restart itself, so
+ * an mtime at least RESTART_LOCK_STALE_MS old can belong only to a dead holder.
+ *
+ * @param path - Lock file to inspect.
+ * @param now - Current time in epoch milliseconds.
+ * @returns True only when the file could be stat'ed and its mtime is at least
+ *   RESTART_LOCK_STALE_MS in the past.
+ */
+async function isRestartLockPathStale(path: string, now: number): Promise<boolean> {
+  let mtimeMs: number;
+  try {
+    ({ mtimeMs } = await stat(path));
+  } catch {
+    // ENOENT means the lock is gone (the caller will re-contend); any other stat
+    // failure is treated as a live lock so we back off rather than reap. Either
+    // way the path is reported as not stale.
+    return false;
+  }
+  return now - mtimeMs >= RESTART_LOCK_STALE_MS;
+}
+
+/**
+ * Path of the reclaim lock that serializes reaping of a crashed-holder mutex.
+ *
+ * @param mutexPath - Path of the registry mutex being reclaimed.
+ * @returns `mutexPath` with a `.reclaim` suffix appended.
+ */
+function restartReclaimPath(mutexPath: string): string {
+  return `${mutexPath}.reclaim`;
+}
+
+/**
+ * Reap a registry mutex left behind by a process that CRASHED mid-transition —
+ * one whose file has aged past RESTART_LOCK_STALE_MS. Because the mutex is held
+ * only for a couple of filesystem ops (no sleeps, never across the restart), a
+ * mutex this old can only belong to a dead holder.
+ *
+ * The reap removes the dead mutex but never CREATES/holds it — acquisition stays
+ * the single `open('wx')` create in {@link acquireRestartMutex}, so exactly one
+ * contender wins ownership no matter how the reap and acquires interleave. The
+ * removal is made conditional by a dedicated reclaim lock: while it is held the
+ * dead mutex is stable (its dead holder will never touch it, and no other
+ * reclaimer can race), so re-reading it and removing it only if it is STILL stale
+ * is a true compare — a live holder's fresh mutex is never removed. This closes
+ * the reclaim race a content-blind rename-and-restore left open (a third
+ * contender slipping into the gap while a fresh mutex was moved aside).
+ *
+ * @param mutexPath - Path of the registry mutex to reap if it is still stale.
+ * @returns Resolves once this pass is finished; the mutex may or may not have
+ *   been removed (the caller simply re-contends).
+ * @throws Any error from creating the reclaim lock other than EEXIST, or a
+ *   failure closing the reclaim lock's handle.
+ */
+async function reclaimStaleRestartMutex(mutexPath: string): Promise<void> {
+  const reclaimPath = restartReclaimPath(mutexPath);
+  let handle: Awaited<ReturnType<typeof open>>;
+  try {
+    handle = await open(reclaimPath, 'wx');
+  } catch (err) {
+    if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
+      throw err;
+    }
+    // Someone is already reclaiming. If their reclaim lock is itself stale by
+    // mtime, its holder crashed mid-reap (the lock spans only a stat + unlink,
+    // microseconds) — clear it so a later pass can retry. Otherwise a live
+    // reclaimer has it; back off. Either way we do not reap the mutex this pass.
+    if (await isRestartLockPathStale(reclaimPath, Date.now())) {
+      await unlink(reclaimPath).catch(() => {});
+    }
+    return;
+  }
+  try {
+    // Re-check the mutex UNDER the reclaim lock and remove it only if it is STILL
+    // stale by mtime. A live holder's mutex is fresh and is left untouched; a dead
+    // holder's mutex is stable here (its holder is gone and no other reclaimer can
+    // race us), so this re-check is authoritative.
+    if (await isRestartLockPathStale(mutexPath, Date.now())) {
+      await unlink(mutexPath).catch(() => {});
+    }
+  } finally {
+    // Always drop the reclaim lock, even if closing the handle rejects —
+    // otherwise a failed close would strand the reclaim lock until it ages past
+    // the stale threshold and a later contender clears it.
+    try {
+      await handle.close();
+    } finally {
+      await unlink(reclaimPath).catch(() => {});
+    }
+  }
+}
+
+/**
+ * Acquire the registry mutex, BLOCKING (with brief back-offs) until held, and
+ * return a token-gated release. This is the single point of mutual exclusion for
+ * the restart lock: acquire, release, and stale/timeout takeover all run under it,
+ * so "read the lock, then mutate it" is atomic — no acquirer, releaser, or breaker
+ * can ever interleave with another. A mutex left by a crashed holder is reclaimed
+ * once it ages past the stale threshold.
+ *
+ * Every failed attempt — whether a live holder has the mutex or a stale one was
+ * just handed to the reclaimer — is followed by a RESTART_MUTEX_RETRY_MS pause,
+ * so a mutex that cannot be reaped this pass (another reclaimer is mid-reap, or
+ * the unlink keeps failing) is retried at a bounded rate instead of a busy loop.
+ *
+ * @param mutexPath - Path of the mutex file to create exclusively.
+ * @param token - Owner token written into the mutex and checked on release.
+ * @returns A release function that unlinks the mutex only if it still holds `token`.
+ * @throws Any `open` error other than EEXIST, or a failure writing the token.
+ */
+async function acquireRestartMutex(
+  mutexPath: string,
+  token: string,
+): Promise<RestartGuard['release']> {
+  // The executor parameter is named `done` so it does not shadow the `resolve`
+  // imported from node:path at the top of this file.
+  const backOff = (): Promise<void> =>
+    new Promise<void>((done) => setTimeout(done, RESTART_MUTEX_RETRY_MS));
+  for (;;) {
+    let handle: Awaited<ReturnType<typeof open>>;
+    try {
+      handle = await open(mutexPath, 'wx');
+    } catch (err) {
+      if ((err as NodeJS.ErrnoException).code !== 'EEXIST') {
+        throw err;
+      }
+      // Staleness is judged by mtime, not content, so a mutex that exists but has
+      // not yet had its token written (the open-before-write window) reads as live
+      // and is never wrongly reaped. A live holder will be gone in microseconds;
+      // only a stale mutex is handed to the reclaimer.
+      if (await isRestartLockPathStale(mutexPath, Date.now())) {
+        await reclaimStaleRestartMutex(mutexPath);
+      }
+      // Pause before re-contending in BOTH cases: the reap is best-effort and may
+      // not have removed the mutex, and retrying immediately would spin.
+      await backOff();
+      continue;
+    }
+    // We created the mutex. Populate it with our token; if writing fails, clean up
+    // our own file so we never leak an empty mutex that a peer would have to reap.
+    try {
+      await handle.writeFile(formatRestartLockContent(token));
+      await handle.close();
+    } catch (err) {
+      await handle.close().catch(() => {});
+      await unlink(mutexPath).catch(() => {});
+      throw err;
+    }
+    return async (): Promise<void> => {
+      // Token-gated: never remove a mutex that is no longer ours.
+      if ((await readRestartLockToken(mutexPath)) !== token) return;
+      await unlink(mutexPath).catch(() => {});
+    };
+  }
+}
+
+/**
+ * Acquire the fleet restart lock, serializing concurrent `mosaic fleet restart`
+ * invocations across processes. Each restart tears the tmux holder (and the
+ * agent sessions inside it) down and back up; without this guard a re-entrant
+ * restart relaunches agents against a half-torn-down holder, which fails and
+ * tight-loops. A re-entrant caller waits for the in-flight restart to release
+ * the lock (clean shutdown settled) before proceeding, breaks a stale lock left
+ * by a crashed owner, and after RESTART_LOCK_MAX_WAIT_MS breaks the lock to
+ * avoid a permanent deadlock.
+ *
+ * Correctness rests on a single invariant: EVERY transition of the lock — taking
+ * a free lock, taking over a stale/timed-out one, and releasing — happens under
+ * the registry mutex. Because the check ("is the lock free / stale / fresh?") and
+ * the mutation that follows it both run while the mutex is held, they are atomic:
+ * no other acquirer, releaser, or breaker can slip in between. That is what makes
+ * takeover a true compare-and-swap rather than a content-blind clobber — a normal
+ * `open('wx')` acquirer cannot create a fresh lock in a gap, and the original
+ * owner's `release()` (also mutex-gated and token-checked) cannot drop a lock a
+ * breaker already took over. So no interleaving lets two restarts both own the
+ * lock and run concurrently.
+ *
+ * @param mosaicHome - Mosaic home whose `fleet/run/restart.lock` is acquired;
+ *   the lock's parent directory is created if missing.
+ * @param sleepFn - Awaited with RESTART_LOCK_POLL_INTERVAL_MS between attempts
+ *   while a fresh lock is held by another restart.
+ * @returns A guard whose `release()` removes the lock file only if it still
+ *   carries the token generated for this call.
+ */
+export async function acquireRestartLock(
+  mosaicHome: string,
+  sleepFn: SleepFn,
+): Promise<RestartGuard> {
+  const token = randomUUID();
+  const lockPath = restartLockPath(mosaicHome);
+  const mutexPath = restartMutexPath(lockPath);
+  await mkdir(dirname(lockPath), { recursive: true });
+  const release = async (): Promise<void> => {
+    // Mutex-gated and token-gated: only remove the lock if it is still ours. If
+    // another caller took it over (after a stale/timeout break) the token no
+    // longer matches and we leave their lock intact.
+    const releaseMutex = await acquireRestartMutex(mutexPath, token);
+    try {
+      if ((await readRestartLockToken(lockPath)) === token) {
+        await unlink(lockPath).catch(() => {});
+      }
+    } finally {
+      await releaseMutex();
+    }
+  };
+  const deadline = Date.now() + RESTART_LOCK_MAX_WAIT_MS;
+  for (;;) {
+    let owned = false;
+    const releaseMutex = await acquireRestartMutex(mutexPath, token);
+    try {
+      // Read and (if appropriate) mutate the lock atomically under the mutex.
+      let current: string | null = null;
+      let absent = false;
+      try {
+        current = await readFile(lockPath, 'utf8');
+      } catch (readErr) {
+        if ((readErr as NodeJS.ErrnoException).code === 'ENOENT') {
+          absent = true;
+        } else {
+          current = null; // Unreadable/corrupt: treat as stale.
+        }
+      }
+      const now = Date.now();
+      if (absent) {
+        // Lock is free — take it.
+        await writeFile(lockPath, formatRestartLockContent(token));
+        owned = true;
+      } else {
+        const stale = current === null || isRestartLockContentStale(current, now);
+        const timedOut = now >= deadline;
+        if (stale || timedOut) {
+          process.stderr.write(
+            stale
+              ? 'Breaking stale fleet restart lock.\n'
+              : `Timed out after ${RESTART_LOCK_MAX_WAIT_MS}ms waiting for the in-flight fleet ` +
+                  'restart; breaking the lock.\n',
+          );
+          // Takeover is just an overwrite — safe because we hold the mutex, so no
+          // acquirer or releaser can touch the lock between our read and this write.
+          await writeFile(lockPath, formatRestartLockContent(token));
+          owned = true;
+        }
+        // else: a fresh restart owns it — wait below and re-evaluate.
+      }
+    } finally {
+      await releaseMutex();
+    }
+    if (owned) {
+      return { release };
+    }
+    await sleepFn(RESTART_LOCK_POLL_INTERVAL_MS);
+  }
+}
+
 /**
 * Returns the systemctl --user enable command for a given unit.
 * Used by the install auto-enable step to persist units across reboots.
@@ -1172,6 +1472,7 @@ export function isSendAccepted(capturedOutput: string): SendVerifyResult {

 export function registerFleetCommand(program: Command, deps: FleetCommandDeps = {}): Command {
  const runner = deps.runner ?? runCommand;
+  const sleepFn = deps.sleepFn ?? defaultSleep;
  const paths = resolveFleetPaths(deps.mosaicHome);
  const frameworkRoot = deps.frameworkRoot ?? resolveFrameworkRoot();

@@ -1285,9 +1586,22 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
      .command(`${action} [agent]`)
      .description(`${action} the fleet holder or one agent`)
      .action(async (agent?: string) => {
+        const commandOpts = cmd.opts<{ mosaicHome: string; roster?: string }>();
+        const activePaths = resolveFleetPaths(commandOpts.mosaicHome);
        const roster = await loadRosterForCommand(cmd);
        if (agent) {
          getRosterAgent(roster, agent);
+          // Single-agent restart is guarded too: it can race a full restart that
+          // is tearing the shared holder down.
+          if (action === 'restart') {
+            const guard = await acquireRestartLock(activePaths.mosaicHome, sleepFn);
+            try {
+              await runChecked(runner, buildFleetServiceCommand(action, agent));
+            } finally {
+              await guard.release();
+            }
+            return;
+          }
          await runChecked(runner, buildFleetServiceCommand(action, agent));
          return;
        }
@@ -1298,6 +1612,21 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
          );
          return;
        }
+        if (action === 'restart') {
+          // Serialize the holder+agents teardown/relaunch behind the restart lock
+          // so a re-entrant restart waits for clean shutdown before relaunching,
+          // instead of racing a half-torn-down holder into a tight loop.
+          const guard = await acquireRestartLock(activePaths.mosaicHome, sleepFn);
+          try {
+            await runChecked(runner, buildFleetServiceCommand(action));
+            for (const rosterAgent of roster.agents) {
+              await runChecked(runner, buildFleetServiceCommand(action, rosterAgent.name));
+            }
+          } finally {
+            await guard.release();
+          }
+          return;
+        }
        await runChecked(runner, buildFleetServiceCommand(action));
        for (const rosterAgent of roster.agents) {
          await runChecked(runner, buildFleetServiceCommand(action, rosterAgent.name));
--- a/tools/install.sh
+++ b/tools/install.sh
@@ -16,6 +16,10 @@
 #   --framework       Install/upgrade framework only (skip npm CLI)
 #   --cli             Install/upgrade npm CLI only (skip framework)
 #   --ref <branch>    Git ref for framework archive (default: main)
+#   --dev             Build CLI + gateway FROM SOURCE at --ref instead of the
+#                     registry @latest. Zero registry writes — packs local
+#                     tarballs and installs them globally. Use to test a branch
+#                     end-to-end before cutting a release.
 #   --yes             Accept all defaults; headless/non-interactive install
 #   --no-auto-launch  Skip automatic mosaic wizard + gateway install on first install
 #   --uninstall       Reverse the install: remove framework dir, CLI package, and npmrc line
@@ -27,6 +31,7 @@
 #   MOSAIC_PREFIX       — npm global prefix          (default: ~/.npm-global)
 #   MOSAIC_NO_COLOR     — disable colour             (set to 1)
 #   MOSAIC_REF          — git ref for framework      (default: main)
+#   MOSAIC_DEV          — equivalent to --dev         (set to 1)
 #   MOSAIC_ASSUME_YES   — equivalent to --yes        (set to 1)
 # ──────────────────────────────────────────────────────────────────────────────
 #
@@ -43,6 +48,7 @@ FLAG_CLI=true
 FLAG_NO_AUTO_LAUNCH=false
 FLAG_YES=false
 FLAG_UNINSTALL=false
+FLAG_DEV=false
 GIT_REF="${MOSAIC_REF:-main}"

 # MOSAIC_ASSUME_YES env var acts the same as --yes
@@ -50,12 +56,18 @@ if [[ "${MOSAIC_ASSUME_YES:-0}" == "1" ]]; then
  FLAG_YES=true
 fi

+# MOSAIC_DEV env var acts the same as --dev
+if [[ "${MOSAIC_DEV:-0}" == "1" ]]; then
+  FLAG_DEV=true
+fi
+
 while [[ $# -gt 0 ]]; do
  case "$1" in
    --check)          FLAG_CHECK=true; shift ;;
    --framework)      FLAG_CLI=false; shift ;;
    --cli)            FLAG_FRAMEWORK=false; shift ;;
    --ref)            GIT_REF="${2:-main}"; shift 2 ;;
+    --dev)            FLAG_DEV=true; shift ;;
    --yes|-y)         FLAG_YES=true; shift ;;
    --no-auto-launch) FLAG_NO_AUTO_LAUNCH=true; shift ;;
    --uninstall)      FLAG_UNINSTALL=true; shift ;;
@@ -72,6 +84,17 @@ CLI_PKG="${SCOPE}/mosaic"
 REPO_BASE="https://git.mosaicstack.dev/mosaicstack/stack"
 ARCHIVE_URL="${REPO_BASE}/archive/${GIT_REF}.tar.gz"

+# In dev (build-from-source) mode the gateway is installed globally from a
+# locally-built tarball. Tell the wizard / gateway-config stage NOT to overwrite
+# it with the registry @latest build (honored by gatewayConfigStage).
+if [[ "$FLAG_DEV" == "true" ]]; then
+  export MOSAIC_GATEWAY_SKIP_NPM_INSTALL=1
+fi
+
+# Shared monorepo checkout (populated on demand by ensure_monorepo).
+WORK_DIR=""
+EXTRACTED_DIR=""
+
 # ─── uninstall path ───────────────────────────────────────────────────────────
 # Shell-level uninstall for when the CLI is broken or not available.
 # Handles: framework directory, npm CLI package, npmrc scope line.
@@ -239,6 +262,99 @@ framework_version() {
  fi
 }

+# Download + extract the monorepo archive at $GIT_REF exactly once per run.
+# Sets the script-level EXTRACTED_DIR to the repo root. Reused by both the
+# framework install (Part 1) and the dev build-from-source path (Part 2).
+# Also sets WORK_DIR to a fresh mktemp directory holding the extraction and
+# installs an EXIT trap that removes it. Returns immediately when EXTRACTED_DIR
+# already points at a directory. Exits 1 when neither curl nor wget is
+# available, or when no top-level directory is found after extraction.
+ensure_monorepo() {
+  if [[ -n "$EXTRACTED_DIR" ]] && [[ -d "$EXTRACTED_DIR" ]]; then
+    return 0
+  fi
+
+  require_cmd tar
+
+  WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/mosaic-install-XXXXXX")"
+  # shellcheck disable=SC2317
+  cleanup_work() { [[ -n "$WORK_DIR" ]] && rm -rf "$WORK_DIR"; }
+  trap cleanup_work EXIT
+
+  info "Downloading source from ${GIT_REF}…"
+  if command -v curl &>/dev/null; then
+    curl -fsSL "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
+  elif command -v wget &>/dev/null; then
+    wget -qO- "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
+  else
+    fail "curl or wget required to download source."
+    exit 1
+  fi
+
+  # Gitea archives extract to <repo-name>/ inside the work dir
+  EXTRACTED_DIR="$(find "$WORK_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)"
+  if [[ -z "$EXTRACTED_DIR" ]] || [[ ! -d "$EXTRACTED_DIR" ]]; then
+    fail "Could not locate extracted source in archive."
+    ls -la "$WORK_DIR" >&2
+    exit 1
+  fi
+}
+
+# Build @mosaicstack/mosaic + @mosaicstack/gateway from source and install both
+# globally from locally-packed tarballs. ZERO registry writes. Workspace deps
+# (brain/config/db/…) are pulled from the registry at the versions pinned in
+# each package.json — `pnpm pack` rewrites `workspace:*` to those versions.
+# Reads the script-level EXTRACTED_DIR (source root) and WORK_DIR (tarballs are
+# packed into $WORK_DIR/dist-tarballs), so ensure_monorepo must have run first.
+# Installs into the npm prefix $PREFIX. Exits 1 when pnpm is still unavailable
+# after corepack activation or when either tarball is missing after packing.
+install_cli_from_source() {
+  local src="$EXTRACTED_DIR"
+  local out_dir="$WORK_DIR/dist-tarballs"
+  mkdir -p "$out_dir"
+
+  # pnpm via corepack (ships with Node >= 16.9; required by Node >= 20 preflight).
+  # Pin to the repo's packageManager version so the build matches CI. Surface
+  # corepack failures so the fresh-machine case gives an actionable error
+  # instead of a bare "command not found".
+  # NOTE(review): the pnpm version below is hard-coded — confirm it tracks the
+  # packageManager field. The `| sed … || warn` fallbacks only fire on a corepack
+  # failure if the script enables `pipefail` (not visible here) — confirm.
+  if ! command -v pnpm &>/dev/null; then
+    info "Activating pnpm via corepack…"
+    corepack enable 2>&1 | sed 's/^/  /' || warn "corepack enable failed — pnpm may need manual install."
+    corepack prepare pnpm@10.6.2 --activate 2>&1 | sed 's/^/  /' \
+      || warn "corepack prepare failed — pnpm may need manual install."
+  fi
+  if ! command -v pnpm &>/dev/null; then
+    fail "pnpm not available after corepack activation."
+    echo "  Install pnpm manually (https://pnpm.io/installation) and re-run with --dev."
+    exit 1
+  fi
+
+  info "Installing workspace dependencies (pnpm install)…"
+  ( cd "$src" && pnpm install ) 2>&1 | sed 's/^/  /'
+
+  info "Building CLI + gateway from source…"
+  ( cd "$src" && pnpm --filter "@mosaicstack/mosaic..." --filter "@mosaicstack/gateway..." run build ) 2>&1 | sed 's/^/  /'
+
+  info "Packing local tarballs…"
+  ( cd "$src/packages/mosaic" && pnpm pack --pack-destination "$out_dir" ) 2>&1 | sed 's/^/  /'
+  ( cd "$src/apps/gateway"    && pnpm pack --pack-destination "$out_dir" ) 2>&1 | sed 's/^/  /'
+
+  # Pick the most recently modified tarball of each package from the pack dir.
+  local cli_tgz gw_tgz
+  cli_tgz="$(ls -1t "$out_dir"/mosaicstack-mosaic-*.tgz 2>/dev/null | head -1)"
+  gw_tgz="$(ls -1t "$out_dir"/mosaicstack-gateway-*.tgz 2>/dev/null | head -1)"
+
+  if [[ ! -f "$cli_tgz" ]]; then
+    fail "CLI tarball was not produced by pnpm pack."
+    exit 1
+  fi
+  if [[ ! -f "$gw_tgz" ]]; then
+    fail "Gateway tarball was not produced by pnpm pack."
+    exit 1
+  fi
+
+  # Gateway first so it is present globally before the CLI's wizard runs (which
+  # skips its own gateway install via MOSAIC_GATEWAY_SKIP_NPM_INSTALL=1).
+  info "Installing gateway from source tarball (global)…"
+  npm install -g "$gw_tgz" --prefix="$PREFIX" 2>&1 | sed 's/^/  /'
+
+  info "Installing CLI from source tarball (global)…"
+  npm install -g "$cli_tgz" --prefix="$PREFIX" 2>&1 | sed 's/^/  /'
+
+  ok "Installed from source: CLI $(installed_cli_version)"
+}
+
 # ─── preflight ────────────────────────────────────────────────────────────────

 require_cmd node
@@ -282,25 +398,8 @@ if [[ "$FLAG_FRAMEWORK" == "true" ]]; then
      warn "Framework not installed."
    fi
  else
-    # Download repo archive and extract framework
-    require_cmd tar
-
-    WORK_DIR="$(mktemp -d "${TMPDIR:-/tmp}/mosaic-install-XXXXXX")"
-    cleanup_work() { rm -rf "$WORK_DIR"; }
-    trap cleanup_work EXIT
-
-    info "Downloading framework from ${GIT_REF}…"
-    if command -v curl &>/dev/null; then
-      curl -fsSL "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
-    elif command -v wget &>/dev/null; then
-      wget -qO- "$ARCHIVE_URL" | tar xz -C "$WORK_DIR"
-    else
-      fail "curl or wget required to download framework."
-      exit 1
-    fi
-
-    # Gitea archives extract to <repo-name>/ inside the work dir
-    EXTRACTED_DIR="$(find "$WORK_DIR" -maxdepth 1 -mindepth 1 -type d | head -1)"
+    # Download repo archive and extract framework (shared with the dev build)
+    ensure_monorepo
    FRAMEWORK_SRC="$EXTRACTED_DIR/packages/mosaic/framework"

    if [[ ! -d "$FRAMEWORK_SRC" ]]; then
@@ -356,7 +455,11 @@ if [[ "$FLAG_CLI" == "true" ]]; then
  fi

  CURRENT="$(installed_cli_version)"
+  if [[ "$FLAG_DEV" == "true" ]]; then
+    LATEST=""
+  else
    LATEST="$(latest_cli_version)"
+  fi

  if [[ -n "$CURRENT" ]]; then
    dim "  Installed: ${CLI_PKG}@${CURRENT}"
@@ -364,7 +467,9 @@ if [[ "$FLAG_CLI" == "true" ]]; then
    dim "  Installed: (none)"
  fi

-  if [[ -n "$LATEST" ]]; then
+  if [[ "$FLAG_DEV" == "true" ]]; then
+    dim "  Source:    ${REPO_BASE} (ref: ${GIT_REF}, build-from-source)"
+  elif [[ -n "$LATEST" ]]; then
    dim "  Latest:    ${CLI_PKG}@${LATEST}"
  else
    dim "  Latest:    (registry unreachable)"
@@ -372,7 +477,9 @@ if [[ "$FLAG_CLI" == "true" ]]; then
  echo ""

  if [[ "$FLAG_CHECK" == "true" ]]; then
-    if [[ -z "$LATEST" ]]; then
+    if [[ "$FLAG_DEV" == "true" ]]; then
+      info "Dev mode: installed version is ${CURRENT:-(none)} (no registry comparison)."
+    elif [[ -z "$LATEST" ]]; then
      warn "Could not reach registry."
    elif [[ -z "$CURRENT" ]]; then
      warn "Not installed."
@@ -383,6 +490,16 @@ if [[ "$FLAG_CLI" == "true" ]]; then
    else
      ok "Up to date (or ahead of registry)."
    fi
+  elif [[ "$FLAG_DEV" == "true" ]]; then
+    info "Dev mode — building CLI + gateway from source at ref ${GIT_REF}…"
+    ensure_monorepo
+    install_cli_from_source
+
+    # PATH check for npm prefix
+    if [[ ":$PATH:" != *":$PREFIX/bin:"* ]]; then
+      warn "$PREFIX/bin is not on your PATH"
+      dim "  Add to your shell rc:  export PATH=\"$PREFIX/bin:\$PATH\""
+    fi
  else
    if [[ -z "$LATEST" ]]; then
      warn "Could not reach registry at $REGISTRY — skipping npm CLI."
Author	SHA1	Message	Date
jason.woltje	86e106fcc9	feat(#462 ): add federation list verb (#682 ) All checks were successful ci/woodpecker/push/publish Pipeline was successful Details ci/woodpecker/push/ci Pipeline was successful Details	2026-06-25 02:15:17 +00:00
jason.woltje	67135d3822	fix(fleet): guard `mosaic fleet restart` against tight-loop re-entry race (#680 ) All checks were successful ci/woodpecker/push/publish Pipeline was successful Details ci/woodpecker/push/ci Pipeline was successful Details	2026-06-25 01:44:48 +00:00
jason.woltje	adb153428b	feat(installer): --dev flag builds CLI + gateway from source (#681 ) All checks were successful ci/woodpecker/push/ci Pipeline was successful Details ci/woodpecker/push/publish Pipeline was successful Details	2026-06-24 23:54:52 +00:00