fix(CQ-ORCH-5): Fix TOCTOU race in agent state transitions

Add per-agent mutex using promise chaining to serialize state transitions for the same agent. This prevents the Time-of-Check-Time-of-Use race condition where two concurrent requests could both read the current state, both validate it as valid for transition, and both write, causing one to overwrite the other's transition. The mutex uses a Map<string, Promise<void>> with promise chaining so that: - Concurrent transitions to the same agent are queued and executed sequentially - Different agents can still transition concurrently without contention - The lock is always released even if the transition throws an error Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 14:02:40 -06:00
parent 6dd2ce1014
commit 2b356f6ca2
2 changed files with 356 additions and 74 deletions
--- a/apps/orchestrator/src/spawner/agent-lifecycle.service.spec.ts
+++ b/apps/orchestrator/src/spawner/agent-lifecycle.service.spec.ts
@@ -706,4 +706,233 @@ describe("AgentLifecycleService", () => {
      expect(mockSpawnerService.scheduleSessionCleanup).not.toHaveBeenCalled();
    });
  });
+
+  describe("TOCTOU race prevention (CQ-ORCH-5)", () => {
+    it("should serialize concurrent transitions to the same agent", async () => {
+      const executionOrder: string[] = [];
+
+      // Simulate state that changes after first transition completes
+      let currentStatus: "spawning" | "running" | "completed" = "spawning";
+
+      mockValkeyService.getAgentState.mockImplementation(async () => {
+        return {
+          agentId: mockAgentId,
+          status: currentStatus,
+          taskId: mockTaskId,
+        } as AgentState;
+      });
+
+      mockValkeyService.updateAgentStatus.mockImplementation(
+        async (_agentId: string, status: string) => {
+          // Simulate delay to allow interleaving if lock is broken
+          await new Promise<void>((resolve) => {
+            setTimeout(resolve, 10);
+          });
+          currentStatus = status as "spawning" | "running" | "completed";
+          executionOrder.push(`updated-to-${status}`);
+          return {
+            agentId: mockAgentId,
+            status,
+            taskId: mockTaskId,
+            startedAt: "2026-02-02T10:00:00Z",
+            ...(status === "completed" && { completedAt: "2026-02-02T11:00:00Z" }),
+          } as AgentState;
+        }
+      );
+
+      // Launch both transitions concurrently
+      const [result1, result2] = await Promise.allSettled([
+        service.transitionToRunning(mockAgentId),
+        service.transitionToCompleted(mockAgentId),
+      ]);
+
+      // First should succeed (spawning -> running)
+      expect(result1.status).toBe("fulfilled");
+
+      // Second should also succeed (running -> completed) because the lock
+      // serializes them: first one completes, updates state to running,
+      // then second reads the updated state and transitions to completed
+      expect(result2.status).toBe("fulfilled");
+
+      // Verify they executed in order, not interleaved
+      expect(executionOrder).toEqual(["updated-to-running", "updated-to-completed"]);
+    });
+
+    it("should reject second concurrent transition if first makes it invalid", async () => {
+      let currentStatus: "running" | "completed" | "killed" = "running";
+
+      mockValkeyService.getAgentState.mockImplementation(async () => {
+        return {
+          agentId: mockAgentId,
+          status: currentStatus,
+          taskId: mockTaskId,
+          startedAt: "2026-02-02T10:00:00Z",
+        } as AgentState;
+      });
+
+      mockValkeyService.updateAgentStatus.mockImplementation(
+        async (_agentId: string, status: string) => {
+          await new Promise<void>((resolve) => {
+            setTimeout(resolve, 10);
+          });
+          currentStatus = status as "running" | "completed" | "killed";
+          return {
+            agentId: mockAgentId,
+            status,
+            taskId: mockTaskId,
+            startedAt: "2026-02-02T10:00:00Z",
+            completedAt: "2026-02-02T11:00:00Z",
+          } as AgentState;
+        }
+      );
+
+      // Both try to transition from running to a terminal state concurrently
+      const [result1, result2] = await Promise.allSettled([
+        service.transitionToCompleted(mockAgentId),
+        service.transitionToKilled(mockAgentId),
+      ]);
+
+      // First should succeed (running -> completed)
+      expect(result1.status).toBe("fulfilled");
+
+      // Second should fail because after first completes,
+      // agent is in "completed" state which cannot transition to "killed"
+      expect(result2.status).toBe("rejected");
+      if (result2.status === "rejected") {
+        expect(result2.reason).toBeInstanceOf(Error);
+        expect((result2.reason as Error).message).toContain("Invalid state transition");
+      }
+    });
+
+    it("should allow concurrent transitions to different agents", async () => {
+      const agent1Id = "agent-1";
+      const agent2Id = "agent-2";
+      const executionOrder: string[] = [];
+
+      mockValkeyService.getAgentState.mockImplementation(async (agentId: string) => {
+        return {
+          agentId,
+          status: "spawning",
+          taskId: `task-for-${agentId}`,
+        } as AgentState;
+      });
+
+      mockValkeyService.updateAgentStatus.mockImplementation(
+        async (agentId: string, status: string) => {
+          executionOrder.push(`${agentId}-start`);
+          await new Promise<void>((resolve) => {
+            setTimeout(resolve, 10);
+          });
+          executionOrder.push(`${agentId}-end`);
+          return {
+            agentId,
+            status,
+            taskId: `task-for-${agentId}`,
+            startedAt: "2026-02-02T10:00:00Z",
+          } as AgentState;
+        }
+      );
+
+      // Both should run concurrently since they target different agents
+      const [result1, result2] = await Promise.allSettled([
+        service.transitionToRunning(agent1Id),
+        service.transitionToRunning(agent2Id),
+      ]);
+
+      expect(result1.status).toBe("fulfilled");
+      expect(result2.status).toBe("fulfilled");
+
+      // Both should start before either finishes (concurrent, not serialized)
+      // The execution order should show interleaving
+      expect(executionOrder).toContain("agent-1-start");
+      expect(executionOrder).toContain("agent-2-start");
+    });
+
+    it("should release lock even when transition throws an error", async () => {
+      let callCount = 0;
+
+      mockValkeyService.getAgentState.mockImplementation(async () => {
+        callCount++;
+        if (callCount === 1) {
+          // First call: throw error
+          return null;
+        }
+        // Second call: return valid state
+        return {
+          agentId: mockAgentId,
+          status: "spawning",
+          taskId: mockTaskId,
+        } as AgentState;
+      });
+
+      mockValkeyService.updateAgentStatus.mockResolvedValue({
+        agentId: mockAgentId,
+        status: "running",
+        taskId: mockTaskId,
+        startedAt: "2026-02-02T10:00:00Z",
+      });
+
+      // First transition should fail (agent not found)
+      await expect(service.transitionToRunning(mockAgentId)).rejects.toThrow(
+        `Agent ${mockAgentId} not found`
+      );
+
+      // Second transition should succeed (lock was released despite error)
+      const result = await service.transitionToRunning(mockAgentId);
+      expect(result.status).toBe("running");
+    });
+
+    it("should handle three concurrent transitions sequentially for same agent", async () => {
+      const executionOrder: string[] = [];
+      let currentStatus: "spawning" | "running" | "completed" | "failed" = "spawning";
+
+      mockValkeyService.getAgentState.mockImplementation(async () => {
+        return {
+          agentId: mockAgentId,
+          status: currentStatus,
+          taskId: mockTaskId,
+          ...(currentStatus !== "spawning" && { startedAt: "2026-02-02T10:00:00Z" }),
+        } as AgentState;
+      });
+
+      mockValkeyService.updateAgentStatus.mockImplementation(
+        async (_agentId: string, status: string) => {
+          executionOrder.push(`update-${status}`);
+          await new Promise<void>((resolve) => {
+            setTimeout(resolve, 5);
+          });
+          currentStatus = status as "spawning" | "running" | "completed" | "failed";
+          return {
+            agentId: mockAgentId,
+            status,
+            taskId: mockTaskId,
+            startedAt: "2026-02-02T10:00:00Z",
+            ...(["completed", "failed"].includes(status) && {
+              completedAt: "2026-02-02T11:00:00Z",
+            }),
+          } as AgentState;
+        }
+      );
+
+      // Launch three transitions at once: spawning->running->completed, plus a failed attempt
+      const [r1, r2, r3] = await Promise.allSettled([
+        service.transitionToRunning(mockAgentId),
+        service.transitionToCompleted(mockAgentId),
+        service.transitionToFailed(mockAgentId, "late error"),
+      ]);
+
+      // First: spawning -> running (succeeds)
+      expect(r1.status).toBe("fulfilled");
+      // Second: running -> completed (succeeds, serialized after first)
+      expect(r2.status).toBe("fulfilled");
+      // Third: completed -> failed (fails, completed is terminal)
+      expect(r3.status).toBe("rejected");
+
+      // Verify sequential execution
+      expect(executionOrder[0]).toBe("update-running");
+      expect(executionOrder[1]).toBe("update-completed");
+      // Third never gets to update because validation fails
+      expect(executionOrder).toHaveLength(2);
+    });
+  });
 });