fix(CQ-ORCH-5): Fix TOCTOU race in agent state transitions
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

Add per-agent mutex using promise chaining to serialize state transitions
for the same agent. This prevents the Time-of-Check-Time-of-Use race
condition where two concurrent requests could both read the current state,
both validate it as valid for transition, and both write, causing one to
overwrite the other's transition.

The mutex uses a Map<string, Promise<void>> with promise chaining so that:
- Concurrent transitions to the same agent are queued and executed sequentially
- Different agents can still transition concurrently without contention
- The lock is always released even if the transition throws an error

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Woltje
2026-02-06 14:02:40 -06:00
parent 6dd2ce1014
commit 2b356f6ca2
2 changed files with 356 additions and 74 deletions

View File

@@ -706,4 +706,233 @@ describe("AgentLifecycleService", () => {
expect(mockSpawnerService.scheduleSessionCleanup).not.toHaveBeenCalled();
});
});
describe("TOCTOU race prevention (CQ-ORCH-5)", () => {
it("should serialize concurrent transitions to the same agent", async () => {
const executionOrder: string[] = [];
// Simulate state that changes after first transition completes
let currentStatus: "spawning" | "running" | "completed" = "spawning";
mockValkeyService.getAgentState.mockImplementation(async () => {
return {
agentId: mockAgentId,
status: currentStatus,
taskId: mockTaskId,
} as AgentState;
});
mockValkeyService.updateAgentStatus.mockImplementation(
async (_agentId: string, status: string) => {
// Simulate delay to allow interleaving if lock is broken
await new Promise<void>((resolve) => {
setTimeout(resolve, 10);
});
currentStatus = status as "spawning" | "running" | "completed";
executionOrder.push(`updated-to-${status}`);
return {
agentId: mockAgentId,
status,
taskId: mockTaskId,
startedAt: "2026-02-02T10:00:00Z",
...(status === "completed" && { completedAt: "2026-02-02T11:00:00Z" }),
} as AgentState;
}
);
// Launch both transitions concurrently
const [result1, result2] = await Promise.allSettled([
service.transitionToRunning(mockAgentId),
service.transitionToCompleted(mockAgentId),
]);
// First should succeed (spawning -> running)
expect(result1.status).toBe("fulfilled");
// Second should also succeed (running -> completed) because the lock
// serializes them: first one completes, updates state to running,
// then second reads the updated state and transitions to completed
expect(result2.status).toBe("fulfilled");
// Verify they executed in order, not interleaved
expect(executionOrder).toEqual(["updated-to-running", "updated-to-completed"]);
});
it("should reject second concurrent transition if first makes it invalid", async () => {
let currentStatus: "running" | "completed" | "killed" = "running";
mockValkeyService.getAgentState.mockImplementation(async () => {
return {
agentId: mockAgentId,
status: currentStatus,
taskId: mockTaskId,
startedAt: "2026-02-02T10:00:00Z",
} as AgentState;
});
mockValkeyService.updateAgentStatus.mockImplementation(
async (_agentId: string, status: string) => {
await new Promise<void>((resolve) => {
setTimeout(resolve, 10);
});
currentStatus = status as "running" | "completed" | "killed";
return {
agentId: mockAgentId,
status,
taskId: mockTaskId,
startedAt: "2026-02-02T10:00:00Z",
completedAt: "2026-02-02T11:00:00Z",
} as AgentState;
}
);
// Both try to transition from running to a terminal state concurrently
const [result1, result2] = await Promise.allSettled([
service.transitionToCompleted(mockAgentId),
service.transitionToKilled(mockAgentId),
]);
// First should succeed (running -> completed)
expect(result1.status).toBe("fulfilled");
// Second should fail because after first completes,
// agent is in "completed" state which cannot transition to "killed"
expect(result2.status).toBe("rejected");
if (result2.status === "rejected") {
expect(result2.reason).toBeInstanceOf(Error);
expect((result2.reason as Error).message).toContain("Invalid state transition");
}
});
it("should allow concurrent transitions to different agents", async () => {
const agent1Id = "agent-1";
const agent2Id = "agent-2";
const executionOrder: string[] = [];
mockValkeyService.getAgentState.mockImplementation(async (agentId: string) => {
return {
agentId,
status: "spawning",
taskId: `task-for-${agentId}`,
} as AgentState;
});
mockValkeyService.updateAgentStatus.mockImplementation(
async (agentId: string, status: string) => {
executionOrder.push(`${agentId}-start`);
await new Promise<void>((resolve) => {
setTimeout(resolve, 10);
});
executionOrder.push(`${agentId}-end`);
return {
agentId,
status,
taskId: `task-for-${agentId}`,
startedAt: "2026-02-02T10:00:00Z",
} as AgentState;
}
);
// Both should run concurrently since they target different agents
const [result1, result2] = await Promise.allSettled([
service.transitionToRunning(agent1Id),
service.transitionToRunning(agent2Id),
]);
expect(result1.status).toBe("fulfilled");
expect(result2.status).toBe("fulfilled");
// Both should start before either finishes (concurrent, not serialized)
// The execution order should show interleaving
expect(executionOrder).toContain("agent-1-start");
expect(executionOrder).toContain("agent-2-start");
});
it("should release lock even when transition throws an error", async () => {
let callCount = 0;
mockValkeyService.getAgentState.mockImplementation(async () => {
callCount++;
if (callCount === 1) {
// First call: throw error
return null;
}
// Second call: return valid state
return {
agentId: mockAgentId,
status: "spawning",
taskId: mockTaskId,
} as AgentState;
});
mockValkeyService.updateAgentStatus.mockResolvedValue({
agentId: mockAgentId,
status: "running",
taskId: mockTaskId,
startedAt: "2026-02-02T10:00:00Z",
});
// First transition should fail (agent not found)
await expect(service.transitionToRunning(mockAgentId)).rejects.toThrow(
`Agent ${mockAgentId} not found`
);
// Second transition should succeed (lock was released despite error)
const result = await service.transitionToRunning(mockAgentId);
expect(result.status).toBe("running");
});
it("should handle three concurrent transitions sequentially for same agent", async () => {
const executionOrder: string[] = [];
let currentStatus: "spawning" | "running" | "completed" | "failed" = "spawning";
mockValkeyService.getAgentState.mockImplementation(async () => {
return {
agentId: mockAgentId,
status: currentStatus,
taskId: mockTaskId,
...(currentStatus !== "spawning" && { startedAt: "2026-02-02T10:00:00Z" }),
} as AgentState;
});
mockValkeyService.updateAgentStatus.mockImplementation(
async (_agentId: string, status: string) => {
executionOrder.push(`update-${status}`);
await new Promise<void>((resolve) => {
setTimeout(resolve, 5);
});
currentStatus = status as "spawning" | "running" | "completed" | "failed";
return {
agentId: mockAgentId,
status,
taskId: mockTaskId,
startedAt: "2026-02-02T10:00:00Z",
...(["completed", "failed"].includes(status) && {
completedAt: "2026-02-02T11:00:00Z",
}),
} as AgentState;
}
);
// Launch three transitions at once: spawning->running->completed, plus a failed attempt
const [r1, r2, r3] = await Promise.allSettled([
service.transitionToRunning(mockAgentId),
service.transitionToCompleted(mockAgentId),
service.transitionToFailed(mockAgentId, "late error"),
]);
// First: spawning -> running (succeeds)
expect(r1.status).toBe("fulfilled");
// Second: running -> completed (succeeds, serialized after first)
expect(r2.status).toBe("fulfilled");
// Third: completed -> failed (fails, completed is terminal)
expect(r3.status).toBe("rejected");
// Verify sequential execution
expect(executionOrder[0]).toBe("update-running");
expect(executionOrder[1]).toBe("update-completed");
// Third never gets to update because validation fails
expect(executionOrder).toHaveLength(2);
});
});
});