fix(CQ-ORCH-5): Fix TOCTOU race in agent state transitions
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful

Add a per-agent mutex, implemented with promise chaining, to serialize state
transitions for the same agent. This prevents the time-of-check-to-time-of-use
(TOCTOU) race condition in which two concurrent requests both read the current
state, both pass transition validation, and both write, so that one
transition silently overwrites the other.

The mutex uses a Map<string, Promise<void>> with promise chaining so that:
- Concurrent transitions for the same agent are queued and executed sequentially
- Different agents can still transition concurrently without contention
- The lock is always released even if the transition throws an error

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jason Woltje
2026-02-06 14:02:40 -06:00
parent 6dd2ce1014
commit 2b356f6ca2
2 changed files with 356 additions and 74 deletions

View File

@@ -14,11 +14,21 @@ import { isValidAgentTransition } from "../valkey/types/state.types";
* - Persists agent state changes to Valkey
* - Emits pub/sub events on state changes
* - Tracks agent metadata (startedAt, completedAt, error)
* - Uses per-agent mutex to prevent TOCTOU race conditions (CQ-ORCH-5)
*/
@Injectable()
export class AgentLifecycleService {
private readonly logger = new Logger(AgentLifecycleService.name);
/**
 * Per-agent mutex map to serialize state transitions (CQ-ORCH-5).
 * Keyed by agent ID; the value is the promise at the tail of that agent's
 * queue, which settles once the most recently queued transition has
 * finished (whether it succeeded or threw).
 * Uses promise chaining so concurrent transitions for the same agent
 * are queued and executed sequentially, preventing TOCTOU races
 * where two concurrent requests could both read the same state,
 * both validate it as valid, and both write, causing lost updates.
 * withAgentLock removes an agent's entry once its queue has drained.
 * The map is held in memory on this service instance.
 */
private readonly agentLocks = new Map<string, Promise<void>>();
constructor(
private readonly valkeyService: ValkeyService,
@Inject(forwardRef(() => AgentSpawnerService))
@@ -27,6 +37,37 @@ export class AgentLifecycleService {
this.logger.log("AgentLifecycleService initialized");
}
/**
 * Run a critical section while holding the mutex for a single agent.
 *
 * Each caller registers its own pending promise as the new tail of the
 * agent's queue and then waits for the previous tail to settle, so critical
 * sections for one agent run strictly one after another in call order.
 * Callers for different agent IDs use different map entries and do not
 * wait on each other.
 *
 * The lock is released in a `finally` block, so a throwing critical section
 * still lets the next queued caller proceed; the error propagates to the caller.
 *
 * @param agentId Agent whose lock should be held
 * @param fn Critical section to execute while holding the lock
 * @returns Whatever the critical section resolves to
 */
private async withAgentLock<T>(agentId: string, fn: () => Promise<T>): Promise<T> {
  // Current tail of this agent's queue, or an already-resolved promise when idle.
  const predecessor = this.agentLocks.get(agentId) ?? Promise.resolve();

  // Our own queue entry; it settles only when `release` is called below.
  let release!: () => void;
  const ownTurn = new Promise<void>((done) => {
    release = done;
  });

  // Registered synchronously, before the first await, so that any caller
  // arriving later queues behind this one.
  this.agentLocks.set(agentId, ownTurn);

  try {
    await predecessor;
    const result = await fn();
    return result;
  } finally {
    release();
    // Drop the map entry only if nobody queued behind us in the meantime.
    const isTail = this.agentLocks.get(agentId) === ownTurn;
    if (isTail) {
      this.agentLocks.delete(agentId);
    }
  }
}
/**
* Transition agent from spawning to running state
* @param agentId Unique agent identifier
@@ -34,28 +75,34 @@ export class AgentLifecycleService {
* @throws Error if agent not found or invalid transition
*/
async transitionToRunning(agentId: string): Promise<AgentState> {
  // Hold the per-agent lock across the whole read -> validate -> write -> publish
  // sequence so a concurrent transition for the same agent cannot interleave
  // between the state check and the state write (CQ-ORCH-5).
  return this.withAgentLock(agentId, async () => {
    this.logger.log(`Transitioning agent ${agentId} to running`);

    // Read and validate inside the lock; validateTransition throws on an
    // invalid transition, which also releases the lock.
    const currentState = await this.getAgentState(agentId);
    this.validateTransition(currentState.status, "running");

    // Keep an existing startedAt; otherwise stamp the current time.
    const startedAt = currentState.startedAt ?? new Date().toISOString();

    // Update state in Valkey
    const updatedState = await this.valkeyService.updateAgentStatus(
      agentId,
      "running",
      undefined
    );

    // Ensure startedAt is set: persist it only if the status update did not.
    if (!updatedState.startedAt) {
      updatedState.startedAt = startedAt;
      await this.valkeyService.setAgentState(updatedState);
    }

    // Emit event
    await this.publishStateChangeEvent("agent.running", updatedState);

    this.logger.log(`Agent ${agentId} transitioned to running`);
    return updatedState;
  });
}
/**
@@ -65,35 +112,37 @@ export class AgentLifecycleService {
* @throws Error if agent not found or invalid transition
*/
async transitionToCompleted(agentId: string): Promise<AgentState> {
  // Hold the per-agent lock across the whole read -> validate -> write -> publish
  // sequence so a concurrent transition for the same agent cannot interleave
  // between the state check and the state write (CQ-ORCH-5).
  return this.withAgentLock(agentId, async () => {
    this.logger.log(`Transitioning agent ${agentId} to completed`);

    // Read and validate inside the lock; validateTransition throws on an
    // invalid transition, which also releases the lock.
    const currentState = await this.getAgentState(agentId);
    this.validateTransition(currentState.status, "completed");

    // Set completedAt timestamp
    const completedAt = new Date().toISOString();

    // Update state in Valkey
    const updatedState = await this.valkeyService.updateAgentStatus(
      agentId,
      "completed",
      undefined
    );

    // Ensure completedAt is set: persist it only if the status update did not.
    if (!updatedState.completedAt) {
      updatedState.completedAt = completedAt;
      await this.valkeyService.setAgentState(updatedState);
    }

    // Emit event
    await this.publishStateChangeEvent("agent.completed", updatedState);

    // Schedule session cleanup
    this.spawnerService.scheduleSessionCleanup(agentId);

    this.logger.log(`Agent ${agentId} transitioned to completed`);
    return updatedState;
  });
}
/**
@@ -104,31 +153,33 @@ export class AgentLifecycleService {
* @throws Error if agent not found or invalid transition
*/
async transitionToFailed(agentId: string, error: string): Promise<AgentState> {
  // Hold the per-agent lock across the whole read -> validate -> write -> publish
  // sequence so a concurrent transition for the same agent cannot interleave
  // between the state check and the state write (CQ-ORCH-5).
  return this.withAgentLock(agentId, async () => {
    this.logger.log(`Transitioning agent ${agentId} to failed: ${error}`);

    // Read and validate inside the lock; validateTransition throws on an
    // invalid transition, which also releases the lock.
    const currentState = await this.getAgentState(agentId);
    this.validateTransition(currentState.status, "failed");

    // Set completedAt timestamp
    const completedAt = new Date().toISOString();

    // Update state in Valkey, recording the failure reason
    const updatedState = await this.valkeyService.updateAgentStatus(agentId, "failed", error);

    // Ensure completedAt is set: persist it only if the status update did not.
    if (!updatedState.completedAt) {
      updatedState.completedAt = completedAt;
      await this.valkeyService.setAgentState(updatedState);
    }

    // Emit event
    await this.publishStateChangeEvent("agent.failed", updatedState, error);

    // Schedule session cleanup
    this.spawnerService.scheduleSessionCleanup(agentId);

    this.logger.error(`Agent ${agentId} transitioned to failed: ${error}`);
    return updatedState;
  });
}
/**
@@ -138,31 +189,33 @@ export class AgentLifecycleService {
* @throws Error if agent not found or invalid transition
*/
async transitionToKilled(agentId: string): Promise<AgentState> {
  // Hold the per-agent lock across the whole read -> validate -> write -> publish
  // sequence so a concurrent transition for the same agent cannot interleave
  // between the state check and the state write (CQ-ORCH-5).
  return this.withAgentLock(agentId, async () => {
    this.logger.log(`Transitioning agent ${agentId} to killed`);

    // Read and validate inside the lock; validateTransition throws on an
    // invalid transition, which also releases the lock.
    const currentState = await this.getAgentState(agentId);
    this.validateTransition(currentState.status, "killed");

    // Set completedAt timestamp
    const completedAt = new Date().toISOString();

    // Update state in Valkey
    const updatedState = await this.valkeyService.updateAgentStatus(agentId, "killed", undefined);

    // Ensure completedAt is set: persist it only if the status update did not.
    if (!updatedState.completedAt) {
      updatedState.completedAt = completedAt;
      await this.valkeyService.setAgentState(updatedState);
    }

    // Emit event
    await this.publishStateChangeEvent("agent.killed", updatedState);

    // Schedule session cleanup
    this.spawnerService.scheduleSessionCleanup(agentId);

    this.logger.warn(`Agent ${agentId} transitioned to killed`);
    return updatedState;
  });
}
/**