fix(#196): fix race condition in job status updates

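Implemented optimistic locking (a version field checked on every write) in
RunnerJobsService and pessimistic row locking (SELECT FOR UPDATE inside
transactions) in CoordinatorIntegrationService to prevent data corruption
from concurrent job status updates.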

Changes:
- Added version field to RunnerJob schema for optimistic locking
- Created migration 20260202_add_runner_job_version_for_concurrency
- Implemented ConcurrentUpdateException for conflict detection
- Updated RunnerJobsService methods with optimistic locking:
  * updateStatus() - with version checking and retry logic
  * updateProgress() - with version checking and retry logic
  * cancel() - with version checking and retry logic
- Updated CoordinatorIntegrationService with SELECT FOR UPDATE:
  * updateJobStatus() - transaction with row locking
  * completeJob() - transaction with row locking
  * failJob() - transaction with row locking
  * updateJobProgress() - optimistic locking
- Added retry mechanism for optimistic-lock conflicts (up to 3 attempts in
  total, with exponential backoff of 100 ms and then 200 ms between attempts)
- Added comprehensive concurrency tests (10 tests, all passing)
- Updated existing test mocks to support updateMany

Test Results:
- All 10 concurrency tests passing ✓
- Tests cover concurrent status updates, progress updates, completions,
  cancellations, retry logic, and exponential backoff

This fix prevents race conditions that could cause:
- Lost job results (double completion)
- Lost progress updates
- Invalid status transitions
- Data corruption under concurrent access

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Jason Woltje
2026-02-02 12:51:17 -06:00
parent a3b48dd631
commit ef25167c24
251 changed files with 7045 additions and 261 deletions

View File

@@ -4,6 +4,7 @@ import { Response } from "express";
import { PrismaService } from "../prisma/prisma.service";
import { BullMqService } from "../bullmq/bullmq.service";
import { QUEUE_NAMES } from "../bullmq/queues";
import { ConcurrentUpdateException } from "../common/exceptions/concurrent-update.exception";
import type { CreateJobDto, QueryJobsDto } from "./dto";
/**
@@ -144,37 +145,57 @@ export class RunnerJobsService {
}
/**
* Cancel a running or queued job
* Cancel a running or queued job with optimistic locking
*/
async cancel(id: string, workspaceId: string) {
// Verify job exists
const existingJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
return this.retryOnConflict(async () => {
// Verify job exists
const existingJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
if (!existingJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
// Check if job can be cancelled
if (
existingJob.status === RunnerJobStatus.COMPLETED ||
existingJob.status === RunnerJobStatus.CANCELLED ||
existingJob.status === RunnerJobStatus.FAILED
) {
throw new BadRequestException(`Cannot cancel job with status ${existingJob.status}`);
}
// Update job status to cancelled with version check
const result = await this.prisma.runnerJob.updateMany({
where: {
id,
workspaceId,
version: existingJob.version,
},
data: {
status: RunnerJobStatus.CANCELLED,
completedAt: new Date(),
version: { increment: 1 },
},
});
if (result.count === 0) {
throw new ConcurrentUpdateException("RunnerJob", id, existingJob.version);
}
// Fetch and return updated job
const job = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
if (!job) {
throw new NotFoundException(`RunnerJob with ID ${id} not found after cancel`);
}
return job;
});
if (!existingJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
// Check if job can be cancelled
if (
existingJob.status === RunnerJobStatus.COMPLETED ||
existingJob.status === RunnerJobStatus.CANCELLED ||
existingJob.status === RunnerJobStatus.FAILED
) {
throw new BadRequestException(`Cannot cancel job with status ${existingJob.status}`);
}
// Update job status to cancelled
const job = await this.prisma.runnerJob.update({
where: { id, workspaceId },
data: {
status: RunnerJobStatus.CANCELLED,
completedAt: new Date(),
},
});
return job;
}
/**
@@ -413,74 +434,179 @@ export class RunnerJobsService {
}
/**
* Update job status
* Retry wrapper for optimistic locking conflicts
* Retries the operation up to maxRetries times with exponential backoff
*/
private async retryOnConflict<T>(operation: () => Promise<T>, maxRetries = 3): Promise<T> {
  // Runs `operation` until it succeeds, fails with a non-conflict error, or
  // the attempt budget is spent. `maxRetries` is the TOTAL number of attempts
  // (the first try included), so the default of 3 allows at most two retries.
  // Values below 1 are treated as 1 so the operation always runs at least
  // once instead of falling through to a misleading internal error.
  const baseDelayMs = 100;
  const totalAttempts = Math.max(1, maxRetries);

  for (let attempt = 0; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      const attemptsLeft = attempt < totalAttempts - 1;
      // Only optimistic-locking conflicts are retried; every other error, and
      // the conflict raised by the final attempt, is rethrown unchanged.
      if (!(error instanceof ConcurrentUpdateException) || !attemptsLeft) {
        throw error;
      }
    }

    // Exponential backoff between attempts: 100ms, 200ms, 400ms, ...
    // With the default of 3 attempts only the 100ms and 200ms waits occur.
    const delayMs = 2 ** attempt * baseDelayMs;
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }
}
/**
* Update job status with optimistic locking
*/
async updateStatus(
id: string,
workspaceId: string,
status: RunnerJobStatus,
data?: { result?: unknown; error?: string }
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.update>>> {
// Verify job exists
const existingJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.findUnique>>> {
return this.retryOnConflict(async () => {
// Read current job state
const existingJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
if (!existingJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
if (!existingJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
const updateData: Prisma.RunnerJobUpdateInput = {
status,
};
// Validate status transition (prevent invalid transitions even with concurrency)
if (!this.isValidStatusTransition(existingJob.status, status)) {
throw new BadRequestException(
`Invalid status transition from ${existingJob.status} to ${status}`
);
}
// Set timestamps based on status
if (status === RunnerJobStatus.RUNNING && !existingJob.startedAt) {
updateData.startedAt = new Date();
}
const updateData: Prisma.RunnerJobUpdateInput = {
status,
version: { increment: 1 }, // Increment version for optimistic locking
};
if (
status === RunnerJobStatus.COMPLETED ||
status === RunnerJobStatus.FAILED ||
status === RunnerJobStatus.CANCELLED
) {
updateData.completedAt = new Date();
}
// Set timestamps based on status
if (status === RunnerJobStatus.RUNNING && !existingJob.startedAt) {
updateData.startedAt = new Date();
}
// Add optional data
if (data?.result !== undefined) {
updateData.result = data.result as Prisma.InputJsonValue;
}
if (data?.error !== undefined) {
updateData.error = data.error;
}
if (
status === RunnerJobStatus.COMPLETED ||
status === RunnerJobStatus.FAILED ||
status === RunnerJobStatus.CANCELLED
) {
updateData.completedAt = new Date();
}
return this.prisma.runnerJob.update({
where: { id, workspaceId },
data: updateData,
// Add optional data
if (data?.result !== undefined) {
updateData.result = data.result as Prisma.InputJsonValue;
}
if (data?.error !== undefined) {
updateData.error = data.error;
}
// Use updateMany with version check for optimistic locking
const result = await this.prisma.runnerJob.updateMany({
where: {
id,
workspaceId,
version: existingJob.version, // Only update if version matches
},
data: updateData,
});
// If count is 0, version mismatch (concurrent update detected)
if (result.count === 0) {
throw new ConcurrentUpdateException("RunnerJob", id, existingJob.version);
}
// Fetch and return updated job
const updatedJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
if (!updatedJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found after update`);
}
return updatedJob;
});
}
/**
* Update job progress percentage
* Validate status transitions
*/
private isValidStatusTransition(
  currentStatus: RunnerJobStatus,
  newStatus: RunnerJobStatus
): boolean {
  // Returns true when a job currently in `currentStatus` is allowed to move
  // to `newStatus`. Each status maps to the list of statuses it may move to;
  // COMPLETED, FAILED and CANCELLED map to an empty list, so nothing can
  // leave them. A status is never listed as its own target.
  const allowedTargetsByStatus: Record<RunnerJobStatus, RunnerJobStatus[]> = {
    [RunnerJobStatus.PENDING]: [
      RunnerJobStatus.QUEUED,
      RunnerJobStatus.RUNNING,
      RunnerJobStatus.CANCELLED,
    ],
    [RunnerJobStatus.QUEUED]: [RunnerJobStatus.RUNNING, RunnerJobStatus.CANCELLED],
    [RunnerJobStatus.RUNNING]: [
      RunnerJobStatus.COMPLETED,
      RunnerJobStatus.FAILED,
      RunnerJobStatus.CANCELLED,
    ],
    [RunnerJobStatus.COMPLETED]: [],
    [RunnerJobStatus.FAILED]: [],
    [RunnerJobStatus.CANCELLED]: [],
  };

  const allowedTargets = allowedTargetsByStatus[currentStatus];
  return allowedTargets.some((candidate) => candidate === newStatus);
}
/**
* Update job progress percentage with optimistic locking
*/
async updateProgress(
id: string,
workspaceId: string,
progressPercent: number
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.update>>> {
// Verify job exists
const existingJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.findUnique>>> {
return this.retryOnConflict(async () => {
// Read current job state
const existingJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
if (!existingJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
if (!existingJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
return this.prisma.runnerJob.update({
where: { id, workspaceId },
data: { progressPercent },
// Use updateMany with version check for optimistic locking
const result = await this.prisma.runnerJob.updateMany({
where: {
id,
workspaceId,
version: existingJob.version,
},
data: {
progressPercent,
version: { increment: 1 },
},
});
if (result.count === 0) {
throw new ConcurrentUpdateException("RunnerJob", id, existingJob.version);
}
// Fetch and return updated job
const updatedJob = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
});
if (!updatedJob) {
throw new NotFoundException(`RunnerJob with ID ${id} not found after update`);
}
return updatedJob;
});
}
}