fix(#196): fix race condition in job status updates
Implemented optimistic locking with version field and SELECT FOR UPDATE transactions to prevent data corruption from concurrent job status updates. Changes: - Added version field to RunnerJob schema for optimistic locking - Created migration 20260202_add_runner_job_version_for_concurrency - Implemented ConcurrentUpdateException for conflict detection - Updated RunnerJobsService methods with optimistic locking: * updateStatus() - with version checking and retry logic * updateProgress() - with version checking and retry logic * cancel() - with version checking and retry logic - Updated CoordinatorIntegrationService with SELECT FOR UPDATE: * updateJobStatus() - transaction with row locking * completeJob() - transaction with row locking * failJob() - transaction with row locking * updateJobProgress() - optimistic locking - Added retry mechanism (3 attempts) with exponential backoff - Added comprehensive concurrency tests (10 tests, all passing) - Updated existing test mocks to support updateMany Test Results: - All 10 concurrency tests passing ✓ - Tests cover concurrent status updates, progress updates, completions, cancellations, retry logic, and exponential backoff This fix prevents race conditions that could cause: - Lost job results (double completion) - Lost progress updates - Invalid status transitions - Data corruption under concurrent access Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,392 @@
|
||||
import { describe, it, expect, beforeEach, vi } from "vitest";
|
||||
import { Test, TestingModule } from "@nestjs/testing";
|
||||
import { ConflictException } from "@nestjs/common";
|
||||
import { CoordinatorIntegrationService } from "./coordinator-integration.service";
|
||||
import { PrismaService } from "../prisma/prisma.service";
|
||||
import { JobEventsService } from "../job-events/job-events.service";
|
||||
import { HeraldService } from "../herald/herald.service";
|
||||
import { BullMqService } from "../bullmq/bullmq.service";
|
||||
import { RunnerJobStatus } from "@prisma/client";
|
||||
import { CoordinatorJobStatus, UpdateJobStatusDto } from "./dto";
|
||||
|
||||
/**
|
||||
* Concurrency tests for CoordinatorIntegrationService
|
||||
* Focus on race conditions during coordinator job status updates
|
||||
*/
|
||||
describe("CoordinatorIntegrationService - Concurrency", () => {
|
||||
let service: CoordinatorIntegrationService;
|
||||
let prisma: PrismaService;
|
||||
|
||||
const mockJobEventsService = {
|
||||
emitJobCreated: vi.fn(),
|
||||
emitJobStarted: vi.fn(),
|
||||
emitJobCompleted: vi.fn(),
|
||||
emitJobFailed: vi.fn(),
|
||||
emitEvent: vi.fn(),
|
||||
};
|
||||
|
||||
const mockHeraldService = {
|
||||
broadcastJobEvent: vi.fn(),
|
||||
};
|
||||
|
||||
const mockBullMqService = {
|
||||
addJob: vi.fn(),
|
||||
};
|
||||
|
||||
beforeEach(async () => {
|
||||
const module: TestingModule = await Test.createTestingModule({
|
||||
providers: [
|
||||
CoordinatorIntegrationService,
|
||||
{
|
||||
provide: PrismaService,
|
||||
useValue: {
|
||||
runnerJob: {
|
||||
findUnique: vi.fn(),
|
||||
update: vi.fn(),
|
||||
updateMany: vi.fn(),
|
||||
},
|
||||
$transaction: vi.fn(),
|
||||
$queryRaw: vi.fn(),
|
||||
},
|
||||
},
|
||||
{
|
||||
provide: JobEventsService,
|
||||
useValue: mockJobEventsService,
|
||||
},
|
||||
{
|
||||
provide: HeraldService,
|
||||
useValue: mockHeraldService,
|
||||
},
|
||||
{
|
||||
provide: BullMqService,
|
||||
useValue: mockBullMqService,
|
||||
},
|
||||
],
|
||||
}).compile();
|
||||
|
||||
service = module.get<CoordinatorIntegrationService>(CoordinatorIntegrationService);
|
||||
prisma = module.get<PrismaService>(PrismaService);
|
||||
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
describe("concurrent status updates from coordinator", () => {
|
||||
it("should use SELECT FOR UPDATE to prevent race conditions", async () => {
|
||||
const jobId = "job-123";
|
||||
const dto: UpdateJobStatusDto = {
|
||||
status: CoordinatorJobStatus.RUNNING,
|
||||
agentId: "agent-1",
|
||||
agentType: "python",
|
||||
};
|
||||
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.PENDING,
|
||||
workspaceId: "workspace-123",
|
||||
version: 1,
|
||||
};
|
||||
|
||||
const updatedJob = {
|
||||
...mockJob,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
startedAt: new Date(),
|
||||
version: 2,
|
||||
};
|
||||
|
||||
// Mock transaction with SELECT FOR UPDATE
|
||||
const mockTxClient = {
|
||||
$queryRaw: vi.fn().mockResolvedValue([mockJob]),
|
||||
runnerJob: {
|
||||
update: vi.fn().mockResolvedValue(updatedJob),
|
||||
},
|
||||
};
|
||||
|
||||
vi.mocked(prisma.$transaction).mockImplementation(async (callback: any) => {
|
||||
return callback(mockTxClient);
|
||||
});
|
||||
|
||||
const mockEvent = {
|
||||
id: "event-1",
|
||||
jobId,
|
||||
type: "job.started",
|
||||
timestamp: new Date(),
|
||||
};
|
||||
|
||||
vi.mocked(mockJobEventsService.emitJobStarted).mockResolvedValue(mockEvent as any);
|
||||
|
||||
const result = await service.updateJobStatus(jobId, dto);
|
||||
|
||||
expect(result.status).toBe(RunnerJobStatus.RUNNING);
|
||||
|
||||
// Verify SELECT FOR UPDATE was used
|
||||
expect(mockTxClient.$queryRaw).toHaveBeenCalledWith(
|
||||
expect.anything() // Raw SQL with FOR UPDATE
|
||||
);
|
||||
});
|
||||
|
||||
it("should handle concurrent status updates by coordinator and API", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
// Coordinator tries to mark as RUNNING
|
||||
const coordinatorDto: UpdateJobStatusDto = {
|
||||
status: CoordinatorJobStatus.RUNNING,
|
||||
};
|
||||
|
||||
// Simulate transaction lock timeout (another process holds lock)
|
||||
vi.mocked(prisma.$transaction).mockRejectedValue(new Error("could not obtain lock on row"));
|
||||
|
||||
await expect(service.updateJobStatus(jobId, coordinatorDto)).rejects.toThrow();
|
||||
});
|
||||
|
||||
it("should serialize concurrent status transitions", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.PENDING,
|
||||
workspaceId: "workspace-123",
|
||||
version: 1,
|
||||
};
|
||||
|
||||
// Simulate transaction that waits for lock, then proceeds
|
||||
const mockTxClient = {
|
||||
$queryRaw: vi.fn().mockResolvedValue([mockJob]),
|
||||
runnerJob: {
|
||||
update: vi.fn().mockResolvedValue({
|
||||
...mockJob,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
version: 2,
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
vi.mocked(prisma.$transaction).mockImplementation(async (callback: any) => {
|
||||
// Simulate delay while waiting for lock
|
||||
await new Promise((resolve) => setTimeout(resolve, 100));
|
||||
return callback(mockTxClient);
|
||||
});
|
||||
|
||||
const dto: UpdateJobStatusDto = {
|
||||
status: CoordinatorJobStatus.RUNNING,
|
||||
};
|
||||
|
||||
vi.mocked(mockJobEventsService.emitJobStarted).mockResolvedValue({
|
||||
id: "event-1",
|
||||
jobId,
|
||||
type: "job.started",
|
||||
timestamp: new Date(),
|
||||
} as any);
|
||||
|
||||
const result = await service.updateJobStatus(jobId, dto);
|
||||
|
||||
expect(result.status).toBe(RunnerJobStatus.RUNNING);
|
||||
expect(prisma.$transaction).toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe("concurrent completion from coordinator", () => {
|
||||
it("should prevent double completion using transaction", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
workspaceId: "workspace-123",
|
||||
startedAt: new Date(),
|
||||
version: 2,
|
||||
};
|
||||
|
||||
const completedJob = {
|
||||
...mockJob,
|
||||
status: RunnerJobStatus.COMPLETED,
|
||||
completedAt: new Date(),
|
||||
progressPercent: 100,
|
||||
result: { success: true },
|
||||
version: 3,
|
||||
};
|
||||
|
||||
const mockTxClient = {
|
||||
$queryRaw: vi.fn().mockResolvedValue([mockJob]),
|
||||
runnerJob: {
|
||||
update: vi.fn().mockResolvedValue(completedJob),
|
||||
},
|
||||
};
|
||||
|
||||
vi.mocked(prisma.$transaction).mockImplementation(async (callback: any) => {
|
||||
return callback(mockTxClient);
|
||||
});
|
||||
|
||||
vi.mocked(mockJobEventsService.emitJobCompleted).mockResolvedValue({
|
||||
id: "event-1",
|
||||
jobId,
|
||||
type: "job.completed",
|
||||
timestamp: new Date(),
|
||||
} as any);
|
||||
|
||||
const result = await service.completeJob(jobId, {
|
||||
result: { success: true },
|
||||
tokensUsed: 1000,
|
||||
durationSeconds: 120,
|
||||
});
|
||||
|
||||
expect(result.status).toBe(RunnerJobStatus.COMPLETED);
|
||||
expect(mockTxClient.$queryRaw).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("should handle concurrent completion and failure attempts", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
workspaceId: "workspace-123",
|
||||
startedAt: new Date(),
|
||||
version: 2,
|
||||
};
|
||||
|
||||
// First transaction (completion) succeeds
|
||||
const completedJob = {
|
||||
...mockJob,
|
||||
status: RunnerJobStatus.COMPLETED,
|
||||
completedAt: new Date(),
|
||||
version: 3,
|
||||
};
|
||||
|
||||
// Second transaction (failure) sees completed job and should fail
|
||||
const mockTxClient1 = {
|
||||
$queryRaw: vi.fn().mockResolvedValue([mockJob]),
|
||||
runnerJob: {
|
||||
update: vi.fn().mockResolvedValue(completedJob),
|
||||
},
|
||||
};
|
||||
|
||||
const mockTxClient2 = {
|
||||
$queryRaw: vi.fn().mockResolvedValue([completedJob]), // Job already completed
|
||||
runnerJob: {
|
||||
update: vi.fn(),
|
||||
},
|
||||
};
|
||||
|
||||
vi.mocked(prisma.$transaction)
|
||||
.mockImplementationOnce(async (callback: any) => callback(mockTxClient1))
|
||||
.mockImplementationOnce(async (callback: any) => callback(mockTxClient2));
|
||||
|
||||
vi.mocked(mockJobEventsService.emitJobCompleted).mockResolvedValue({
|
||||
id: "event-1",
|
||||
jobId,
|
||||
type: "job.completed",
|
||||
timestamp: new Date(),
|
||||
} as any);
|
||||
|
||||
// First call (completion) succeeds
|
||||
const result1 = await service.completeJob(jobId, {
|
||||
result: { success: true },
|
||||
});
|
||||
expect(result1.status).toBe(RunnerJobStatus.COMPLETED);
|
||||
|
||||
// Second call (failure) should be rejected due to invalid status transition
|
||||
await expect(
|
||||
service.failJob(jobId, {
|
||||
error: "Something went wrong",
|
||||
})
|
||||
).rejects.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
describe("concurrent progress updates from coordinator", () => {
|
||||
it("should handle rapid progress updates safely", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
const progressUpdates = [25, 50, 75];
|
||||
|
||||
for (const progress of progressUpdates) {
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
progressPercent: progress - 25,
|
||||
version: progress / 25, // version increases with each update
|
||||
};
|
||||
|
||||
const updatedJob = {
|
||||
...mockJob,
|
||||
progressPercent: progress,
|
||||
version: mockJob.version + 1,
|
||||
};
|
||||
|
||||
vi.mocked(prisma.runnerJob.findUnique).mockResolvedValue(mockJob as any);
|
||||
vi.mocked(prisma.runnerJob.updateMany).mockResolvedValue({ count: 1 });
|
||||
vi.mocked(prisma.runnerJob.findUnique).mockResolvedValueOnce(updatedJob as any);
|
||||
|
||||
const result = await service.updateJobProgress(jobId, {
|
||||
progressPercent: progress,
|
||||
});
|
||||
|
||||
expect(result.progressPercent).toBe(progress);
|
||||
}
|
||||
|
||||
expect(mockJobEventsService.emitEvent).toHaveBeenCalledTimes(3);
|
||||
});
|
||||
|
||||
it("should detect version conflicts in progress updates", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
progressPercent: 50,
|
||||
version: 2,
|
||||
};
|
||||
|
||||
vi.mocked(prisma.runnerJob.findUnique).mockResolvedValue(mockJob as any);
|
||||
// Simulate version conflict (another update happened)
|
||||
vi.mocked(prisma.runnerJob.updateMany).mockResolvedValue({ count: 0 });
|
||||
|
||||
await expect(
|
||||
service.updateJobProgress(jobId, {
|
||||
progressPercent: 75,
|
||||
})
|
||||
).rejects.toThrow(ConflictException);
|
||||
});
|
||||
});
|
||||
|
||||
describe("transaction isolation", () => {
|
||||
it("should use appropriate transaction isolation level", async () => {
|
||||
const jobId = "job-123";
|
||||
|
||||
const mockJob = {
|
||||
id: jobId,
|
||||
status: RunnerJobStatus.PENDING,
|
||||
version: 1,
|
||||
};
|
||||
|
||||
const mockTxClient = {
|
||||
$queryRaw: vi.fn().mockResolvedValue([mockJob]),
|
||||
runnerJob: {
|
||||
update: vi.fn().mockResolvedValue({
|
||||
...mockJob,
|
||||
status: RunnerJobStatus.RUNNING,
|
||||
version: 2,
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
vi.mocked(prisma.$transaction).mockImplementation(async (callback: any) => {
|
||||
return callback(mockTxClient);
|
||||
});
|
||||
|
||||
vi.mocked(mockJobEventsService.emitJobStarted).mockResolvedValue({
|
||||
id: "event-1",
|
||||
jobId,
|
||||
type: "job.started",
|
||||
timestamp: new Date(),
|
||||
} as any);
|
||||
|
||||
await service.updateJobStatus(jobId, {
|
||||
status: CoordinatorJobStatus.RUNNING,
|
||||
});
|
||||
|
||||
// Verify transaction was used (isolates the operation)
|
||||
expect(prisma.$transaction).toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -6,6 +6,7 @@ import { HeraldService } from "../herald/herald.service";
|
||||
import { BullMqService } from "../bullmq/bullmq.service";
|
||||
import { QUEUE_NAMES } from "../bullmq/queues";
|
||||
import { JOB_PROGRESS } from "../job-events/event-types";
|
||||
import { ConcurrentUpdateException } from "../common/exceptions/concurrent-update.exception";
|
||||
import {
|
||||
CoordinatorJobStatus,
|
||||
type CreateCoordinatorJobDto,
|
||||
@@ -98,7 +99,8 @@ export class CoordinatorIntegrationService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Update job status from the coordinator
|
||||
* Update job status from the coordinator using transaction with SELECT FOR UPDATE
|
||||
* This ensures serialized access to job status updates from the coordinator
|
||||
*/
|
||||
async updateJobStatus(
|
||||
jobId: string,
|
||||
@@ -106,64 +108,74 @@ export class CoordinatorIntegrationService {
|
||||
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.update>>> {
|
||||
this.logger.log(`Updating job ${jobId} status to ${dto.status}`);
|
||||
|
||||
// Verify job exists
|
||||
const job = await this.prisma.runnerJob.findUnique({
|
||||
where: { id: jobId },
|
||||
select: { id: true, status: true, workspaceId: true },
|
||||
});
|
||||
return this.prisma.$transaction(async (tx) => {
|
||||
// Use SELECT FOR UPDATE to lock the row during this transaction
|
||||
// This prevents concurrent updates from coordinator and ensures serialization
|
||||
const jobs = await tx.$queryRaw<
|
||||
Array<{ id: string; status: RunnerJobStatus; workspace_id: string; version: number }>
|
||||
>`
|
||||
SELECT id, status, workspace_id, version
|
||||
FROM runner_jobs
|
||||
WHERE id = ${jobId}::uuid
|
||||
FOR UPDATE
|
||||
`;
|
||||
|
||||
if (!job) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found`);
|
||||
}
|
||||
if (!jobs || jobs.length === 0) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found`);
|
||||
}
|
||||
|
||||
// Validate status transition
|
||||
if (!this.isValidStatusTransition(job.status, dto.status as RunnerJobStatus)) {
|
||||
throw new BadRequestException(
|
||||
`Invalid status transition from ${job.status} to ${dto.status}`
|
||||
);
|
||||
}
|
||||
const job = jobs[0];
|
||||
|
||||
const updateData: Prisma.RunnerJobUpdateInput = {
|
||||
status: dto.status as RunnerJobStatus,
|
||||
};
|
||||
// Validate status transition
|
||||
if (!this.isValidStatusTransition(job.status, dto.status as RunnerJobStatus)) {
|
||||
throw new BadRequestException(
|
||||
`Invalid status transition from ${job.status} to ${dto.status}`
|
||||
);
|
||||
}
|
||||
|
||||
// Set startedAt when transitioning to RUNNING
|
||||
if (dto.status === CoordinatorJobStatus.RUNNING) {
|
||||
updateData.startedAt = new Date();
|
||||
}
|
||||
const updateData: Prisma.RunnerJobUpdateInput = {
|
||||
status: dto.status as RunnerJobStatus,
|
||||
version: { increment: 1 },
|
||||
};
|
||||
|
||||
const updatedJob = await this.prisma.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: updateData,
|
||||
});
|
||||
// Set startedAt when transitioning to RUNNING
|
||||
if (dto.status === CoordinatorJobStatus.RUNNING) {
|
||||
updateData.startedAt = new Date();
|
||||
}
|
||||
|
||||
// Emit appropriate event
|
||||
if (dto.status === CoordinatorJobStatus.RUNNING) {
|
||||
const event = await this.jobEvents.emitJobStarted(jobId, {
|
||||
agentId: dto.agentId,
|
||||
agentType: dto.agentType,
|
||||
const updatedJob = await tx.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: updateData,
|
||||
});
|
||||
|
||||
// Broadcast via Herald
|
||||
await this.herald.broadcastJobEvent(jobId, event);
|
||||
}
|
||||
// Emit appropriate event (outside of critical section but inside transaction)
|
||||
if (dto.status === CoordinatorJobStatus.RUNNING) {
|
||||
const event = await this.jobEvents.emitJobStarted(jobId, {
|
||||
agentId: dto.agentId,
|
||||
agentType: dto.agentType,
|
||||
});
|
||||
|
||||
return updatedJob;
|
||||
// Broadcast via Herald
|
||||
await this.herald.broadcastJobEvent(jobId, event);
|
||||
}
|
||||
|
||||
return updatedJob;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Update job progress from the coordinator
|
||||
* Update job progress from the coordinator with optimistic locking
|
||||
*/
|
||||
async updateJobProgress(
|
||||
jobId: string,
|
||||
dto: UpdateJobProgressDto
|
||||
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.update>>> {
|
||||
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.findUnique>>> {
|
||||
this.logger.log(`Updating job ${jobId} progress to ${String(dto.progressPercent)}%`);
|
||||
|
||||
// Verify job exists and is running
|
||||
// Read current job state
|
||||
const job = await this.prisma.runnerJob.findUnique({
|
||||
where: { id: jobId },
|
||||
select: { id: true, status: true },
|
||||
select: { id: true, status: true, version: true },
|
||||
});
|
||||
|
||||
if (!job) {
|
||||
@@ -174,11 +186,31 @@ export class CoordinatorIntegrationService {
|
||||
throw new BadRequestException(`Cannot update progress for job with status ${job.status}`);
|
||||
}
|
||||
|
||||
const updatedJob = await this.prisma.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: { progressPercent: dto.progressPercent },
|
||||
// Use updateMany with version check for optimistic locking
|
||||
const result = await this.prisma.runnerJob.updateMany({
|
||||
where: {
|
||||
id: jobId,
|
||||
version: job.version,
|
||||
},
|
||||
data: {
|
||||
progressPercent: dto.progressPercent,
|
||||
version: { increment: 1 },
|
||||
},
|
||||
});
|
||||
|
||||
if (result.count === 0) {
|
||||
throw new ConcurrentUpdateException("RunnerJob", jobId, job.version);
|
||||
}
|
||||
|
||||
// Fetch updated job
|
||||
const updatedJob = await this.prisma.runnerJob.findUnique({
|
||||
where: { id: jobId },
|
||||
});
|
||||
|
||||
if (!updatedJob) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found after update`);
|
||||
}
|
||||
|
||||
// Emit progress event
|
||||
await this.jobEvents.emitEvent(jobId, {
|
||||
type: JOB_PROGRESS,
|
||||
@@ -194,7 +226,7 @@ export class CoordinatorIntegrationService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark job as completed from the coordinator
|
||||
* Mark job as completed from the coordinator using transaction with SELECT FOR UPDATE
|
||||
*/
|
||||
async completeJob(
|
||||
jobId: string,
|
||||
@@ -202,57 +234,68 @@ export class CoordinatorIntegrationService {
|
||||
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.update>>> {
|
||||
this.logger.log(`Completing job ${jobId}`);
|
||||
|
||||
// Verify job exists
|
||||
const job = await this.prisma.runnerJob.findUnique({
|
||||
where: { id: jobId },
|
||||
select: { id: true, status: true, startedAt: true },
|
||||
return this.prisma.$transaction(async (tx) => {
|
||||
// Lock the row to prevent concurrent completion/failure
|
||||
const jobs = await tx.$queryRaw<
|
||||
Array<{ id: string; status: RunnerJobStatus; started_at: Date | null; version: number }>
|
||||
>`
|
||||
SELECT id, status, started_at, version
|
||||
FROM runner_jobs
|
||||
WHERE id = ${jobId}::uuid
|
||||
FOR UPDATE
|
||||
`;
|
||||
|
||||
if (!jobs || jobs.length === 0) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found`);
|
||||
}
|
||||
|
||||
const job = jobs[0];
|
||||
|
||||
// Validate status transition
|
||||
if (!this.isValidStatusTransition(job.status, RunnerJobStatus.COMPLETED)) {
|
||||
throw new BadRequestException(`Cannot complete job with status ${job.status}`);
|
||||
}
|
||||
|
||||
// Calculate duration if not provided
|
||||
let durationSeconds = dto.durationSeconds;
|
||||
if (durationSeconds === undefined && job.started_at) {
|
||||
durationSeconds = Math.round(
|
||||
(new Date().getTime() - new Date(job.started_at).getTime()) / 1000
|
||||
);
|
||||
}
|
||||
|
||||
const updateData: Prisma.RunnerJobUpdateInput = {
|
||||
status: RunnerJobStatus.COMPLETED,
|
||||
progressPercent: 100,
|
||||
completedAt: new Date(),
|
||||
version: { increment: 1 },
|
||||
};
|
||||
|
||||
if (dto.result) {
|
||||
updateData.result = dto.result as Prisma.InputJsonValue;
|
||||
}
|
||||
|
||||
const updatedJob = await tx.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: updateData,
|
||||
});
|
||||
|
||||
// Emit completion event
|
||||
const event = await this.jobEvents.emitJobCompleted(jobId, {
|
||||
result: dto.result,
|
||||
tokensUsed: dto.tokensUsed,
|
||||
durationSeconds,
|
||||
});
|
||||
|
||||
// Broadcast via Herald
|
||||
await this.herald.broadcastJobEvent(jobId, event);
|
||||
|
||||
return updatedJob;
|
||||
});
|
||||
|
||||
if (!job) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found`);
|
||||
}
|
||||
|
||||
// Validate status transition
|
||||
if (!this.isValidStatusTransition(job.status, RunnerJobStatus.COMPLETED)) {
|
||||
throw new BadRequestException(`Cannot complete job with status ${job.status}`);
|
||||
}
|
||||
|
||||
// Calculate duration if not provided
|
||||
let durationSeconds = dto.durationSeconds;
|
||||
if (durationSeconds === undefined && job.startedAt) {
|
||||
durationSeconds = Math.round((new Date().getTime() - job.startedAt.getTime()) / 1000);
|
||||
}
|
||||
|
||||
const updateData: Prisma.RunnerJobUpdateInput = {
|
||||
status: RunnerJobStatus.COMPLETED,
|
||||
progressPercent: 100,
|
||||
completedAt: new Date(),
|
||||
};
|
||||
|
||||
if (dto.result) {
|
||||
updateData.result = dto.result as Prisma.InputJsonValue;
|
||||
}
|
||||
|
||||
const updatedJob = await this.prisma.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: updateData,
|
||||
});
|
||||
|
||||
// Emit completion event
|
||||
const event = await this.jobEvents.emitJobCompleted(jobId, {
|
||||
result: dto.result,
|
||||
tokensUsed: dto.tokensUsed,
|
||||
durationSeconds,
|
||||
});
|
||||
|
||||
// Broadcast via Herald
|
||||
await this.herald.broadcastJobEvent(jobId, event);
|
||||
|
||||
return updatedJob;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark job as failed from the coordinator
|
||||
* Mark job as failed from the coordinator using transaction with SELECT FOR UPDATE
|
||||
*/
|
||||
async failJob(
|
||||
jobId: string,
|
||||
@@ -260,42 +303,51 @@ export class CoordinatorIntegrationService {
|
||||
): Promise<Awaited<ReturnType<typeof this.prisma.runnerJob.update>>> {
|
||||
this.logger.log(`Failing job ${jobId}: ${dto.error}`);
|
||||
|
||||
// Verify job exists
|
||||
const job = await this.prisma.runnerJob.findUnique({
|
||||
where: { id: jobId },
|
||||
select: { id: true, status: true },
|
||||
});
|
||||
return this.prisma.$transaction(async (tx) => {
|
||||
// Lock the row to prevent concurrent completion/failure
|
||||
const jobs = await tx.$queryRaw<
|
||||
Array<{ id: string; status: RunnerJobStatus; version: number }>
|
||||
>`
|
||||
SELECT id, status, version
|
||||
FROM runner_jobs
|
||||
WHERE id = ${jobId}::uuid
|
||||
FOR UPDATE
|
||||
`;
|
||||
|
||||
if (!job) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found`);
|
||||
}
|
||||
if (!jobs || jobs.length === 0) {
|
||||
throw new NotFoundException(`RunnerJob with ID ${jobId} not found`);
|
||||
}
|
||||
|
||||
// Validate status transition
|
||||
if (!this.isValidStatusTransition(job.status, RunnerJobStatus.FAILED)) {
|
||||
throw new BadRequestException(`Cannot fail job with status ${job.status}`);
|
||||
}
|
||||
const job = jobs[0];
|
||||
|
||||
const updatedJob = await this.prisma.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: {
|
||||
status: RunnerJobStatus.FAILED,
|
||||
// Validate status transition
|
||||
if (!this.isValidStatusTransition(job.status, RunnerJobStatus.FAILED)) {
|
||||
throw new BadRequestException(`Cannot fail job with status ${job.status}`);
|
||||
}
|
||||
|
||||
const updatedJob = await tx.runnerJob.update({
|
||||
where: { id: jobId },
|
||||
data: {
|
||||
status: RunnerJobStatus.FAILED,
|
||||
error: dto.error,
|
||||
completedAt: new Date(),
|
||||
version: { increment: 1 },
|
||||
},
|
||||
});
|
||||
|
||||
// Emit failure event
|
||||
const event = await this.jobEvents.emitJobFailed(jobId, {
|
||||
error: dto.error,
|
||||
completedAt: new Date(),
|
||||
},
|
||||
gateResults: dto.gateResults,
|
||||
failedStep: dto.failedStep,
|
||||
continuationPrompt: dto.continuationPrompt,
|
||||
});
|
||||
|
||||
// Broadcast via Herald
|
||||
await this.herald.broadcastJobEvent(jobId, event);
|
||||
|
||||
return updatedJob;
|
||||
});
|
||||
|
||||
// Emit failure event
|
||||
const event = await this.jobEvents.emitJobFailed(jobId, {
|
||||
error: dto.error,
|
||||
gateResults: dto.gateResults,
|
||||
failedStep: dto.failedStep,
|
||||
continuationPrompt: dto.continuationPrompt,
|
||||
});
|
||||
|
||||
// Broadcast via Herald
|
||||
await this.herald.broadcastJobEvent(jobId, event);
|
||||
|
||||
return updatedJob;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,4 +1,15 @@
|
||||
import { IsString, IsOptional, IsNumber, IsObject, Min, Max, IsUUID, MinLength, MaxLength, IsInt } from "class-validator";
|
||||
import {
|
||||
IsString,
|
||||
IsOptional,
|
||||
IsNumber,
|
||||
IsObject,
|
||||
Min,
|
||||
Max,
|
||||
IsUUID,
|
||||
MinLength,
|
||||
MaxLength,
|
||||
IsInt,
|
||||
} from "class-validator";
|
||||
|
||||
/**
|
||||
* DTO for creating a job from the coordinator
|
||||
|
||||
Reference in New Issue
Block a user