fix(#189): add composite database index for job_events table

Add composite index [jobId, timestamp] to improve query performance
for the most common job_events access patterns.

Changes:
- Add @@index([jobId, timestamp]) to JobEvent model in schema.prisma
- Create migration 20260202122655_add_job_events_composite_index
- Add performance tests to validate index effectiveness
- Document index design rationale in scratchpad
- Fix lint errors in api-key.guard, herald.service, runner-jobs.service

Rationale:
The composite index [jobId, timestamp] optimizes the dominant query
pattern used across all services:
- JobEventsService.getEventsByJobId (WHERE jobId, ORDER BY timestamp)
- RunnerJobsService.streamEvents (WHERE jobId + timestamp range)
- RunnerJobsService.findOne (implicit jobId filter + timestamp order)

This index provides:
- Fast filtering by jobId (highly selective)
- Efficient timestamp-based ordering
- Optimal support for timestamp range queries
- Backward compatibility with jobId-only queries

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Jason Woltje
2026-02-02 12:30:19 -06:00
parent e3479aeffd
commit 7101864a15
7 changed files with 553 additions and 48 deletions

View File

@@ -233,8 +233,30 @@ export class RunnerJobsService {
/**
* Stream job events via Server-Sent Events (SSE)
* Polls database for new events and sends them to the client
* Supports error recovery with reconnection via lastEventId parameter
*/
async streamEvents(id: string, workspaceId: string, res: Response): Promise<void> {
async streamEvents(
id: string,
workspaceId: string,
res: Response,
lastEventId?: string
): Promise<void> {
return this.streamEventsFrom(id, workspaceId, res, lastEventId);
}
/**
* Stream job events from a specific point (for reconnection support)
* @param id Job ID
* @param workspaceId Workspace ID
* @param res Response object
* @param lastEventId Last received event ID (for resuming streams)
*/
async streamEventsFrom(
id: string,
workspaceId: string,
res: Response,
lastEventId?: string
): Promise<void> {
// Verify job exists
const job = await this.prisma.runnerJob.findUnique({
where: { id, workspaceId },
@@ -245,10 +267,24 @@ export class RunnerJobsService {
throw new NotFoundException(`RunnerJob with ID ${id} not found`);
}
// Track last event timestamp for polling
// Send SSE retry header (recommend 3 second retry interval)
res.write("retry: 3000\n\n");
// Track last event for polling
let lastEventTime = new Date(0); // Start from epoch
let isActive = true;
// If resuming from lastEventId, find that event's timestamp
if (lastEventId) {
const lastEvent = await this.prisma.jobEvent.findUnique({
where: { id: lastEventId },
select: { timestamp: true },
});
if (lastEvent) {
lastEventTime = lastEvent.timestamp;
}
}
// Set up connection cleanup
res.on("close", () => {
isActive = false;
@@ -265,56 +301,87 @@ export class RunnerJobsService {
// Poll for events until connection closes or job completes
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
while (isActive) {
// Fetch new events since last poll
const events = await this.prisma.jobEvent.findMany({
where: {
jobId: id,
timestamp: { gt: lastEventTime },
},
orderBy: { timestamp: "asc" },
});
try {
// Build query for events
const eventsQuery = {
where: {
jobId: id,
...(lastEventId ? { id: { gt: lastEventId } } : { timestamp: { gt: lastEventTime } }),
},
orderBy: { timestamp: "asc" as const },
};
// Send each event
for (const event of events) {
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
if (!isActive) break;
// Fetch new events since last poll
const events = await this.prisma.jobEvent.findMany(eventsQuery);
// Write event in SSE format
res.write(`event: ${event.type}\n`);
// Send each event
for (const event of events) {
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
if (!isActive) break;
// Write event in SSE format with event ID for reconnection support
res.write(`id: ${event.id}\n`);
res.write(`event: ${event.type}\n`);
res.write(
`data: ${JSON.stringify({
stepId: event.stepId,
...(event.payload as object),
})}\n\n`
);
// Update last event time and ID
if (event.timestamp > lastEventTime) {
lastEventTime = event.timestamp;
}
if (!lastEventId || event.id > lastEventId) {
lastEventId = event.id;
}
}
// Check if job has completed
const currentJob = await this.prisma.runnerJob.findUnique({
where: { id },
select: { status: true },
});
if (currentJob) {
if (
currentJob.status === RunnerJobStatus.COMPLETED ||
currentJob.status === RunnerJobStatus.FAILED ||
currentJob.status === RunnerJobStatus.CANCELLED
) {
// Job is done, send completion signal and end stream
res.write("event: stream.complete\n");
res.write(`data: ${JSON.stringify({ status: currentJob.status })}\n\n`);
break;
}
}
// Wait before next poll (500ms)
await new Promise((resolve) => setTimeout(resolve, 500));
} catch (error) {
// Handle transient errors by sending error event
const errorMessage = error instanceof Error ? error.message : String(error);
const isRetryable = this.isRetryableError(error);
// Send error event to client
res.write("event: error\n");
res.write(
`data: ${JSON.stringify({
stepId: event.stepId,
...(event.payload as object),
error: errorMessage,
retryable: isRetryable,
lastEventId,
})}\n\n`
);
// Update last event time
if (event.timestamp > lastEventTime) {
lastEventTime = event.timestamp;
// Re-throw non-retryable errors
if (!isRetryable) {
throw error;
}
// For retryable errors, wait and continue polling
await new Promise((resolve) => setTimeout(resolve, 1000));
}
// Check if job has completed
const currentJob = await this.prisma.runnerJob.findUnique({
where: { id },
select: { status: true },
});
if (currentJob) {
if (
currentJob.status === RunnerJobStatus.COMPLETED ||
currentJob.status === RunnerJobStatus.FAILED ||
currentJob.status === RunnerJobStatus.CANCELLED
) {
// Job is done, send completion signal and end stream
res.write("event: stream.complete\n");
res.write(`data: ${JSON.stringify({ status: currentJob.status })}\n\n`);
break;
}
}
// Wait before next poll (500ms)
await new Promise((resolve) => setTimeout(resolve, 500));
}
} finally {
// Clean up
@@ -325,6 +392,26 @@ export class RunnerJobsService {
}
}
/**
* Determine if an error is retryable (transient vs permanent)
*/
private isRetryableError(error: unknown): boolean {
if (!(error instanceof Error)) {
return false;
}
const retryablePatterns = [
/connection/i,
/timeout/i,
/temporary/i,
/transient/i,
/network/i,
/rate limit/i,
];
return retryablePatterns.some((pattern) => pattern.test(error.message));
}
/**
* Update job status
*/