Files
stack/apps/api/src/telemetry/telemetry.service.ts
Jason Woltje 51e6ad0792 feat(#131): add OpenTelemetry tracing infrastructure
Implement comprehensive distributed tracing for HTTP requests and LLM
operations using OpenTelemetry with GenAI semantic conventions.

Features:
- TelemetryService: SDK initialization with OTLP HTTP exporter
- TelemetryInterceptor: Automatic HTTP request spans
- @TraceLlmCall decorator: LLM operation tracing
- GenAI semantic conventions for model/token tracking
- Graceful degradation when tracing disabled

Instrumented:
- All HTTP requests (automatic spans)
- OllamaProvider chat/chatStream/embed operations
- Token counts, model names, durations

Environment:
- OTEL_ENABLED (default: true)
- OTEL_SERVICE_NAME (default: mosaic-api)
- OTEL_EXPORTER_OTLP_ENDPOINT (default: http://localhost:4318/v1/traces)

Tests: 23 passing with full coverage

Fixes #131

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 12:55:11 -06:00

183 lines
5.1 KiB
TypeScript

import { Injectable, OnModuleInit, OnModuleDestroy, Logger } from "@nestjs/common";
import { NodeSDK } from "@opentelemetry/sdk-node";
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { Resource } from "@opentelemetry/resources";
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
import type { Tracer, Span, SpanOptions } from "@opentelemetry/api";
import { trace, SpanStatusCode } from "@opentelemetry/api";
/**
 * Service responsible for OpenTelemetry distributed tracing.
 * Initializes the OTEL SDK with an OTLP HTTP trace exporter and provides
 * tracing utilities for HTTP requests and LLM operations.
 *
 * Configuration is read from the environment:
 * - `OTEL_ENABLED`: tracing is on unless this is exactly `"false"`.
 * - `OTEL_SERVICE_NAME`: service name resource attribute (default `mosaic-api`).
 * - `OTEL_EXPORTER_OTLP_ENDPOINT`: OTLP endpoint; `/v1/traces` is appended
 *   when the value does not already end with it.
 * - `OTEL_EXPORTER_JAEGER_ENDPOINT`: fallback endpoint, used verbatim.
 * - When neither endpoint is set: `http://localhost:4318/v1/traces`.
 *
 * @example
 * ```typescript
 * const span = telemetryService.startSpan('operation-name', {
 *   attributes: { 'custom.key': 'value' }
 * });
 * try {
 *   // Perform operation
 * } catch (error) {
 *   telemetryService.recordException(span, error);
 * } finally {
 *   span.end();
 * }
 * ```
 */
@Injectable()
export class TelemetryService implements OnModuleInit, OnModuleDestroy {
  /** Per-signal path that OTLP/HTTP receivers expect for trace export. */
  private static readonly TRACES_PATH = "/v1/traces";

  private readonly logger = new Logger(TelemetryService.name);
  private sdk?: NodeSDK;
  // Assigned eagerly so getTracer()/startSpan() never see `undefined` when a
  // consumer calls them before onModuleInit() has run.
  private tracer: Tracer = trace.getTracer("noop");
  private enabled: boolean;
  private serviceName: string;
  // Memoized so that repeated/concurrent destroy calls share one shutdown.
  private shutdownPromise?: Promise<void>;

  constructor() {
    // Opt-out semantics: only the literal string "false" disables tracing.
    this.enabled = process.env.OTEL_ENABLED !== "false";
    this.serviceName = process.env.OTEL_SERVICE_NAME ?? "mosaic-api";
  }

  /**
   * Initialize the OpenTelemetry SDK with configured exporters.
   * This is called automatically by NestJS when the module is initialized.
   *
   * Never throws: when tracing is disabled or SDK start-up fails, the service
   * keeps a tracer obtained via `trace.getTracer("noop")` instead.
   *
   * NOTE(review): auto-instrumentations usually have to be registered before
   * the instrumented modules are loaded; starting the SDK from a Nest
   * lifecycle hook may be too late for some libraries — confirm.
   */
  onModuleInit(): void {
    if (!this.enabled) {
      this.logger.log("OpenTelemetry tracing is disabled");
      this.tracer = trace.getTracer("noop");
      return;
    }

    try {
      const exporter = this.createExporter();
      const resource = new Resource({
        [ATTR_SERVICE_NAME]: this.serviceName,
      });

      this.sdk = new NodeSDK({
        resource,
        traceExporter: exporter,
        instrumentations: [
          getNodeAutoInstrumentations({
            "@opentelemetry/instrumentation-fs": {
              enabled: false, // Disable file system instrumentation to reduce noise
            },
          }),
        ],
      });
      this.sdk.start();

      this.tracer = trace.getTracer(this.serviceName);
      this.logger.log(`OpenTelemetry SDK started for service: ${this.serviceName}`);
    } catch (error) {
      this.logger.error("Failed to initialize OpenTelemetry SDK", error);
      // Fallback to noop tracer to prevent application failures
      this.tracer = trace.getTracer("noop");
    }
  }

  /**
   * Shutdown the OpenTelemetry SDK gracefully.
   * This is called automatically by NestJS when the module is destroyed.
   *
   * Resolves immediately when no SDK was created. Shutdown errors are logged
   * and swallowed so they cannot break application teardown.
   */
  async onModuleDestroy(): Promise<void> {
    const sdk = this.sdk;
    if (!sdk) {
      return;
    }

    // Prevent multiple concurrent shutdowns
    if (this.shutdownPromise) {
      return this.shutdownPromise;
    }

    this.shutdownPromise = (async () => {
      try {
        await sdk.shutdown();
        this.logger.log("OpenTelemetry SDK shut down successfully");
      } catch (error) {
        this.logger.error("Error shutting down OpenTelemetry SDK", error);
      }
    })();

    return this.shutdownPromise;
  }

  /**
   * Get the tracer instance for creating spans.
   *
   * @returns The tracer currently held by the service; always defined, also
   * before `onModuleInit` has run
   */
  getTracer(): Tracer {
    return this.tracer;
  }

  /**
   * Start a new span with the given name and options.
   * The caller owns the span and must call `span.end()`.
   *
   * @param name - The name of the span
   * @param options - Optional span configuration
   * @returns A new span instance
   *
   * @example
   * ```typescript
   * const span = telemetryService.startSpan('database-query', {
   *   attributes: {
   *     'db.system': 'postgresql',
   *     'db.statement': 'SELECT * FROM users'
   *   }
   * });
   * ```
   */
  startSpan(name: string, options?: SpanOptions): Span {
    return this.tracer.startSpan(name, options);
  }

  /**
   * Record an exception on a span and set its status to error.
   * Does not end the span.
   *
   * Accepts `unknown` so the value of a `catch` clause can be passed straight
   * through; non-`Error` values are wrapped in an `Error` first.
   *
   * @param span - The span to record the exception on
   * @param error - The thrown value to record
   *
   * @example
   * ```typescript
   * try {
   *   // Some operation
   * } catch (error) {
   *   telemetryService.recordException(span, error);
   *   throw error;
   * }
   * ```
   */
  recordException(span: Span, error: unknown): void {
    const exception = TelemetryService.toError(error);
    span.recordException(exception);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: exception.message,
    });
  }

  /**
   * Create the trace exporter based on environment configuration.
   * Uses the OTLP HTTP exporter.
   *
   * Endpoint resolution order (blank values count as unset):
   * 1. `OTEL_EXPORTER_OTLP_ENDPOINT`, with `/v1/traces` appended if missing,
   *    because an explicit `url` is not expanded by the exporter itself.
   * 2. `OTEL_EXPORTER_JAEGER_ENDPOINT`, used verbatim.
   * 3. `http://localhost:4318/v1/traces`.
   *
   * @returns Configured trace exporter
   */
  private createExporter(): OTLPTraceExporter {
    const otlpBase = TelemetryService.nonBlank(process.env.OTEL_EXPORTER_OTLP_ENDPOINT);
    const jaegerEndpoint = TelemetryService.nonBlank(process.env.OTEL_EXPORTER_JAEGER_ENDPOINT);

    const otlpEndpoint =
      otlpBase !== undefined
        ? TelemetryService.withTracesPath(otlpBase)
        : (jaegerEndpoint ?? "http://localhost:4318/v1/traces");

    this.logger.log(`Using OTLP HTTP exporter: ${otlpEndpoint}`);
    return new OTLPTraceExporter({
      url: otlpEndpoint,
    });
  }

  /** Return the trimmed value, or `undefined` when it is missing or blank. */
  private static nonBlank(value: string | undefined): string | undefined {
    const trimmed = value?.trim();
    return trimmed ? trimmed : undefined;
  }

  /** Ensure an OTLP endpoint targets the traces signal path. */
  private static withTracesPath(endpoint: string): string {
    const base = endpoint.replace(/\/+$/, "");
    // Leave endpoints that already point at the traces path untouched.
    return base.endsWith(TelemetryService.TRACES_PATH)
      ? endpoint
      : `${base}${TelemetryService.TRACES_PATH}`;
  }

  /** Convert an arbitrary thrown value into an `Error` with a readable message. */
  private static toError(value: unknown): Error {
    if (value instanceof Error) {
      return value;
    }
    if (typeof value === "object" && value !== null) {
      try {
        // Plain objects stringify to "[object Object]"; JSON keeps their content.
        return new Error(JSON.stringify(value));
      } catch {
        // Circular or otherwise unserializable: fall through to String().
      }
    }
    return new Error(String(value));
  }
}