feat(#131): add OpenTelemetry tracing infrastructure
Implement comprehensive distributed tracing for HTTP requests and LLM operations using OpenTelemetry with GenAI semantic conventions.

Features:
- TelemetryService: SDK initialization with OTLP HTTP exporter
- TelemetryInterceptor: Automatic HTTP request spans
- @TraceLlmCall decorator: LLM operation tracing
- GenAI semantic conventions for model/token tracking
- Graceful degradation when tracing is disabled

Instrumented:
- All HTTP requests (automatic spans)
- OllamaProvider chat/chatStream/embed operations
- Token counts, model names, durations

Environment:
- OTEL_ENABLED (default: true)
- OTEL_SERVICE_NAME (default: mosaic-api)
- OTEL_EXPORTER_OTLP_ENDPOINT (default: http://localhost:4318/v1/traces)

Tests: 23 passing with full coverage

Fixes #131

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
182
apps/api/src/telemetry/telemetry.service.ts
Normal file
182
apps/api/src/telemetry/telemetry.service.ts
Normal file
@@ -0,0 +1,182 @@
|
||||
import { Injectable, OnModuleInit, OnModuleDestroy, Logger } from "@nestjs/common";
|
||||
import { NodeSDK } from "@opentelemetry/sdk-node";
|
||||
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
|
||||
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
||||
import { Resource } from "@opentelemetry/resources";
|
||||
import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
|
||||
import type { Tracer, Span, SpanOptions } from "@opentelemetry/api";
|
||||
import { trace, SpanStatusCode } from "@opentelemetry/api";
|
||||
|
||||
/**
 * Service responsible for OpenTelemetry distributed tracing.
 *
 * On module init it starts the OpenTelemetry Node SDK with an OTLP HTTP trace
 * exporter and Node auto-instrumentations (file-system instrumentation is
 * turned off). It also exposes small helpers for creating spans and recording
 * errors on them.
 *
 * Configuration is read from the environment:
 * - `OTEL_ENABLED`: tracing is on unless this is exactly `"false"`.
 * - `OTEL_SERVICE_NAME`: service name resource attribute (default `mosaic-api`).
 * - `OTEL_EXPORTER_OTLP_ENDPOINT`, then `OTEL_EXPORTER_JAEGER_ENDPOINT`:
 *   collector URL (default `http://localhost:4318/v1/traces`).
 *
 * @example
 * ```typescript
 * const span = telemetryService.startSpan('operation-name', {
 *   attributes: { 'custom.key': 'value' }
 * });
 * try {
 *   // Perform operation
 * } catch (error) {
 *   telemetryService.recordException(span, error);
 * } finally {
 *   span.end();
 * }
 * ```
 */
@Injectable()
export class TelemetryService implements OnModuleInit, OnModuleDestroy {
  /** Collector URL used when no endpoint is configured in the environment. */
  private static readonly DEFAULT_OTLP_ENDPOINT = "http://localhost:4318/v1/traces";

  /** Signal path for traces on an OTLP/HTTP collector. */
  private static readonly OTLP_TRACES_PATH = "/v1/traces";

  private readonly logger = new Logger(TelemetryService.name);
  private sdk?: NodeSDK;
  // Assigned in onModuleInit; accessors fall back to the API tracer until then.
  private tracer?: Tracer;
  private enabled: boolean;
  private serviceName: string;
  private shutdownPromise?: Promise<void>;

  constructor() {
    this.enabled = process.env.OTEL_ENABLED !== "false";
    this.serviceName = process.env.OTEL_SERVICE_NAME ?? "mosaic-api";
  }

  /**
   * Initialize the OpenTelemetry SDK with the configured exporter.
   * This is called automatically by NestJS when the module is initialized.
   *
   * When tracing is disabled, or when SDK start-up fails, the service falls
   * back to the tracer named "noop" so callers can keep creating spans
   * without special-casing. Start-up failures are logged, not rethrown.
   */
  onModuleInit(): void {
    if (!this.enabled) {
      this.logger.log("OpenTelemetry tracing is disabled");
      this.tracer = trace.getTracer("noop");
      return;
    }

    try {
      const exporter = this.createExporter();
      const resource = new Resource({
        [ATTR_SERVICE_NAME]: this.serviceName,
      });

      this.sdk = new NodeSDK({
        resource,
        traceExporter: exporter,
        instrumentations: [
          getNodeAutoInstrumentations({
            "@opentelemetry/instrumentation-fs": {
              enabled: false, // Disable file system instrumentation to reduce noise
            },
          }),
        ],
      });

      this.sdk.start();
      this.tracer = trace.getTracer(this.serviceName);

      this.logger.log(`OpenTelemetry SDK started for service: ${this.serviceName}`);
    } catch (error) {
      this.logger.error("Failed to initialize OpenTelemetry SDK", error);
      // Fallback to noop tracer to prevent application failures
      this.tracer = trace.getTracer("noop");
    }
  }

  /**
   * Shutdown the OpenTelemetry SDK gracefully.
   * This is called automatically by NestJS when the module is destroyed.
   *
   * Does nothing if the SDK was never created. Repeated or concurrent calls
   * share one shutdown promise, so `sdk.shutdown()` runs at most once.
   * Shutdown errors are logged and swallowed so they do not block teardown.
   */
  async onModuleDestroy(): Promise<void> {
    const sdk = this.sdk;
    if (!sdk) {
      return;
    }

    // Prevent multiple concurrent shutdowns
    if (!this.shutdownPromise) {
      this.shutdownPromise = (async () => {
        try {
          await sdk.shutdown();
          this.logger.log("OpenTelemetry SDK shut down successfully");
        } catch (error) {
          this.logger.error("Error shutting down OpenTelemetry SDK", error);
        }
      })();
    }

    return this.shutdownPromise;
  }

  /**
   * Get the tracer instance for creating spans.
   *
   * If called before `onModuleInit` has run, a tracer named "noop" is
   * obtained from the OpenTelemetry API instead of returning `undefined`.
   *
   * @returns The configured tracer instance
   */
  getTracer(): Tracer {
    return this.tracer ?? trace.getTracer("noop");
  }

  /**
   * Start a new span with the given name and options.
   * The caller is responsible for ending the returned span.
   *
   * @param name - The name of the span
   * @param options - Optional span configuration
   * @returns A new span instance
   *
   * @example
   * ```typescript
   * const span = telemetryService.startSpan('database-query', {
   *   attributes: {
   *     'db.system': 'postgresql',
   *     'db.statement': 'SELECT * FROM users'
   *   }
   * });
   * ```
   */
  startSpan(name: string, options?: SpanOptions): Span {
    return this.getTracer().startSpan(name, options);
  }

  /**
   * Record an exception on a span and set its status to error.
   *
   * Accepts `unknown` so the value of a `catch` clause can be passed
   * directly. Values that are not `Error` instances are wrapped in an
   * `Error` so the span status always carries a string message.
   *
   * @param span - The span to record the exception on
   * @param error - The thrown value to record
   *
   * @example
   * ```typescript
   * try {
   *   // Some operation
   * } catch (error) {
   *   telemetryService.recordException(span, error);
   *   throw error;
   * }
   * ```
   */
  recordException(span: Span, error: unknown): void {
    const exception = TelemetryService.toError(error);
    span.recordException(exception);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: exception.message,
    });
  }

  /**
   * Create the trace exporter based on environment configuration.
   * Uses the OTLP HTTP exporter.
   *
   * The endpoint is the first non-blank value of
   * `OTEL_EXPORTER_OTLP_ENDPOINT`, `OTEL_EXPORTER_JAEGER_ENDPOINT`, or the
   * built-in default. Blank values are skipped so an empty variable does not
   * produce an empty exporter URL.
   *
   * @returns Configured trace exporter
   */
  private createExporter(): OTLPTraceExporter {
    const configured = [
      process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
      process.env.OTEL_EXPORTER_JAEGER_ENDPOINT,
    ]
      .map((value) => value?.trim())
      .find((value) => value !== undefined && value !== "");

    const otlpEndpoint = TelemetryService.resolveTracesUrl(
      configured ?? TelemetryService.DEFAULT_OTLP_ENDPOINT
    );

    this.logger.log(`Using OTLP HTTP exporter: ${otlpEndpoint}`);
    return new OTLPTraceExporter({
      url: otlpEndpoint,
    });
  }

  /**
   * Turn a configured endpoint into the URL handed to the exporter.
   *
   * `OTEL_EXPORTER_OTLP_ENDPOINT` is commonly set to a bare collector origin
   * such as `http://collector:4318`, but the exporter's `url` option is used
   * verbatim. When the endpoint has no path, the traces signal path is
   * appended. Endpoints that already have a path, or that are not absolute
   * http(s) URLs, are returned unchanged.
   */
  private static resolveTracesUrl(endpoint: string): string {
    try {
      const url = new URL(endpoint);
      const isHttp = url.protocol === "http:" || url.protocol === "https:";
      if (isHttp && (url.pathname === "" || url.pathname === "/")) {
        url.pathname = TelemetryService.OTLP_TRACES_PATH;
        return url.toString();
      }
    } catch {
      // Not parseable as an absolute URL; hand it to the exporter untouched.
    }
    return endpoint;
  }

  /** Convert an arbitrary thrown value into an `Error` with a readable message. */
  private static toError(value: unknown): Error {
    if (value instanceof Error) {
      return value;
    }
    if (typeof value === "string") {
      return new Error(value);
    }

    let message: string;
    try {
      // JSON.stringify yields undefined for undefined/functions/symbols.
      const serialized: string | undefined = JSON.stringify(value);
      message = serialized ?? String(value);
    } catch {
      // Circular structures and BigInt values cannot be serialized.
      message = String(value);
    }
    return new Error(message);
  }
}
|
||||
Reference in New Issue
Block a user