import { Injectable, OnModuleInit, OnModuleDestroy, Logger } from "@nestjs/common";
import { NodeSDK } from "@opentelemetry/sdk-node";
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { Resource } from "@opentelemetry/resources";
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
// Deployment environment is not yet in the stable semantic conventions.
// Using the semantic conventions format for consistency.
const ATTR_DEPLOYMENT_ENVIRONMENT = "deployment.environment" as const;
import { ParentBasedSampler, TraceIdRatioBasedSampler } from "@opentelemetry/sdk-trace-base";
import type { Tracer, Span, SpanOptions } from "@opentelemetry/api";
import { trace, SpanStatusCode } from "@opentelemetry/api";
import { readFileSync } from "fs";
import { join } from "path";

/**
 * Service responsible for OpenTelemetry distributed tracing.
 * Initializes the OTEL SDK with an OTLP HTTP exporter and provides
 * tracing utilities (tracer access, span creation, exception recording).
 *
 * Configuration is read from environment variables:
 * - `OTEL_ENABLED` — tracing is disabled only when this is exactly `"false"`.
 * - `OTEL_SERVICE_NAME` — service name, defaults to `"mosaic-api"`.
 * - `OTEL_DEPLOYMENT_ENVIRONMENT` / `NODE_ENV` — deployment environment.
 * - `OTEL_TRACES_SAMPLER_ARG` — root sampling ratio in `[0, 1]`.
 * - `OTEL_EXPORTER_OTLP_ENDPOINT` / `OTEL_EXPORTER_JAEGER_ENDPOINT` — exporter URL.
 *
 * @example
 * ```typescript
 * const span = telemetryService.startSpan('operation-name', {
 *   attributes: { 'custom.key': 'value' }
 * });
 * try {
 *   // Perform operation
 * } catch (error) {
 *   telemetryService.recordException(span, error);
 * } finally {
 *   span.end();
 * }
 * ```
 */
@Injectable()
export class TelemetryService implements OnModuleInit, OnModuleDestroy {
  private readonly logger = new Logger(TelemetryService.name);
  /** The running SDK instance; stays undefined when tracing is disabled. */
  private sdk?: NodeSDK;
  /** Assigned in `onModuleInit` on every code path (enabled, disabled, or failed). */
  private tracer!: Tracer;
  private enabled: boolean;
  private serviceName: string;
  /** Memoized shutdown so repeated `onModuleDestroy` calls share one shutdown. */
  private shutdownPromise?: Promise<void>;

  constructor() {
    // Tracing is opt-out: anything other than the literal "false" enables it.
    this.enabled = process.env.OTEL_ENABLED !== "false";
    this.serviceName = process.env.OTEL_SERVICE_NAME ?? "mosaic-api";
  }

  /**
   * Get the service version from the package.json located two directories
   * above this file's directory.
   * Falls back to '0.0.0' when the file cannot be read or parsed, or when it
   * has no `version` field.
   *
   * @returns The service version string
   */
  private getServiceVersion(): string {
    try {
      const packageJsonPath = join(__dirname, "..", "..", "package.json");
      // eslint-disable-next-line security/detect-non-literal-fs-filename -- Safe: reading local package.json
      const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8")) as {
        version?: string;
      };
      return packageJson.version ?? "0.0.0";
    } catch (error) {
      this.logger.warn("Failed to read service version from package.json", error);
      return "0.0.0";
    }
  }

  /**
   * Get the deployment environment.
   * `OTEL_DEPLOYMENT_ENVIRONMENT` takes precedence over `NODE_ENV`;
   * defaults to 'development' if neither is set.
   *
   * @returns The deployment environment string
   */
  private getDeploymentEnvironment(): string {
    return process.env.OTEL_DEPLOYMENT_ENVIRONMENT ?? process.env.NODE_ENV ?? "development";
  }

  /**
   * Get the trace sampling ratio from `OTEL_TRACES_SAMPLER_ARG`.
   * Defaults to 1.0 (sample all traces) when the variable is unset, empty,
   * or not parseable as a number. Out-of-range values are clamped to
   * [0.0, 1.0] and a warning is logged.
   *
   * @returns The sampling ratio between 0.0 and 1.0
   */
  private getSamplingRatio(): number {
    const envValue = process.env.OTEL_TRACES_SAMPLER_ARG;
    if (!envValue) {
      return 1.0; // Default: sample all traces
    }

    const parsed = parseFloat(envValue);
    if (Number.isNaN(parsed)) {
      this.logger.warn(`Invalid OTEL_TRACES_SAMPLER_ARG value: ${envValue}, using default 1.0`);
      return 1.0;
    }

    // Clamp to valid range
    const clamped = Math.max(0.0, Math.min(1.0, parsed));
    if (clamped !== parsed) {
      this.logger.warn(
        `OTEL_TRACES_SAMPLER_ARG clamped from ${String(parsed)} to ${String(clamped)}`
      );
    }
    return clamped;
  }

  /**
   * Initialize the OpenTelemetry SDK with the configured exporter, resource
   * attributes, and sampler.
   * This is called automatically by NestJS when the module is initialized.
   *
   * Never throws: when tracing is disabled or SDK start-up fails, the service
   * falls back to a tracer obtained from the global API under the name "noop"
   * so callers can keep using `startSpan` unconditionally.
   */
  onModuleInit(): void {
    if (!this.enabled) {
      this.logger.log("OpenTelemetry tracing is disabled");
      this.tracer = trace.getTracer("noop");
      return;
    }

    try {
      const exporter = this.createExporter();
      const serviceVersion = this.getServiceVersion();
      const deploymentEnvironment = this.getDeploymentEnvironment();
      const samplingRatio = this.getSamplingRatio();

      const resource = new Resource({
        [ATTR_SERVICE_NAME]: this.serviceName,
        [ATTR_SERVICE_VERSION]: serviceVersion,
        [ATTR_DEPLOYMENT_ENVIRONMENT]: deploymentEnvironment,
      });

      // Parent-based strategy: the ratio sampler only decides for root spans.
      const sampler = new ParentBasedSampler({
        root: new TraceIdRatioBasedSampler(samplingRatio),
      });

      this.sdk = new NodeSDK({
        resource,
        sampler,
        traceExporter: exporter,
        instrumentations: [
          getNodeAutoInstrumentations({
            "@opentelemetry/instrumentation-fs": {
              enabled: false, // Disable file system instrumentation to reduce noise
            },
          }),
        ],
      });

      this.sdk.start();
      this.tracer = trace.getTracer(this.serviceName);

      this.logger.log(
        `OpenTelemetry SDK started for service: ${this.serviceName} v${serviceVersion} (${deploymentEnvironment}, sampling: ${String(samplingRatio)})`
      );
    } catch (error) {
      this.logger.error("Failed to initialize OpenTelemetry SDK", error);
      // Fallback to noop tracer to prevent application failures
      this.tracer = trace.getTracer("noop");
    }
  }

  /**
   * Shutdown the OpenTelemetry SDK gracefully.
   * This is called automatically by NestJS when the module is destroyed.
   *
   * Does nothing when no SDK was created. Repeated calls return the same
   * in-flight (or already settled) shutdown promise. Shutdown errors are
   * logged, not rethrown, so the returned promise always resolves.
   */
  async onModuleDestroy(): Promise<void> {
    if (!this.sdk) {
      return;
    }

    // Prevent multiple concurrent shutdowns
    if (this.shutdownPromise) {
      return this.shutdownPromise;
    }

    this.shutdownPromise = (async () => {
      try {
        if (this.sdk) {
          await this.sdk.shutdown();
        }
        this.logger.log("OpenTelemetry SDK shut down successfully");
      } catch (error) {
        this.logger.error("Error shutting down OpenTelemetry SDK", error);
      }
    })();

    return this.shutdownPromise;
  }

  /**
   * Get the tracer instance for creating spans.
   *
   * @returns The tracer assigned during `onModuleInit`
   */
  getTracer(): Tracer {
    return this.tracer;
  }

  /**
   * Start a new span with the given name and options.
   * The caller is responsible for calling `span.end()`.
   *
   * @param name - The name of the span
   * @param options - Optional span configuration
   * @returns A new span instance
   *
   * @example
   * ```typescript
   * const span = telemetryService.startSpan('database-query', {
   *   attributes: {
   *     'db.system': 'postgresql',
   *     'db.statement': 'SELECT * FROM users'
   *   }
   * });
   * ```
   */
  startSpan(name: string, options?: SpanOptions): Span {
    return this.tracer.startSpan(name, options);
  }

  /**
   * Record an exception on a span and set its status to error.
   *
   * Accepts `unknown` so the value from a `catch` clause can be passed
   * directly under strict TypeScript settings. `Error` instances are recorded
   * as-is; any other thrown value is wrapped in `new Error(String(value))`.
   *
   * @param span - The span to record the exception on
   * @param error - The thrown value to record
   *
   * @example
   * ```typescript
   * try {
   *   // Some operation
   * } catch (error) {
   *   telemetryService.recordException(span, error);
   *   throw error;
   * }
   * ```
   */
  recordException(span: Span, error: unknown): void {
    const exception = error instanceof Error ? error : new Error(String(error));
    span.recordException(exception);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: exception.message,
    });
  }

  /**
   * Create the trace exporter based on environment configuration.
   * Uses the OTLP HTTP exporter. The URL is taken from
   * `OTEL_EXPORTER_OTLP_ENDPOINT`, then `OTEL_EXPORTER_JAEGER_ENDPOINT`,
   * then defaults to "http://localhost:4318/v1/traces".
   *
   * NOTE(review): the value is passed verbatim as the exporter `url`, so it
   * presumably must include the full traces path (e.g. `/v1/traces`) rather
   * than a bare collector base URL — confirm against deployment config.
   *
   * @returns Configured trace exporter
   */
  private createExporter(): OTLPTraceExporter {
    const otlpEndpoint =
      process.env.OTEL_EXPORTER_OTLP_ENDPOINT ??
      process.env.OTEL_EXPORTER_JAEGER_ENDPOINT ??
      "http://localhost:4318/v1/traces";

    this.logger.log(`Using OTLP HTTP exporter: ${otlpEndpoint}`);

    return new OTLPTraceExporter({
      url: otlpEndpoint,
    });
  }
}