Implement comprehensive distributed tracing for HTTP requests and LLM operations using OpenTelemetry with GenAI semantic conventions.

Features:
- TelemetryService: SDK initialization with OTLP HTTP exporter
- TelemetryInterceptor: Automatic HTTP request spans
- @TraceLlmCall decorator: LLM operation tracing
- GenAI semantic conventions for model/token tracking
- Graceful degradation when tracing disabled

Instrumented:
- All HTTP requests (automatic spans)
- OllamaProvider chat/chatStream/embed operations
- Token counts, model names, durations

Environment:
- OTEL_ENABLED (default: true)
- OTEL_SERVICE_NAME (default: mosaic-api)
- OTEL_EXPORTER_OTLP_ENDPOINT (default: localhost:4318)

Tests: 23 passing with full coverage

Fixes #131

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
313 lines
8.6 KiB
TypeScript
import { Logger } from "@nestjs/common";
|
|
import { Ollama, type Message } from "ollama";
|
|
import type {
|
|
LlmProviderInterface,
|
|
LlmProviderConfig,
|
|
LlmProviderHealthStatus,
|
|
} from "./llm-provider.interface";
|
|
import type { ChatRequestDto, ChatResponseDto, EmbedRequestDto, EmbedResponseDto } from "../dto";
|
|
import { TraceLlmCall, createLlmSpan } from "../../telemetry";
|
|
import { SpanStatusCode } from "@opentelemetry/api";
|
|
|
|
/**
|
|
* Configuration for Ollama LLM provider.
|
|
* Extends base LlmProviderConfig with Ollama-specific options.
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* const config: OllamaProviderConfig = {
|
|
* endpoint: "http://localhost:11434",
|
|
* timeout: 30000
|
|
* };
|
|
* ```
|
|
*/
|
|
export interface OllamaProviderConfig extends LlmProviderConfig {
|
|
/**
|
|
* Ollama server endpoint URL
|
|
* @default "http://localhost:11434"
|
|
*/
|
|
endpoint: string;
|
|
|
|
/**
|
|
* Request timeout in milliseconds
|
|
* @default 30000
|
|
*/
|
|
timeout?: number;
|
|
}
|
|
|
|
/**
|
|
* Ollama LLM provider implementation.
|
|
* Provides integration with locally-hosted or remote Ollama instances.
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* const provider = new OllamaProvider({
|
|
* endpoint: "http://localhost:11434",
|
|
* timeout: 30000
|
|
* });
|
|
*
|
|
* await provider.initialize();
|
|
*
|
|
* const response = await provider.chat({
|
|
* model: "llama2",
|
|
* messages: [{ role: "user", content: "Hello" }]
|
|
* });
|
|
* ```
|
|
*/
|
|
export class OllamaProvider implements LlmProviderInterface {
|
|
readonly name = "Ollama";
|
|
readonly type = "ollama" as const;
|
|
|
|
private readonly logger = new Logger(OllamaProvider.name);
|
|
private readonly client: Ollama;
|
|
private readonly config: OllamaProviderConfig;
|
|
|
|
/**
|
|
* Creates a new Ollama provider instance.
|
|
*
|
|
* @param config - Ollama provider configuration
|
|
*/
|
|
constructor(config: OllamaProviderConfig) {
|
|
this.config = {
|
|
...config,
|
|
timeout: config.timeout ?? 30000,
|
|
};
|
|
|
|
this.client = new Ollama({ host: this.config.endpoint });
|
|
this.logger.log(`Ollama provider initialized with endpoint: ${this.config.endpoint}`);
|
|
}
|
|
|
|
/**
|
|
* Initialize the Ollama provider.
|
|
* This is a no-op for Ollama as the client is initialized in the constructor.
|
|
*/
|
|
async initialize(): Promise<void> {
|
|
// Ollama client is initialized in constructor
|
|
// No additional setup required
|
|
}
|
|
|
|
/**
|
|
* Check if the Ollama server is healthy and reachable.
|
|
*
|
|
* @returns Health status with available models if healthy
|
|
*/
|
|
async checkHealth(): Promise<LlmProviderHealthStatus> {
|
|
try {
|
|
const response = await this.client.list();
|
|
const models = response.models.map((m) => m.name);
|
|
|
|
return {
|
|
healthy: true,
|
|
provider: "ollama",
|
|
endpoint: this.config.endpoint,
|
|
models,
|
|
};
|
|
} catch (error: unknown) {
|
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
this.logger.warn(`Ollama health check failed: ${errorMessage}`);
|
|
|
|
return {
|
|
healthy: false,
|
|
provider: "ollama",
|
|
endpoint: this.config.endpoint,
|
|
error: errorMessage,
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* List all available models from the Ollama server.
|
|
*
|
|
* @returns Array of model names
|
|
* @throws {Error} If the request fails
|
|
*/
|
|
async listModels(): Promise<string[]> {
|
|
try {
|
|
const response = await this.client.list();
|
|
return response.models.map((m) => m.name);
|
|
} catch (error: unknown) {
|
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
this.logger.error(`Failed to list models: ${errorMessage}`);
|
|
throw new Error(`Failed to list models: ${errorMessage}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Perform a synchronous chat completion.
|
|
*
|
|
* @param request - Chat request with messages and configuration
|
|
* @returns Complete chat response
|
|
* @throws {Error} If the request fails
|
|
*/
|
|
@TraceLlmCall({ system: "ollama", operation: "chat" })
|
|
async chat(request: ChatRequestDto): Promise<ChatResponseDto> {
|
|
try {
|
|
const messages = this.buildMessages(request);
|
|
const options = this.buildChatOptions(request);
|
|
|
|
const response = await this.client.chat({
|
|
model: request.model,
|
|
messages,
|
|
stream: false,
|
|
options,
|
|
});
|
|
|
|
return {
|
|
model: response.model,
|
|
message: {
|
|
role: response.message.role as "assistant",
|
|
content: response.message.content,
|
|
},
|
|
done: response.done,
|
|
totalDuration: response.total_duration,
|
|
promptEvalCount: response.prompt_eval_count,
|
|
evalCount: response.eval_count,
|
|
};
|
|
} catch (error: unknown) {
|
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
this.logger.error(`Chat completion failed: ${errorMessage}`);
|
|
throw new Error(`Chat completion failed: ${errorMessage}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Perform a streaming chat completion.
|
|
* Yields response chunks as they arrive from the Ollama server.
|
|
*
|
|
* @param request - Chat request with messages and configuration
|
|
* @yields Chat response chunks
|
|
* @throws {Error} If the request fails
|
|
*/
|
|
async *chatStream(request: ChatRequestDto): AsyncGenerator<ChatResponseDto> {
|
|
const span = createLlmSpan("ollama", "chat.stream", request.model);
|
|
|
|
try {
|
|
const messages = this.buildMessages(request);
|
|
const options = this.buildChatOptions(request);
|
|
|
|
const stream = await this.client.chat({
|
|
model: request.model,
|
|
messages,
|
|
stream: true,
|
|
options,
|
|
});
|
|
|
|
for await (const chunk of stream) {
|
|
yield {
|
|
model: chunk.model,
|
|
message: {
|
|
role: chunk.message.role as "assistant",
|
|
content: chunk.message.content,
|
|
},
|
|
done: chunk.done,
|
|
};
|
|
}
|
|
|
|
span.setStatus({ code: SpanStatusCode.OK });
|
|
} catch (error: unknown) {
|
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
this.logger.error(`Streaming failed: ${errorMessage}`);
|
|
|
|
span.recordException(error instanceof Error ? error : new Error(errorMessage));
|
|
span.setStatus({
|
|
code: SpanStatusCode.ERROR,
|
|
message: errorMessage,
|
|
});
|
|
|
|
throw new Error(`Streaming failed: ${errorMessage}`);
|
|
} finally {
|
|
span.end();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Generate embeddings for the given input texts.
|
|
*
|
|
* @param request - Embedding request with model and input texts
|
|
* @returns Embeddings response with vector arrays
|
|
* @throws {Error} If the request fails
|
|
*/
|
|
@TraceLlmCall({ system: "ollama", operation: "embed" })
|
|
async embed(request: EmbedRequestDto): Promise<EmbedResponseDto> {
|
|
try {
|
|
const response = await this.client.embed({
|
|
model: request.model,
|
|
input: request.input,
|
|
truncate: request.truncate === "none" ? false : true,
|
|
});
|
|
|
|
return {
|
|
model: response.model,
|
|
embeddings: response.embeddings,
|
|
totalDuration: response.total_duration,
|
|
};
|
|
} catch (error: unknown) {
|
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
this.logger.error(`Embedding failed: ${errorMessage}`);
|
|
throw new Error(`Embedding failed: ${errorMessage}`);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get the current provider configuration.
|
|
* Returns a copy to prevent external modification.
|
|
*
|
|
* @returns Provider configuration object
|
|
*/
|
|
getConfig(): OllamaProviderConfig {
|
|
return { ...this.config };
|
|
}
|
|
|
|
/**
|
|
* Build message array from chat request.
|
|
* Prepends system prompt if provided and not already in messages.
|
|
*
|
|
* @param request - Chat request
|
|
* @returns Array of messages for Ollama
|
|
*/
|
|
private buildMessages(request: ChatRequestDto): Message[] {
|
|
const messages: Message[] = [];
|
|
|
|
// Add system prompt if provided and not already in messages
|
|
if (request.systemPrompt && !request.messages.some((m) => m.role === "system")) {
|
|
messages.push({
|
|
role: "system",
|
|
content: request.systemPrompt,
|
|
});
|
|
}
|
|
|
|
// Add all request messages
|
|
for (const message of request.messages) {
|
|
messages.push({
|
|
role: message.role,
|
|
content: message.content,
|
|
});
|
|
}
|
|
|
|
return messages;
|
|
}
|
|
|
|
/**
|
|
* Build Ollama-specific chat options from request.
|
|
*
|
|
* @param request - Chat request
|
|
* @returns Ollama options object
|
|
*/
|
|
private buildChatOptions(request: ChatRequestDto): {
|
|
temperature?: number;
|
|
num_predict?: number;
|
|
} {
|
|
const options: { temperature?: number; num_predict?: number } = {};
|
|
|
|
if (request.temperature !== undefined) {
|
|
options.temperature = request.temperature;
|
|
}
|
|
|
|
if (request.maxTokens !== undefined) {
|
|
options.num_predict = request.maxTokens;
|
|
}
|
|
|
|
return options;
|
|
}
|
|
}
|