/**
 * SpeachesSttProvider Tests
 *
 * TDD tests for the Speaches/faster-whisper STT provider.
 * Tests cover transcription, error handling, health checks, and config injection.
 *
 * Issue #390
 */

import { describe, it, expect, beforeEach, vi } from "vitest";
import { SpeachesSttProvider } from "./speaches-stt.provider";
import type { SpeechConfig } from "../speech.config";
import type { TranscribeOptions } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

// vi.hoisted() runs before the vi.mock() factory below, so the factory can
// close over these spies.
const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  const mockCreate = vi.fn();
  const mockModelsList = vi.fn();
  const mockToFile = vi.fn().mockImplementation(async (buffer: Buffer, name: string) => {
    return new File([buffer], name);
  });
  // Every options object passed to `new OpenAI(...)` is recorded here.
  const mockOpenAIConstructorCalls: Array<Record<string, unknown>> = [];
  return { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls };
});

vi.mock("openai", () => {
  // Minimal stand-in exposing only the SDK surface the provider touches.
  class MockOpenAI {
    audio = {
      transcriptions: {
        create: mockCreate,
      },
    };
    models = {
      list: mockModelsList,
    };
    constructor(config: Record<string, unknown>) {
      mockOpenAIConstructorCalls.push(config);
    }
  }
  return {
    default: MockOpenAI,
    toFile: mockToFile,
  };
});

// ==========================================
// Test helpers
// ==========================================

/**
 * Build a SpeechConfig with STT enabled and every TTS tier disabled.
 *
 * @param overrides - Fields merged over the default `stt` section
 */
function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  return {
    stt: {
      enabled: true,
      baseUrl: "http://speaches:8000/v1",
      model: "Systran/faster-whisper-large-v3-turbo",
      language: "en",
      ...overrides,
    },
    tts: {
      default: { enabled: false, url: "", voice: "", format: "" },
      premium: { enabled: false, url: "" },
      fallback: { enabled: false, url: "" },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
  };
}

/**
 * Build a `verbose_json`-shaped transcription payload with a single segment.
 *
 * @param overrides - Top-level fields merged over the default payload
 */
function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  return {
    text: "Hello, world!",
    language: "en",
    duration: 3.5,
    segments: [
      {
        id: 0,
        text: "Hello, world!",
        start: 0.0,
        end: 3.5,
        avg_logprob: -0.25,
        compression_ratio: 1.2,
        no_speech_prob: 0.01,
        seek: 0,
        temperature: 0.0,
        tokens: [1, 2, 3],
      },
    ],
    ...overrides,
  };
}

describe("SpeachesSttProvider", () => {
  let provider: SpeachesSttProvider;
  let config: SpeechConfig;

  beforeEach(() => {
    vi.clearAllMocks();
    // The constructor-call log is a plain array, so it is reset by hand.
    mockOpenAIConstructorCalls.length = 0;
    config = createTestConfig();
    provider = new SpeachesSttProvider(config);
  });

  // ==========================================
  // Provider identity
  // ==========================================
  describe("name", () => {
    it("should have the name 'speaches'", () => {
      expect(provider.name).toBe("speaches");
    });
  });

  // ==========================================
  // transcribe
  // ==========================================
  describe("transcribe", () => {
    it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await provider.transcribe(audio);

      expect(mockCreate).toHaveBeenCalledOnce();
      const callArgs = mockCreate.mock.calls[0][0];
      expect(callArgs.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(callArgs.language).toBe("en");
      expect(callArgs.response_format).toBe("verbose_json");
    });

    it("should convert Buffer to File using toFile", async () => {
      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await provider.transcribe(audio);

      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
        type: "audio/wav",
      });
    });

    it("should return TranscriptionResult with text and language", async () => {
      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.text).toBe("Hello, world!");
      expect(result.language).toBe("en");
    });

    it("should return durationSeconds from verbose response", async () => {
      const mockResponse = createMockVerboseResponse({ duration: 5.25 });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.durationSeconds).toBe(5.25);
    });

    it("should map segments from verbose response", async () => {
      const mockResponse = createMockVerboseResponse({
        segments: [
          {
            id: 0,
            text: "Hello,",
            start: 0.0,
            end: 1.5,
            avg_logprob: -0.2,
            compression_ratio: 1.1,
            no_speech_prob: 0.01,
            seek: 0,
            temperature: 0.0,
            tokens: [1, 2],
          },
          {
            id: 1,
            text: " world!",
            start: 1.5,
            end: 3.5,
            avg_logprob: -0.3,
            compression_ratio: 1.3,
            no_speech_prob: 0.02,
            seek: 0,
            temperature: 0.0,
            tokens: [3, 4],
          },
        ],
      });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.segments).toHaveLength(2);
      expect(result.segments?.[0]).toEqual({
        text: "Hello,",
        start: 0.0,
        end: 1.5,
      });
      expect(result.segments?.[1]).toEqual({
        text: " world!",
        start: 1.5,
        end: 3.5,
      });
    });

    it("should handle response without segments gracefully", async () => {
      const mockResponse = createMockVerboseResponse({ segments: undefined });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.text).toBe("Hello, world!");
      expect(result.segments).toBeUndefined();
    });

    it("should handle response without duration gracefully", async () => {
      const mockResponse = createMockVerboseResponse({ duration: undefined });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.text).toBe("Hello, world!");
      expect(result.durationSeconds).toBeUndefined();
    });

    // ------------------------------------------
    // Options override
    // ------------------------------------------
    describe("options override", () => {
      it("should use custom model from options when provided", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { model: "custom-whisper-model" };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.model).toBe("custom-whisper-model");
      });

      it("should use custom language from options when provided", async () => {
        const mockResponse = createMockVerboseResponse({ language: "fr" });
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { language: "fr" };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.language).toBe("fr");
      });

      it("should pass through prompt option", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.prompt).toBe("This is a meeting about project planning.");
      });

      it("should pass through temperature option", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { temperature: 0.3 };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.temperature).toBe(0.3);
      });

      it("should use custom mimeType for file conversion when provided", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { mimeType: "audio/mp3" };
        await provider.transcribe(audio, options);

        expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
          type: "audio/mp3",
        });
      });
    });

    // ------------------------------------------
    // Simple response fallback
    // ------------------------------------------
    describe("simple response fallback", () => {
      it("should handle simple Transcription response (text only, no verbose fields)", async () => {
        // Some configurations may return just { text: "..." } without verbose fields
        const simpleResponse = { text: "Simple transcription result." };
        mockCreate.mockResolvedValueOnce(simpleResponse);

        const audio = Buffer.from("fake-audio-data");
        const result = await provider.transcribe(audio);

        expect(result.text).toBe("Simple transcription result.");
        expect(result.language).toBe("en"); // Falls back to config language
        expect(result.durationSeconds).toBeUndefined();
        expect(result.segments).toBeUndefined();
      });
    });
  });

  // ==========================================
  // Error handling
  // ==========================================
  describe("error handling", () => {
    it("should throw a descriptive error on connection refused", async () => {
      const connectionError = new Error("connect ECONNREFUSED 127.0.0.1:8000");
      mockCreate.mockRejectedValueOnce(connectionError);

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000"
      );
    });

    it("should throw a descriptive error on timeout", async () => {
      const timeoutError = new Error("Request timed out");
      mockCreate.mockRejectedValueOnce(timeoutError);

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: Request timed out"
      );
    });

    it("should throw a descriptive error on API error", async () => {
      const apiError = new Error("Invalid model: nonexistent-model");
      mockCreate.mockRejectedValueOnce(apiError);

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: Invalid model: nonexistent-model"
      );
    });

    it("should handle non-Error thrown values", async () => {
      mockCreate.mockRejectedValueOnce("unexpected string error");

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: unexpected string error"
      );
    });
  });

  // ==========================================
  // isHealthy
  // ==========================================
  describe("isHealthy", () => {
    it("should return true when the server is reachable", async () => {
      mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });

      const healthy = await provider.isHealthy();
      expect(healthy).toBe(true);
    });

    it("should return false when the server is unreachable", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));

      const healthy = await provider.isHealthy();
      expect(healthy).toBe(false);
    });

    it("should not throw on health check failure", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("Network error"));

      await expect(provider.isHealthy()).resolves.toBe(false);
    });

    it("should return false on unexpected error types", async () => {
      mockModelsList.mockRejectedValueOnce("string error");

      const healthy = await provider.isHealthy();
      expect(healthy).toBe(false);
    });
  });

  // ==========================================
  // Config injection
  // ==========================================
  describe("config injection", () => {
    it("should create OpenAI client with baseURL from config", () => {
      // The constructor was called in beforeEach
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      expect(mockOpenAIConstructorCalls[0]).toEqual(
        expect.objectContaining({
          baseURL: "http://speaches:8000/v1",
        })
      );
    });

    it("should use custom baseURL from config", () => {
      mockOpenAIConstructorCalls.length = 0;
      const customConfig = createTestConfig({
        baseUrl: "http://custom-speaches:9000/v1",
      });
      new SpeachesSttProvider(customConfig);

      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      expect(mockOpenAIConstructorCalls[0]).toEqual(
        expect.objectContaining({
          baseURL: "http://custom-speaches:9000/v1",
        })
      );
    });

    it("should use default model from config for transcription", async () => {
      const customConfig = createTestConfig({
        model: "Systran/faster-whisper-small",
      });
      const customProvider = new SpeachesSttProvider(customConfig);

      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await customProvider.transcribe(audio);

      const callArgs = mockCreate.mock.calls[0][0];
      expect(callArgs.model).toBe("Systran/faster-whisper-small");
    });

    it("should use default language from config for transcription", async () => {
      const customConfig = createTestConfig({ language: "de" });
      const customProvider = new SpeachesSttProvider(customConfig);

      const mockResponse = createMockVerboseResponse({ language: "de" });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await customProvider.transcribe(audio);

      const callArgs = mockCreate.mock.calls[0][0];
      expect(callArgs.language).toBe("de");
    });

    it("should set a dummy API key for local Speaches server", () => {
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      expect(mockOpenAIConstructorCalls[0]).toEqual(
        expect.objectContaining({
          apiKey: "not-needed",
        })
      );
    });
  });
});
/**
 * Known audio MIME types (lower-case, without parameters) and the file
 * extension used for each when naming the uploaded file.
 *
 * A Map is used instead of an object literal so that lookups can never hit
 * inherited keys such as "constructor" or "toString".
 */
const MIME_TYPE_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ["audio/wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/mp3", "mp3"],
  ["audio/mpeg", "mp3"],
  ["audio/mp4", "mp4"],
  ["audio/m4a", "m4a"],
  ["audio/ogg", "ogg"],
  ["audio/flac", "flac"],
  ["audio/webm", "webm"],
  ["audio/mpga", "mpga"],
]);

/**
 * Derive file extension from a MIME type for use in the uploaded file name.
 *
 * Matching ignores letter case, surrounding whitespace and any MIME
 * parameters, so `audio/webm;codecs=opus` and `AUDIO/WEBM` both map to `webm`.
 *
 * @param mimeType - MIME type of the audio payload, optionally with parameters
 * @returns The file extension without a leading dot; `wav` when the type is
 *   not recognised
 */
function extensionFromMimeType(mimeType: string): string {
  // Keep only the "type/subtype" part; everything after the first ";" is parameters.
  const [essence = ""] = mimeType.split(";", 1);
  return MIME_TYPE_EXTENSIONS.get(essence.trim().toLowerCase()) ?? "wav";
}
/**
 * STT provider backed by a Speaches (faster-whisper) server.
 *
 * Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint,
 * so we re-use the official OpenAI SDK with a custom `baseURL`.
 *
 * @example
 * ```typescript
 * // `config` is a resolved SpeechConfig object
 * const provider = new SpeachesSttProvider(config);
 * const result = await provider.transcribe(audioBuffer, { language: "en" });
 * console.log(result.text);
 * ```
 */
@Injectable()
export class SpeachesSttProvider implements ISTTProvider {
  readonly name = "speaches";

  private readonly logger = new Logger(SpeachesSttProvider.name);
  private readonly client: OpenAI;

  /**
   * @param config - Speech configuration; `config.stt` supplies the server
   *   base URL and the default model and language
   */
  constructor(
    @Inject(speechConfig.KEY)
    private readonly config: SpeechConfig
  ) {
    this.client = new OpenAI({
      baseURL: config.stt.baseUrl,
      apiKey: "not-needed", // Speaches does not require an API key
    });

    this.logger.log(
      `Speaches STT provider initialized (endpoint: ${config.stt.baseUrl}, model: ${config.stt.model})`
    );
  }

  /**
   * Transcribe audio data to text using the Speaches server.
   *
   * Sends the audio buffer to the `/v1/audio/transcriptions` endpoint
   * with `response_format=verbose_json` to get segments and duration data.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters (model, language, prompt,
   *   temperature, mimeType); model and language default to the configured
   *   values, mimeType defaults to `audio/wav`
   * @returns Transcription result with text, language, and, when the server
   *   reports them, duration and segments
   * @throws {Error} If transcription fails (connection error, API error, etc.);
   *   the message is prefixed with `STT transcription failed: `
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    const model = options?.model ?? this.config.stt.model;
    const language = options?.language ?? this.config.stt.language;
    const mimeType = options?.mimeType ?? "audio/wav";
    const extension = extensionFromMimeType(mimeType);

    try {
      // The upload is sent as a named file, so the name carries a matching extension.
      const file = await toFile(audio, `audio.${extension}`, {
        type: mimeType,
      });

      const response = await this.client.audio.transcriptions.create({
        file,
        model,
        language,
        response_format: "verbose_json",
        // Only forward optional tuning parameters the caller actually set.
        ...(options?.prompt !== undefined ? { prompt: options.prompt } : {}),
        ...(options?.temperature !== undefined ? { temperature: options.temperature } : {}),
      });

      return this.mapResponse(response, language);
    } catch (error: unknown) {
      const message = this.describeError(error);
      this.logger.error(`Transcription failed: ${message}`);
      throw new Error(`STT transcription failed: ${message}`);
    }
  }

  /**
   * Check if the Speaches server is healthy and reachable.
   *
   * Attempts to list models from the server. Returns true if the request
   * succeeds, false otherwise. Never throws; failures are logged as warnings.
   *
   * @returns true if the model listing request succeeded
   */
  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
      return true;
    } catch (error: unknown) {
      this.logger.warn(`Speaches health check failed: ${this.describeError(error)}`);
      return false;
    }
  }

  /**
   * Map the OpenAI SDK transcription response to our TranscriptionResult type.
   *
   * Handles both verbose responses (with duration, segments) and simple
   * responses (text only). Optional fields are copied only when they have the
   * expected runtime type, so a missing or `null` duration/segments value is
   * left off the result instead of being passed through.
   *
   * @param response - Payload returned by `audio.transcriptions.create`
   * @param fallbackLanguage - Language reported when the payload has none
   */
  private mapResponse(
    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
    fallbackLanguage: string
  ): TranscriptionResult {
    // Read both response shapes through one loose view of the fields we use.
    const payload = response as {
      text: string;
      language?: string | null;
      duration?: number | null;
      segments?: { text: string; start: number; end: number }[] | null;
    };

    const result: TranscriptionResult = {
      text: payload.text,
      language: payload.language ?? fallbackLanguage,
    };

    if (typeof payload.duration === "number") {
      result.durationSeconds = payload.duration;
    }

    if (Array.isArray(payload.segments)) {
      // Keep only the fields of our own segment type; drop whisper internals.
      result.segments = payload.segments.map(
        ({ text, start, end }): TranscriptionSegment => ({ text, start, end })
      );
    }

    return result;
  }

  /**
   * Turn any thrown value into a printable message.
   *
   * @param error - Value caught from a failed SDK call
   * @returns `error.message` for Error instances, otherwise `String(error)`
   */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
ITTSProvider } from "./interfaces/tts-provider.interface"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; +import { createTTSProviders } from "./providers/tts-provider.factory"; @Module({ imports: [ConfigModule.forFeature(speechConfig)], providers: [ SpeechService, - // Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.) - // will register their providers in subsequent tasks. + // STT provider: conditionally register SpeachesSttProvider when STT is enabled + ...(isSttEnabled() + ? [ + { + provide: STT_PROVIDER, + useClass: SpeachesSttProvider, + }, + ] + : []), { provide: TTS_PROVIDERS, - useFactory: (): Map => new Map(), + useFactory: (configService: ConfigService) => { + const config = configService.get("speech"); + if (!config) { + return new Map(); + } + return createTTSProviders(config); + }, + inject: [ConfigService], }, ], exports: [SpeechService],