From c40373fa3ba1e4d2d6adcd392a506f79e5268367 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:09:45 -0600 Subject: [PATCH] feat(#389): create SpeechModule with provider abstraction layer Add SpeechModule with provider interfaces and service skeleton for multi-tier TTS fallback (premium -> default -> fallback) and STT transcription support. Includes 27 unit tests covering provider selection, fallback logic, and availability checks. - ISTTProvider interface with transcribe/isHealthy methods - ITTSProvider interface with synthesize/listVoices/isHealthy methods - Shared types: SpeechTier, TranscriptionResult, SynthesisResult, etc. - SpeechService with graceful TTS fallback chain - NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS) - SpeechModule registered in AppModule - ConfigModule integration via speechConfig registerAs factory Co-Authored-By: Claude Opus 4.6 --- apps/api/src/app.module.ts | 2 + apps/api/src/speech/interfaces/index.ts | 18 + .../api/src/speech/interfaces/speech-types.ts | 149 +++++ .../interfaces/stt-provider.interface.ts | 52 ++ .../interfaces/tts-provider.interface.ts | 68 +++ apps/api/src/speech/speech.constants.ts | 19 + apps/api/src/speech/speech.module.ts | 49 ++ apps/api/src/speech/speech.service.spec.ts | 541 ++++++++++++++++++ apps/api/src/speech/speech.service.ts | 231 ++++++++ 9 files changed, 1129 insertions(+) create mode 100644 apps/api/src/speech/interfaces/index.ts create mode 100644 apps/api/src/speech/interfaces/speech-types.ts create mode 100644 apps/api/src/speech/interfaces/stt-provider.interface.ts create mode 100644 apps/api/src/speech/interfaces/tts-provider.interface.ts create mode 100644 apps/api/src/speech/speech.constants.ts create mode 100644 apps/api/src/speech/speech.module.ts create mode 100644 apps/api/src/speech/speech.service.spec.ts create mode 100644 apps/api/src/speech/speech.service.ts diff --git a/apps/api/src/app.module.ts b/apps/api/src/app.module.ts index 43733e3..c353f3a 100644 --- 
a/apps/api/src/app.module.ts +++ b/apps/api/src/app.module.ts @@ -37,6 +37,7 @@ import { JobStepsModule } from "./job-steps/job-steps.module"; import { CoordinatorIntegrationModule } from "./coordinator-integration/coordinator-integration.module"; import { FederationModule } from "./federation/federation.module"; import { CredentialsModule } from "./credentials/credentials.module"; +import { SpeechModule } from "./speech/speech.module"; import { RlsContextInterceptor } from "./common/interceptors/rls-context.interceptor"; @Module({ @@ -97,6 +98,7 @@ import { RlsContextInterceptor } from "./common/interceptors/rls-context.interce CoordinatorIntegrationModule, FederationModule, CredentialsModule, + SpeechModule, ], controllers: [AppController, CsrfController], providers: [ diff --git a/apps/api/src/speech/interfaces/index.ts b/apps/api/src/speech/interfaces/index.ts new file mode 100644 index 0000000..ded8bd2 --- /dev/null +++ b/apps/api/src/speech/interfaces/index.ts @@ -0,0 +1,18 @@ +/** + * Speech interfaces barrel export. + * + * Issue #389 + */ + +export type { ISTTProvider } from "./stt-provider.interface"; +export type { ITTSProvider } from "./tts-provider.interface"; +export type { + SpeechTier, + AudioFormat, + TranscribeOptions, + TranscriptionResult, + TranscriptionSegment, + SynthesizeOptions, + SynthesisResult, + VoiceInfo, +} from "./speech-types"; diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts new file mode 100644 index 0000000..3f5a0b7 --- /dev/null +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -0,0 +1,149 @@ +/** + * Speech Types + * + * Shared types for speech-to-text (STT) and text-to-speech (TTS) services. + * Used by provider interfaces and the SpeechService. + * + * Issue #389 + */ + +// ========================================== +// Enums / Discriminators +// ========================================== + +/** + * TTS provider tier. 
+ * Determines which TTS engine is used for synthesis. + * + * - default: Primary TTS engine (e.g., Kokoro) + * - premium: Higher quality TTS engine (e.g., Chatterbox) + * - fallback: Backup TTS engine (e.g., Piper/OpenedAI) + */ +export type SpeechTier = "default" | "premium" | "fallback"; + +/** + * Audio output format for TTS synthesis. + */ +export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm"; + +// ========================================== +// STT Types +// ========================================== + +/** + * Options for speech-to-text transcription. + */ +export interface TranscribeOptions { + /** Language code (e.g., "en", "fr", "de") */ + language?: string; + + /** Model to use for transcription */ + model?: string; + + /** MIME type of the audio (e.g., "audio/mp3", "audio/wav") */ + mimeType?: string; + + /** Optional prompt to guide transcription */ + prompt?: string; + + /** Temperature for transcription (0.0 - 1.0) */ + temperature?: number; +} + +/** + * Result of a speech-to-text transcription. + */ +export interface TranscriptionResult { + /** Transcribed text */ + text: string; + + /** Language detected or used */ + language: string; + + /** Duration of the audio in seconds */ + durationSeconds?: number; + + /** Confidence score (0.0 - 1.0, if available) */ + confidence?: number; + + /** Individual word or segment timings (if available) */ + segments?: TranscriptionSegment[]; +} + +/** + * A segment within a transcription result. + */ +export interface TranscriptionSegment { + /** Segment text */ + text: string; + + /** Start time in seconds */ + start: number; + + /** End time in seconds */ + end: number; + + /** Confidence for this segment */ + confidence?: number; +} + +// ========================================== +// TTS Types +// ========================================== + +/** + * Options for text-to-speech synthesis. 
+ */ +export interface SynthesizeOptions { + /** Voice ID to use */ + voice?: string; + + /** Desired audio format */ + format?: AudioFormat; + + /** Speech speed multiplier (0.5 - 2.0) */ + speed?: number; + + /** Preferred TTS tier */ + tier?: SpeechTier; +} + +/** + * Result of a text-to-speech synthesis. + */ +export interface SynthesisResult { + /** Synthesized audio data */ + audio: Buffer; + + /** Audio format of the result */ + format: AudioFormat; + + /** Voice used for synthesis */ + voice: string; + + /** Tier that produced the synthesis */ + tier: SpeechTier; + + /** Duration of the generated audio in seconds (if available) */ + durationSeconds?: number; +} + +/** + * Information about an available TTS voice. + */ +export interface VoiceInfo { + /** Voice identifier */ + id: string; + + /** Human-readable voice name */ + name: string; + + /** Language code */ + language?: string; + + /** Tier this voice belongs to */ + tier: SpeechTier; + + /** Whether this is the default voice for its tier */ + isDefault?: boolean; +} diff --git a/apps/api/src/speech/interfaces/stt-provider.interface.ts b/apps/api/src/speech/interfaces/stt-provider.interface.ts new file mode 100644 index 0000000..871fdd1 --- /dev/null +++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts @@ -0,0 +1,52 @@ +/** + * STT Provider Interface + * + * Defines the contract for speech-to-text provider implementations. + * All STT providers (e.g., Speaches/faster-whisper) must implement this interface. + * + * Issue #389 + */ + +import type { TranscribeOptions, TranscriptionResult } from "./speech-types"; + +/** + * Interface for speech-to-text providers. + * + * Implementations wrap an OpenAI-compatible API endpoint for transcription. 
/**
 * Interface for speech-to-text providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for transcription.
 *
 * @example
 * ```typescript
 * class SpeachesProvider implements ISTTProvider {
 *   readonly name = "speaches";
 *
 *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
 *     // Call speaches API via OpenAI SDK
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ISTTProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /**
   * Transcribe audio data to text.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters
   * @returns Transcription result with text and metadata
   * @throws {Error} If transcription fails
   */
  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;

  /**
   * Check if the provider is healthy and available.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}
/**
 * Interface for text-to-speech providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for speech synthesis.
 * Each provider is associated with a SpeechTier (default, premium, fallback).
 *
 * @example
 * ```typescript
 * class KokoroProvider implements ITTSProvider {
 *   readonly name = "kokoro";
 *   readonly tier = "default";
 *
 *   async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
 *     // Call Kokoro API via OpenAI SDK
 *   }
 *
 *   async listVoices(): Promise<VoiceInfo[]> {
 *     // Return available voices
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ITTSProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /** Tier this provider serves (default, premium, fallback) */
  readonly tier: SpeechTier;

  /**
   * Synthesize text to audio.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, speed)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails
   */
  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;

  /**
   * List available voices for this provider.
   *
   * @returns Array of voice information objects
   */
  listVoices(): Promise<VoiceInfo[]>;

  /**
   * Check if the provider is healthy and available.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}
/**
 * Injection token for the TTS (text-to-speech) providers map.
 * Registered as `Map<SpeechTier, ITTSProvider>`.
 */
export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS");
/**
 * NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
 *
 * - Imports `ConfigModule.forFeature(speechConfig)` so the speech configuration
 *   is injectable inside this module.
 * - Provides `SpeechService` and a default, empty `TTS_PROVIDERS` map.
 * - Exports `SpeechService` only.
 *
 * NOTE(review): `TTS_PROVIDERS` is not listed in `exports` and no `STT_PROVIDER`
 * is registered here — confirm how provider modules are expected to contribute
 * their providers in the follow-up tasks.
 */
@Module({
  imports: [ConfigModule.forFeature(speechConfig)],
  providers: [
    SpeechService,
    // Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
    // will register their providers in subsequent tasks.
    {
      provide: TTS_PROVIDERS,
      useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
    },
  ],
  exports: [SpeechService],
})
export class SpeechModule implements OnModuleInit {
  private readonly logger = new Logger(SpeechModule.name);

  /**
   * Runs `validateSpeechConfig()` during module initialization and then logs
   * that the module is ready.
   */
  onModuleInit(): void {
    // Validate configuration at startup (fail fast)
    validateSpeechConfig();
    this.logger.log("Speech module initialized");
  }
}
/**
 * Build the speech configuration used by the tests.
 *
 * STT and all three TTS tiers (default, premium, fallback) are enabled. A new
 * object literal is returned on every call, so a test can flip flags on its
 * copy without affecting other tests.
 *
 * @returns A config object asserted to the shape produced by `speechConfig`
 */
function createTestConfig(): ReturnType<typeof speechConfig> {
  return {
    stt: {
      enabled: true,
      baseUrl: "http://localhost:8000/v1",
      model: "test-model",
      language: "en",
    },
    tts: {
      default: {
        enabled: true,
        url: "http://localhost:8880/v1",
        voice: "test-voice",
        format: "mp3",
      },
      premium: {
        enabled: true,
        url: "http://localhost:8881/v1",
      },
      fallback: {
        enabled: true,
        url: "http://localhost:8882/v1",
      },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
    // The assertion is kept from the original: the literal is not checked
    // field-by-field against the real config type here.
  } as ReturnType<typeof speechConfig>;
}
new Map(); + + const providers: Array<{ provide: symbol | string; useValue: unknown }> = [ + { provide: speechConfig.KEY, useValue: config }, + { provide: TTS_PROVIDERS, useValue: ttsProviders }, + ]; + + if (options.sttProvider !== undefined) { + providers.push({ provide: STT_PROVIDER, useValue: options.sttProvider }); + } + + return Test.createTestingModule({ + providers: [SpeechService, ...providers], + }).compile(); +} + +// ========================================== +// Tests +// ========================================== + +describe("SpeechService", () => { + // ========================================== + // Construction and initialization + // ========================================== + describe("construction", () => { + it("should be defined when all providers are injected", async () => { + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + ttsProviders: new Map([["default", createMockTtsProvider("default")]]), + }); + + const service = module.get(SpeechService); + expect(service).toBeDefined(); + }); + + it("should be defined with no STT provider", async () => { + const module = await createTestModule({ + sttProvider: null, + ttsProviders: new Map([["default", createMockTtsProvider("default")]]), + }); + + const service = module.get(SpeechService); + expect(service).toBeDefined(); + }); + + it("should be defined with empty TTS providers map", async () => { + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + ttsProviders: new Map(), + }); + + const service = module.get(SpeechService); + expect(service).toBeDefined(); + }); + }); + + // ========================================== + // transcribe() + // ========================================== + describe("transcribe", () => { + let service: SpeechService; + let mockStt: ISTTProvider; + + beforeEach(async () => { + mockStt = createMockSttProvider(); + const module = await createTestModule({ sttProvider: mockStt }); + service = 
module.get(SpeechService); + }); + + it("should delegate to the STT provider", async () => { + const audio = Buffer.from("test-audio"); + const result = await service.transcribe(audio); + + expect(mockStt.transcribe).toHaveBeenCalledWith(audio, undefined); + expect(result.text).toBe("Hello world"); + expect(result.language).toBe("en"); + }); + + it("should pass options to the STT provider", async () => { + const audio = Buffer.from("test-audio"); + const options = { language: "fr", model: "custom-model" }; + await service.transcribe(audio, options); + + expect(mockStt.transcribe).toHaveBeenCalledWith(audio, options); + }); + + it("should throw ServiceUnavailableException when STT is disabled in config", async () => { + const config = createTestConfig(); + config.stt.enabled = false; + const module = await createTestModule({ sttProvider: mockStt, config }); + service = module.get(SpeechService); + + await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow( + ServiceUnavailableException + ); + }); + + it("should throw ServiceUnavailableException when no STT provider is registered", async () => { + const module = await createTestModule({ sttProvider: null }); + service = module.get(SpeechService); + + await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow( + ServiceUnavailableException + ); + }); + + it("should propagate provider errors as ServiceUnavailableException", async () => { + const failingStt = createMockSttProvider({ + transcribe: vi.fn().mockRejectedValue(new Error("Connection refused")), + }); + const module = await createTestModule({ sttProvider: failingStt }); + service = module.get(SpeechService); + + await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow( + ServiceUnavailableException + ); + }); + }); + + // ========================================== + // synthesize() + // ========================================== + describe("synthesize", () => { + let service: SpeechService; + let defaultProvider: 
ITTSProvider; + let premiumProvider: ITTSProvider; + let fallbackProvider: ITTSProvider; + + beforeEach(async () => { + defaultProvider = createMockTtsProvider("default"); + premiumProvider = createMockTtsProvider("premium"); + fallbackProvider = createMockTtsProvider("fallback"); + + const ttsProviders = new Map([ + ["default", defaultProvider], + ["premium", premiumProvider], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + service = module.get(SpeechService); + }); + + it("should use the default tier when no tier is specified", async () => { + const result = await service.synthesize("Hello world"); + + expect(defaultProvider.synthesize).toHaveBeenCalledWith("Hello world", undefined); + expect(result.tier).toBe("default"); + }); + + it("should use the requested tier when specified", async () => { + const result = await service.synthesize("Hello world", { tier: "premium" }); + + expect(premiumProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("premium"); + }); + + it("should pass options to the TTS provider", async () => { + const options = { voice: "custom-voice", format: "wav" as const }; + await service.synthesize("Hello", options); + + expect(defaultProvider.synthesize).toHaveBeenCalledWith("Hello", options); + }); + + it("should throw ServiceUnavailableException when TTS default is disabled and no tier specified", async () => { + const config = createTestConfig(); + config.tts.default.enabled = false; + config.tts.premium.enabled = false; + config.tts.fallback.enabled = false; + const module = await createTestModule({ + ttsProviders: new Map([["default", defaultProvider]]), + config, + }); + service = module.get(SpeechService); + + await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException); + }); + + it("should throw ServiceUnavailableException when no TTS providers are registered", async () => { + const module = await createTestModule({ ttsProviders: new Map() 
}); + service = module.get(SpeechService); + + await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException); + }); + }); + + // ========================================== + // synthesize() fallback logic + // ========================================== + describe("synthesize fallback", () => { + it("should fall back from premium to default when premium provider fails", async () => { + const failingPremium = createMockTtsProvider("premium", { + synthesize: vi.fn().mockRejectedValue(new Error("Premium unavailable")), + }); + const defaultProvider = createMockTtsProvider("default"); + + const ttsProviders = new Map([ + ["premium", failingPremium], + ["default", defaultProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello", { tier: "premium" }); + + expect(failingPremium.synthesize).toHaveBeenCalled(); + expect(defaultProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("default"); + }); + + it("should fall back from default to fallback when default provider fails", async () => { + const failingDefault = createMockTtsProvider("default", { + synthesize: vi.fn().mockRejectedValue(new Error("Default unavailable")), + }); + const fallbackProvider = createMockTtsProvider("fallback"); + + const ttsProviders = new Map([ + ["default", failingDefault], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello"); + + expect(failingDefault.synthesize).toHaveBeenCalled(); + expect(fallbackProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("fallback"); + }); + + it("should fall back premium -> default -> fallback", async () => { + const failingPremium = createMockTtsProvider("premium", { + synthesize: vi.fn().mockRejectedValue(new Error("Premium fail")), + }); + const 
failingDefault = createMockTtsProvider("default", { + synthesize: vi.fn().mockRejectedValue(new Error("Default fail")), + }); + const fallbackProvider = createMockTtsProvider("fallback"); + + const ttsProviders = new Map([ + ["premium", failingPremium], + ["default", failingDefault], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello", { tier: "premium" }); + + expect(failingPremium.synthesize).toHaveBeenCalled(); + expect(failingDefault.synthesize).toHaveBeenCalled(); + expect(fallbackProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("fallback"); + }); + + it("should throw ServiceUnavailableException when all tiers fail", async () => { + const failingDefault = createMockTtsProvider("default", { + synthesize: vi.fn().mockRejectedValue(new Error("Default fail")), + }); + const failingFallback = createMockTtsProvider("fallback", { + synthesize: vi.fn().mockRejectedValue(new Error("Fallback fail")), + }); + + const ttsProviders = new Map([ + ["default", failingDefault], + ["fallback", failingFallback], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException); + }); + + it("should skip unavailable tiers in fallback chain", async () => { + // premium requested, but only fallback registered (no default) + const failingPremium = createMockTtsProvider("premium", { + synthesize: vi.fn().mockRejectedValue(new Error("Premium fail")), + }); + const fallbackProvider = createMockTtsProvider("fallback"); + + const config = createTestConfig(); + config.tts.default.enabled = false; + + const ttsProviders = new Map([ + ["premium", failingPremium], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders, config }); + const service = 
module.get(SpeechService); + + const result = await service.synthesize("Hello", { tier: "premium" }); + expect(result.tier).toBe("fallback"); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + describe("listVoices", () => { + it("should aggregate voices from all registered TTS providers", async () => { + const defaultProvider = createMockTtsProvider("default", { + listVoices: vi.fn().mockResolvedValue([ + { id: "voice-1", name: "Voice 1", tier: "default" as SpeechTier, isDefault: true }, + { id: "voice-2", name: "Voice 2", tier: "default" as SpeechTier }, + ]), + }); + const premiumProvider = createMockTtsProvider("premium", { + listVoices: vi + .fn() + .mockResolvedValue([ + { id: "voice-3", name: "Voice 3", tier: "premium" as SpeechTier, isDefault: true }, + ]), + }); + + const ttsProviders = new Map([ + ["default", defaultProvider], + ["premium", premiumProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const voices = await service.listVoices(); + + expect(voices).toHaveLength(3); + expect(voices.map((v) => v.id)).toEqual(["voice-1", "voice-2", "voice-3"]); + }); + + it("should filter voices by tier when specified", async () => { + const defaultProvider = createMockTtsProvider("default", { + listVoices: vi + .fn() + .mockResolvedValue([{ id: "voice-1", name: "Voice 1", tier: "default" as SpeechTier }]), + }); + const premiumProvider = createMockTtsProvider("premium", { + listVoices: vi + .fn() + .mockResolvedValue([{ id: "voice-2", name: "Voice 2", tier: "premium" as SpeechTier }]), + }); + + const ttsProviders = new Map([ + ["default", defaultProvider], + ["premium", premiumProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const voices = await service.listVoices("premium"); + + expect(voices).toHaveLength(1); + 
expect(voices[0].id).toBe("voice-2"); + // Only the premium provider should have been called + expect(premiumProvider.listVoices).toHaveBeenCalled(); + expect(defaultProvider.listVoices).not.toHaveBeenCalled(); + }); + + it("should return empty array when no TTS providers are registered", async () => { + const module = await createTestModule({ ttsProviders: new Map() }); + const service = module.get(SpeechService); + + const voices = await service.listVoices(); + expect(voices).toEqual([]); + }); + + it("should return empty array when requested tier has no provider", async () => { + const defaultProvider = createMockTtsProvider("default"); + const ttsProviders = new Map([["default", defaultProvider]]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const voices = await service.listVoices("premium"); + expect(voices).toEqual([]); + }); + }); + + // ========================================== + // isSTTAvailable / isTTSAvailable + // ========================================== + describe("availability checks", () => { + it("should report STT as available when enabled and provider registered", async () => { + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + }); + const service = module.get(SpeechService); + + expect(service.isSTTAvailable()).toBe(true); + }); + + it("should report STT as unavailable when disabled in config", async () => { + const config = createTestConfig(); + config.stt.enabled = false; + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + config, + }); + const service = module.get(SpeechService); + + expect(service.isSTTAvailable()).toBe(false); + }); + + it("should report STT as unavailable when no provider registered", async () => { + const module = await createTestModule({ sttProvider: null }); + const service = module.get(SpeechService); + + expect(service.isSTTAvailable()).toBe(false); + }); + + it("should report TTS as 
available when at least one tier is enabled with a provider", async () => { + const ttsProviders = new Map([ + ["default", createMockTtsProvider("default")], + ]); + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + expect(service.isTTSAvailable()).toBe(true); + }); + + it("should report TTS as unavailable when no providers registered", async () => { + const config = createTestConfig(); + config.tts.default.enabled = false; + config.tts.premium.enabled = false; + config.tts.fallback.enabled = false; + const module = await createTestModule({ ttsProviders: new Map(), config }); + const service = module.get(SpeechService); + + expect(service.isTTSAvailable()).toBe(false); + }); + }); +}); diff --git a/apps/api/src/speech/speech.service.ts b/apps/api/src/speech/speech.service.ts new file mode 100644 index 0000000..4905918 --- /dev/null +++ b/apps/api/src/speech/speech.service.ts @@ -0,0 +1,231 @@ +/** + * SpeechService + * + * High-level service for speech-to-text (STT) and text-to-speech (TTS) operations. + * Manages provider selection and graceful fallback for TTS tiers. + * + * Fallback chain for TTS: premium -> default -> fallback + * Each tier is only attempted if enabled in config and a provider is registered. + * + * Issue #389 + */ + +import { Injectable, Inject, Optional, Logger, ServiceUnavailableException } from "@nestjs/common"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { speechConfig, type SpeechConfig } from "./speech.config"; +import type { ISTTProvider } from "./interfaces/stt-provider.interface"; +import type { ITTSProvider } from "./interfaces/tts-provider.interface"; +import type { + SpeechTier, + TranscribeOptions, + TranscriptionResult, + SynthesizeOptions, + SynthesisResult, + VoiceInfo, +} from "./interfaces/speech-types"; + +/** + * Fallback order for TTS tiers. + * When a tier fails, the next tier in this array is attempted. 
+ */
+const TTS_FALLBACK_ORDER: readonly SpeechTier[] = ["premium", "default", "fallback"] as const;
+
+/**
+ * Facade over the registered STT provider and the per-tier TTS providers.
+ *
+ * STT calls go to the single injected STT provider. TTS calls walk
+ * TTS_FALLBACK_ORDER starting at the requested tier and return the first
+ * successful result.
+ */
+@Injectable()
+export class SpeechService {
+  private readonly logger = new Logger(SpeechService.name);
+
+  constructor(
+    @Inject(speechConfig.KEY)
+    private readonly config: SpeechConfig,
+
+    // @Optional() resolves a missing provider to `undefined` (not `null`),
+    // so both "absent" values are part of the declared type.
+    @Optional()
+    @Inject(STT_PROVIDER)
+    private readonly sttProvider: ISTTProvider | null | undefined,
+
+    @Inject(TTS_PROVIDERS)
+    private readonly ttsProviders: Map<SpeechTier, ITTSProvider>
+  ) {
+    this.logger.log("Speech service initialized");
+
+    if (this.sttProvider) {
+      this.logger.log(`STT provider registered: ${this.sttProvider.name}`);
+    }
+
+    if (this.ttsProviders.size > 0) {
+      const tierNames = Array.from(this.ttsProviders.keys()).join(", ");
+      this.logger.log(`TTS providers registered: ${tierNames}`);
+    }
+  }
+
+  // ==========================================
+  // STT Operations
+  // ==========================================
+
+  /**
+   * Transcribe audio data to text using the registered STT provider.
+   *
+   * @param audio - Raw audio data as a Buffer
+   * @param options - Optional transcription parameters, passed through to the provider
+   * @returns The transcription result produced by the provider
+   * @throws {ServiceUnavailableException} If STT is disabled in config, no provider
+   *   is registered, or the provider call fails (the provider's error message is
+   *   included in the exception message)
+   */
+  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
+    if (!this.config.stt.enabled) {
+      throw new ServiceUnavailableException("Speech-to-text is not enabled");
+    }
+
+    if (!this.sttProvider) {
+      throw new ServiceUnavailableException("No STT provider is registered");
+    }
+
+    try {
+      return await this.sttProvider.transcribe(audio, options);
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      this.logger.error(`STT transcription failed: ${message}`);
+      throw new ServiceUnavailableException(`Transcription failed: ${message}`);
+    }
+  }
+
+  // ==========================================
+  // TTS Operations
+  // ==========================================
+
+  /**
+   * Synthesize text to audio using TTS providers with graceful fallback.
+   *
+   * Tiers are tried in the order premium -> default -> fallback, starting at
+   * the requested tier (or "default" when none is given). Tiers that are
+   * disabled in config or have no registered provider are skipped. A failure
+   * in one tier is logged as a warning and the next tier is attempted.
+   *
+   * @param text - Text to convert to speech
+   * @param options - Optional synthesis parameters; `tier` selects the starting
+   *   tier, and the whole object is passed through to the provider
+   * @returns The synthesis result from the first provider that succeeds
+   * @throws {ServiceUnavailableException} If no tier is eligible, or every
+   *   eligible provider fails (the last provider error message is included)
+   */
+  async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
+    const requestedTier = options?.tier ?? "default";
+    const fallbackChain = this.buildFallbackChain(requestedTier);
+
+    if (fallbackChain.length === 0) {
+      throw new ServiceUnavailableException(
+        "No TTS providers are available. Check that TTS is enabled and providers are registered."
+      );
+    }
+
+    let lastError: Error | undefined;
+
+    for (const tier of fallbackChain) {
+      // buildFallbackChain only yields tiers present in the map; this guard
+      // narrows the `Map.get` result for the type checker.
+      const provider = this.ttsProviders.get(tier);
+      if (!provider) {
+        continue;
+      }
+
+      try {
+        return await provider.synthesize(text, options);
+      } catch (error: unknown) {
+        const message = error instanceof Error ? error.message : String(error);
+        this.logger.warn(`TTS tier "${tier}" (${provider.name}) failed: ${message}`);
+        lastError = error instanceof Error ? error : new Error(message);
+      }
+    }
+
+    const errorMessage = lastError?.message ?? "No providers available";
+    throw new ServiceUnavailableException(`All TTS providers failed: ${errorMessage}`);
+  }
+
+  /**
+   * List available voices across all TTS providers, optionally filtered by tier.
+   *
+   * Provider errors are logged as warnings and never thrown: a failing provider
+   * contributes no voices. This method looks only at registered providers; it
+   * does not consult the per-tier `enabled` flags in config.
+   *
+   * @param tier - Optional tier filter. If omitted, voices from all tiers are returned.
+   * @returns Array of voice information objects (empty if the tier has no
+   *   provider or the provider call fails)
+   */
+  async listVoices(tier?: SpeechTier): Promise<VoiceInfo[]> {
+    const voices: VoiceInfo[] = [];
+
+    if (tier) {
+      const provider = this.ttsProviders.get(tier);
+      if (!provider) {
+        return [];
+      }
+
+      try {
+        return await provider.listVoices();
+      } catch (error: unknown) {
+        const message = error instanceof Error ? error.message : String(error);
+        this.logger.warn(`Failed to list voices for tier "${tier}": ${message}`);
+        return [];
+      }
+    }
+
+    // Aggregate voices from all providers
+    for (const [providerTier, provider] of this.ttsProviders) {
+      try {
+        const tierVoices = await provider.listVoices();
+        voices.push(...tierVoices);
+      } catch (error: unknown) {
+        const message = error instanceof Error ? error.message : String(error);
+        this.logger.warn(`Failed to list voices for tier "${providerTier}": ${message}`);
+      }
+    }
+
+    return voices;
+  }
+
+  // ==========================================
+  // Availability Checks
+  // ==========================================
+
+  /**
+   * Check if STT is available (enabled in config and provider registered).
+   *
+   * Uses a truthiness check rather than `!== null`: with `@Optional()` a
+   * missing provider is injected as `undefined`, which must also count as
+   * "not registered".
+   */
+  isSTTAvailable(): boolean {
+    return this.config.stt.enabled && Boolean(this.sttProvider);
+  }
+
+  /**
+   * Check if TTS is available (at least one tier enabled with a registered provider).
+   */
+  isTTSAvailable(): boolean {
+    return this.getEnabledTiers().some((tier) => this.ttsProviders.has(tier));
+  }
+
+  // ==========================================
+  // Private helpers
+  // ==========================================
+
+  /**
+   * Build the fallback chain starting from the requested tier.
+   * Only includes tiers that are enabled in config and have a registered provider.
+   *
+   * @param requestedTier - Tier to start from; earlier tiers in
+   *   TTS_FALLBACK_ORDER are never included
+   * @returns Eligible tiers in fallback order; empty if the tier is unknown or
+   *   nothing is eligible
+   */
+  private buildFallbackChain(requestedTier: SpeechTier): SpeechTier[] {
+    const startIndex = TTS_FALLBACK_ORDER.indexOf(requestedTier);
+    if (startIndex === -1) {
+      return [];
+    }
+
+    const enabledTiers = this.getEnabledTiers();
+
+    return TTS_FALLBACK_ORDER.slice(startIndex).filter(
+      (tier) => enabledTiers.includes(tier) && this.ttsProviders.has(tier)
+    );
+  }
+
+  /**
+   * Get the list of TTS tiers that are enabled in the configuration.
+   * The order of the returned array carries no meaning; callers only test membership.
+   */
+  private getEnabledTiers(): SpeechTier[] {
+    const tiers: SpeechTier[] = [];
+
+    if (this.config.tts.default.enabled) {
+      tiers.push("default");
+    }
+    if (this.config.tts.premium.enabled) {
+      tiers.push("premium");
+    }
+    if (this.config.tts.fallback.enabled) {
+      tiers.push("fallback");
+    }
+
+    return tiers;
+  }
+}