feat(#389): create SpeechModule with provider abstraction layer

Add SpeechModule with provider interfaces and service skeleton for multi-tier TTS fallback (premium -> default -> fallback) and STT transcription support. Includes 27 unit tests covering provider selection, fallback logic, and availability checks. - ISTTProvider interface with transcribe/isHealthy methods - ITTSProvider interface with synthesize/listVoices/isHealthy methods - Shared types: SpeechTier, TranscriptionResult, SynthesisResult, etc. - SpeechService with graceful TTS fallback chain - NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS) - SpeechModule registered in AppModule - ConfigModule integration via speechConfig registerAs factory Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:09:45 -06:00
parent 52553c8266
commit c40373fa3b
9 changed files with 1129 additions and 0 deletions
--- a/apps/api/src/speech/interfaces/index.ts
+++ b/apps/api/src/speech/interfaces/index.ts
@@ -0,0 +1,18 @@
+/**
+ * Barrel file: type-only re-exports of the speech provider interfaces and shared types.
+ *
+ * Issue #389
+ */
+
+export type { ISTTProvider } from "./stt-provider.interface";
+export type { ITTSProvider } from "./tts-provider.interface";
+export type {
+  SpeechTier,
+  AudioFormat,
+  TranscribeOptions,
+  TranscriptionResult,
+  TranscriptionSegment,
+  SynthesizeOptions,
+  SynthesisResult,
+  VoiceInfo,
+} from "./speech-types";
--- a/apps/api/src/speech/interfaces/speech-types.ts
+++ b/apps/api/src/speech/interfaces/speech-types.ts
@@ -0,0 +1,149 @@
+/**
+ * Speech Types
+ *
+ * Shared types for speech-to-text (STT) and text-to-speech (TTS) services.
+ * Used by provider interfaces and the SpeechService.
+ *
+ * Issue #389
+ */
+
+// ==========================================
+// Enums / Discriminators
+// ==========================================
+
+/**
+ * Tier label identifying a class of TTS engine.
+ * Carried by TTS providers, synthesis options/results, and voice metadata.
+ *
+ * - "default": primary TTS engine (e.g., Kokoro)
+ * - "premium": higher-quality TTS engine (e.g., Chatterbox)
+ * - "fallback": backup TTS engine (e.g., Piper/OpenedAI)
+ */
+export type SpeechTier = "default" | "premium" | "fallback";
+
+/**
+ * Audio format identifiers for TTS output (requested via SynthesizeOptions, reported in SynthesisResult).
+ */
+export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm";
+
+// ==========================================
+// STT Types
+// ==========================================
+
+/**
+ * Options for speech-to-text transcription. Every field is optional.
+ */
+export interface TranscribeOptions {
+  /** Language code (e.g., "en", "fr", "de") */
+  language?: string;
+
+  /** Identifier of the model to use for transcription */
+  model?: string;
+
+  /** MIME type of the audio (e.g., "audio/mpeg", "audio/wav") */
+  mimeType?: string;
+
+  /** Optional text prompt to guide transcription */
+  prompt?: string;
+
+  /** Sampling temperature for transcription (0.0 - 1.0) */
+  temperature?: number;
+}
+
+/**
+ * Result of a speech-to-text transcription; only text and language are always present.
+ */
+export interface TranscriptionResult {
+  /** Full transcribed text */
+  text: string;
+
+  /** Language detected or used */
+  language: string;
+
+  /** Duration of the audio in seconds (if available) */
+  durationSeconds?: number;
+
+  /** Confidence score (0.0 - 1.0, if available) */
+  confidence?: number;
+
+  /** Individual word or segment timings (if available) */
+  segments?: TranscriptionSegment[];
+}
+
+/**
+ * A timed segment within a transcription result (see TranscriptionResult.segments).
+ */
+export interface TranscriptionSegment {
+  /** Text of this segment */
+  text: string;
+
+  /** Start time in seconds */
+  start: number;
+
+  /** End time in seconds */
+  end: number;
+
+  /** Confidence for this segment (if available) */
+  confidence?: number;
+}
+
+// ==========================================
+// TTS Types
+// ==========================================
+
+/**
+ * Options for text-to-speech synthesis. Every field is optional.
+ */
+export interface SynthesizeOptions {
+  /** Voice ID to use */
+  voice?: string;
+
+  /** Desired audio format */
+  format?: AudioFormat;
+
+  /** Speech speed multiplier (0.5 - 2.0) */
+  speed?: number;
+
+  /** Preferred TTS tier (the tier actually used is reported in SynthesisResult.tier) */
+  tier?: SpeechTier;
+}
+
+/**
+ * Result of a text-to-speech synthesis: the audio plus metadata describing how it was produced.
+ */
+export interface SynthesisResult {
+  /** Synthesized audio data */
+  audio: Buffer;
+
+  /** Audio format of the result */
+  format: AudioFormat;
+
+  /** Voice used for synthesis */
+  voice: string;
+
+  /** Tier that produced the synthesis */
+  tier: SpeechTier;
+
+  /** Duration of the generated audio in seconds (if available) */
+  durationSeconds?: number;
+}
+
+/**
+ * Metadata describing an available TTS voice (as returned by ITTSProvider.listVoices).
+ */
+export interface VoiceInfo {
+  /** Voice identifier */
+  id: string;
+
+  /** Human-readable voice name */
+  name: string;
+
+  /** Language code (if available) */
+  language?: string;
+
+  /** Tier this voice belongs to */
+  tier: SpeechTier;
+
+  /** Whether this is the default voice for its tier (if specified) */
+  isDefault?: boolean;
+}
--- a/apps/api/src/speech/interfaces/stt-provider.interface.ts
+++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts
@@ -0,0 +1,52 @@
+/**
+ * STT Provider Interface
+ *
+ * Defines the contract for speech-to-text provider implementations.
+ * All STT providers (e.g., Speaches/faster-whisper) must implement this interface.
+ *
+ * Issue #389
+ */
+
+import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
+
+/**
+ * Contract for speech-to-text providers.
+ *
+ * Implementations wrap an OpenAI-compatible API endpoint for transcription.
+ *
+ * @example
+ * ```typescript
+ * class SpeachesProvider implements ISTTProvider {
+ *   readonly name = "speaches";
+ *
+ *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
+ *     // Call the Speaches API via the OpenAI SDK
+ *   }
+ *
+ *   async isHealthy(): Promise<boolean> {
+ *     // Check endpoint health
+ *   }
+ * }
+ * ```
+ */
+export interface ISTTProvider {
+  /** Provider name for logging and identification */
+  readonly name: string;
+
+  /**
+   * Transcribe audio data to text.
+   *
+   * @param audio - Raw audio data as a Buffer
+   * @param options - Optional transcription parameters (language, model, mimeType, prompt, temperature)
+   * @returns Promise resolving to the transcription result with text and metadata
+   * @throws {Error} If transcription fails
+   */
+  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;
+
+  /**
+   * Check if the provider is healthy and available.
+   *
+   * @returns Promise resolving to true if the provider endpoint is reachable and ready
+   */
+  isHealthy(): Promise<boolean>;
+}
--- a/apps/api/src/speech/interfaces/tts-provider.interface.ts
+++ b/apps/api/src/speech/interfaces/tts-provider.interface.ts
@@ -0,0 +1,68 @@
+/**
+ * TTS Provider Interface
+ *
+ * Defines the contract for text-to-speech provider implementations.
+ * All TTS providers (e.g., Kokoro, Chatterbox, Piper/OpenedAI) must implement this interface.
+ *
+ * Issue #389
+ */
+
+import type { SynthesizeOptions, SynthesisResult, VoiceInfo, SpeechTier } from "./speech-types";
+
+/**
+ * Contract for text-to-speech providers.
+ *
+ * Implementations wrap an OpenAI-compatible API endpoint for speech synthesis.
+ * Each provider is associated with a SpeechTier ("default", "premium", or "fallback").
+ *
+ * @example
+ * ```typescript
+ * class KokoroProvider implements ITTSProvider {
+ *   readonly name = "kokoro";
+ *   readonly tier = "default";
+ *
+ *   async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
+ *     // Call the Kokoro API via the OpenAI SDK
+ *   }
+ *
+ *   async listVoices(): Promise<VoiceInfo[]> {
+ *     // Return available voices
+ *   }
+ *
+ *   async isHealthy(): Promise<boolean> {
+ *     // Check endpoint health
+ *   }
+ * }
+ * ```
+ */
+export interface ITTSProvider {
+  /** Provider name for logging and identification */
+  readonly name: string;
+
+  /** Tier this provider serves (default, premium, fallback) */
+  readonly tier: SpeechTier;
+
+  /**
+   * Synthesize text to audio.
+   *
+   * @param text - Text to convert to speech
+   * @param options - Optional synthesis parameters (voice, format, speed, tier)
+   * @returns Promise resolving to the synthesis result with audio buffer and metadata
+   * @throws {Error} If synthesis fails
+   */
+  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;
+
+  /**
+   * List available voices for this provider.
+   *
+   * @returns Promise resolving to an array of voice information objects
+   */
+  listVoices(): Promise<VoiceInfo[]>;
+
+  /**
+   * Check if the provider is healthy and available.
+   *
+   * @returns Promise resolving to true if the provider endpoint is reachable and ready
+   */
+  isHealthy(): Promise<boolean>;
+}