feat(#389): create SpeechModule with provider abstraction layer
All checks were successful
ci/woodpecker/push/api Pipeline was successful

Add SpeechModule with provider interfaces and service skeleton for
multi-tier TTS fallback (premium -> default -> fallback) and STT
transcription support. Includes 27 unit tests covering provider
selection, fallback logic, and availability checks.

- ISTTProvider interface with transcribe/isHealthy methods
- ITTSProvider interface with synthesize/listVoices/isHealthy methods
- Shared types: SpeechTier, TranscriptionResult, SynthesisResult, etc.
- SpeechService with graceful TTS fallback chain
- NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS)
- SpeechModule registered in AppModule
- ConfigModule integration via speechConfig registerAs factory

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 02:09:45 -06:00
parent 52553c8266
commit c40373fa3b
9 changed files with 1129 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
/**
 * Speech interfaces barrel export.
 *
 * Re-exports the STT/TTS provider contracts and their shared types so
 * consumers can import from the module root instead of individual files.
 * All re-exports use `export type`, so this file is fully erased at
 * compile time and emits no runtime code (safe under `isolatedModules`).
 *
 * Issue #389
 */
export type { ISTTProvider } from "./stt-provider.interface";
export type { ITTSProvider } from "./tts-provider.interface";
export type {
  SpeechTier,
  AudioFormat,
  TranscribeOptions,
  TranscriptionResult,
  TranscriptionSegment,
  SynthesizeOptions,
  SynthesisResult,
  VoiceInfo,
} from "./speech-types";

View File

@@ -0,0 +1,149 @@
/**
* Speech Types
*
* Shared types for speech-to-text (STT) and text-to-speech (TTS) services.
* Used by provider interfaces and the SpeechService.
*
* Issue #389
*/
// ==========================================
// Enums / Discriminators
// ==========================================

/**
 * TTS provider tier.
 * Determines which TTS engine is used for synthesis.
 *
 * - default: Primary TTS engine (e.g., Kokoro)
 * - premium: Higher quality TTS engine (e.g., Chatterbox)
 * - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
 *
 * NOTE(review): the commit description states the SpeechService fallback
 * chain is premium -> default -> fallback; confirm the ordering against
 * the SpeechService implementation before relying on it here.
 */
export type SpeechTier = "default" | "premium" | "fallback";

/**
 * Audio output format for TTS synthesis.
 *
 * NOTE(review): these values appear to mirror the OpenAI speech API
 * `response_format` options — confirm each downstream provider actually
 * supports every listed format.
 */
export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm";
// ==========================================
// STT Types
// ==========================================

/**
 * Options for speech-to-text transcription.
 *
 * All fields are optional; providers apply their own defaults for any
 * field left unset.
 */
export interface TranscribeOptions {
  /** Language code (e.g., "en", "fr", "de") */
  language?: string;
  /** Model to use for transcription */
  model?: string;
  /** MIME type of the audio (e.g., "audio/mp3", "audio/wav") */
  mimeType?: string;
  /** Optional prompt to guide transcription */
  prompt?: string;
  /**
   * Temperature for transcription (0.0 - 1.0).
   * NOTE(review): range is documented but not enforced by the type —
   * providers should validate or clamp at runtime.
   */
  temperature?: number;
}

/**
 * Result of a speech-to-text transcription.
 */
export interface TranscriptionResult {
  /** Transcribed text */
  text: string;
  /** Language detected or used */
  language: string;
  /** Duration of the audio in seconds */
  durationSeconds?: number;
  /** Confidence score (0.0 - 1.0, if available) */
  confidence?: number;
  /** Individual word or segment timings (if available) */
  segments?: TranscriptionSegment[];
}

/**
 * A segment within a transcription result.
 *
 * Timings are offsets into the source audio, in seconds.
 */
export interface TranscriptionSegment {
  /** Segment text */
  text: string;
  /** Start time in seconds */
  start: number;
  /** End time in seconds */
  end: number;
  /** Confidence for this segment (0.0 - 1.0, if available) */
  confidence?: number;
}
// ==========================================
// TTS Types
// ==========================================

/**
 * Options for text-to-speech synthesis.
 *
 * All fields are optional; the service/provider chooses defaults for any
 * field left unset (e.g., tier selection feeds the fallback chain).
 */
export interface SynthesizeOptions {
  /** Voice ID to use */
  voice?: string;
  /** Desired audio format */
  format?: AudioFormat;
  /**
   * Speech speed multiplier (0.5 - 2.0).
   * NOTE(review): range is documented but not enforced by the type —
   * providers should validate or clamp at runtime.
   */
  speed?: number;
  /** Preferred TTS tier */
  tier?: SpeechTier;
}

/**
 * Result of a text-to-speech synthesis.
 *
 * `tier` reports which engine actually produced the audio, which may
 * differ from the requested tier when the service falls back.
 */
export interface SynthesisResult {
  /** Synthesized audio data */
  audio: Buffer;
  /** Audio format of the result */
  format: AudioFormat;
  /** Voice used for synthesis */
  voice: string;
  /** Tier that produced the synthesis */
  tier: SpeechTier;
  /** Duration of the generated audio in seconds (if available) */
  durationSeconds?: number;
}

/**
 * Information about an available TTS voice.
 */
export interface VoiceInfo {
  /** Voice identifier */
  id: string;
  /** Human-readable voice name */
  name: string;
  /** Language code */
  language?: string;
  /** Tier this voice belongs to */
  tier: SpeechTier;
  /** Whether this is the default voice for its tier */
  isDefault?: boolean;
}

View File

@@ -0,0 +1,52 @@
/**
* STT Provider Interface
*
* Defines the contract for speech-to-text provider implementations.
* All STT providers (e.g., Speaches/faster-whisper) must implement this interface.
*
* Issue #389
*/
import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
/**
 * Interface for speech-to-text providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for transcription.
 * Concrete providers are bound to this contract via the STT_PROVIDER
 * NestJS injection token (per the commit description).
 *
 * @example
 * ```typescript
 * class SpeachesProvider implements ISTTProvider {
 *   readonly name = "speaches";
 *
 *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
 *     // Call speaches API via OpenAI SDK
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ISTTProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /**
   * Transcribe audio data to text.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters
   * @returns Transcription result with text and metadata
   * @throws {Error} If transcription fails
   */
  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;

  /**
   * Check if the provider is healthy and available.
   *
   * Expected to resolve rather than throw on an unreachable endpoint —
   * TODO confirm implementations return false instead of rejecting.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}

View File

@@ -0,0 +1,68 @@
/**
* TTS Provider Interface
*
* Defines the contract for text-to-speech provider implementations.
* All TTS providers (e.g., Kokoro, Chatterbox, Piper/OpenedAI) must implement this interface.
*
* Issue #389
*/
import type { SynthesizeOptions, SynthesisResult, VoiceInfo, SpeechTier } from "./speech-types";
/**
 * Interface for text-to-speech providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for speech synthesis.
 * Each provider is associated with a SpeechTier (default, premium, fallback);
 * the SpeechService uses the tier to select a provider and to walk its
 * fallback chain. Concrete providers are bound via the TTS_PROVIDERS
 * NestJS injection token (per the commit description).
 *
 * @example
 * ```typescript
 * class KokoroProvider implements ITTSProvider {
 *   readonly name = "kokoro";
 *   readonly tier = "default";
 *
 *   async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
 *     // Call Kokoro API via OpenAI SDK
 *   }
 *
 *   async listVoices(): Promise<VoiceInfo[]> {
 *     // Return available voices
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ITTSProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /** Tier this provider serves (default, premium, fallback) */
  readonly tier: SpeechTier;

  /**
   * Synthesize text to audio.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, speed)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails
   */
  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;

  /**
   * List available voices for this provider.
   *
   * @returns Array of voice information objects
   */
  listVoices(): Promise<VoiceInfo[]>;

  /**
   * Check if the provider is healthy and available.
   *
   * Expected to resolve rather than throw on an unreachable endpoint —
   * TODO confirm implementations return false instead of rejecting.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}