feat(#389): create SpeechModule with provider abstraction layer
All checks were successful
ci/woodpecker/push/api Pipeline was successful
All checks were successful
ci/woodpecker/push/api Pipeline was successful
Add SpeechModule with provider interfaces and a service skeleton for multi-tier TTS fallback (premium -> default -> fallback) and STT transcription support. Includes 27 unit tests covering provider selection, fallback logic, and availability checks.

- ISTTProvider interface with transcribe/isHealthy methods
- ITTSProvider interface with synthesize/listVoices/isHealthy methods
- Shared types: SpeechTier, TranscriptionResult, SynthesisResult, etc.
- SpeechService with graceful TTS fallback chain
- NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS)
- SpeechModule registered in AppModule
- ConfigModule integration via speechConfig registerAs factory

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
18
apps/api/src/speech/interfaces/index.ts
Normal file
18
apps/api/src/speech/interfaces/index.ts
Normal file
@@ -0,0 +1,18 @@
|
||||
/**
|
||||
* Speech interfaces barrel export.
|
||||
*
|
||||
* Issue #389
|
||||
*/
|
||||
|
||||
export type { ISTTProvider } from "./stt-provider.interface";
|
||||
export type { ITTSProvider } from "./tts-provider.interface";
|
||||
export type {
|
||||
SpeechTier,
|
||||
AudioFormat,
|
||||
TranscribeOptions,
|
||||
TranscriptionResult,
|
||||
TranscriptionSegment,
|
||||
SynthesizeOptions,
|
||||
SynthesisResult,
|
||||
VoiceInfo,
|
||||
} from "./speech-types";
|
||||
149
apps/api/src/speech/interfaces/speech-types.ts
Normal file
149
apps/api/src/speech/interfaces/speech-types.ts
Normal file
@@ -0,0 +1,149 @@
|
||||
/**
|
||||
* Speech Types
|
||||
*
|
||||
* Shared types for speech-to-text (STT) and text-to-speech (TTS) services.
|
||||
* Used by provider interfaces and the SpeechService.
|
||||
*
|
||||
* Issue #389
|
||||
*/
|
||||
|
||||
// ==========================================
|
||||
// Enums / Discriminators
|
||||
// ==========================================
|
||||
|
||||
/**
 * TTS provider tier.
 *
 * Determines which TTS engine is used for synthesis:
 *
 * - `"default"`: primary TTS engine (e.g., Kokoro)
 * - `"premium"`: higher-quality TTS engine (e.g., Chatterbox)
 * - `"fallback"`: backup TTS engine (e.g., Piper/OpenedAI)
 */
export type SpeechTier = "default" | "premium" | "fallback";
|
||||
|
||||
/**
 * Audio output format for TTS synthesis.
 *
 * Used both to request a format (`SynthesizeOptions.format`) and to report
 * the format of the audio that was produced (`SynthesisResult.format`).
 */
export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm";
|
||||
|
||||
// ==========================================
|
||||
// STT Types
|
||||
// ==========================================
|
||||
|
||||
/**
 * Options for speech-to-text transcription.
 *
 * Every field is optional; an empty object is a valid value.
 */
export interface TranscribeOptions {
  /** Language code (e.g., "en", "fr", "de") */
  language?: string;

  /** Model to use for transcription */
  model?: string;

  /** MIME type of the audio (e.g., "audio/mp3", "audio/wav") */
  mimeType?: string;

  /** Optional prompt to guide transcription */
  prompt?: string;

  /** Temperature for transcription (0.0 - 1.0) */
  temperature?: number;
}
|
||||
|
||||
/**
 * Result of a speech-to-text transcription.
 *
 * `text` and `language` are always present; the remaining fields are
 * optional metadata that may be omitted.
 */
export interface TranscriptionResult {
  /** Transcribed text */
  text: string;

  /** Language detected or used */
  language: string;

  /** Duration of the audio in seconds */
  durationSeconds?: number;

  /** Confidence score (0.0 - 1.0, if available) */
  confidence?: number;

  /** Individual word or segment timings (if available) */
  segments?: TranscriptionSegment[];
}
|
||||
|
||||
/**
 * A segment within a transcription result.
 *
 * `start` and `end` are expressed in seconds.
 */
export interface TranscriptionSegment {
  /** Segment text */
  text: string;

  /** Start time in seconds */
  start: number;

  /** End time in seconds */
  end: number;

  /** Confidence for this segment */
  confidence?: number;
}
|
||||
|
||||
// ==========================================
|
||||
// TTS Types
|
||||
// ==========================================
|
||||
|
||||
/**
 * Options for text-to-speech synthesis.
 *
 * Every field is optional; an empty object is a valid value.
 */
export interface SynthesizeOptions {
  /** Voice ID to use */
  voice?: string;

  /** Desired audio format */
  format?: AudioFormat;

  /** Speech speed multiplier (0.5 - 2.0) */
  speed?: number;

  /** Preferred TTS tier */
  tier?: SpeechTier;
}
|
||||
|
||||
/**
 * Result of a text-to-speech synthesis.
 *
 * Carries the audio bytes together with the format, voice and tier that
 * describe them.
 */
export interface SynthesisResult {
  /** Synthesized audio data */
  audio: Buffer;

  /** Audio format of the result */
  format: AudioFormat;

  /** Voice used for synthesis */
  voice: string;

  /** Tier that produced the synthesis */
  tier: SpeechTier;

  /** Duration of the generated audio in seconds (if available) */
  durationSeconds?: number;
}
|
||||
|
||||
/**
 * Information about an available TTS voice.
 */
export interface VoiceInfo {
  /** Voice identifier */
  id: string;

  /** Human-readable voice name */
  name: string;

  /** Language code */
  language?: string;

  /** Tier this voice belongs to */
  tier: SpeechTier;

  /** Whether this is the default voice for its tier */
  isDefault?: boolean;
}
|
||||
52
apps/api/src/speech/interfaces/stt-provider.interface.ts
Normal file
52
apps/api/src/speech/interfaces/stt-provider.interface.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
/**
|
||||
* STT Provider Interface
|
||||
*
|
||||
* Defines the contract for speech-to-text provider implementations.
|
||||
* All STT providers (e.g., Speaches/faster-whisper) must implement this interface.
|
||||
*
|
||||
* Issue #389
|
||||
*/
|
||||
|
||||
import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
|
||||
|
||||
/**
 * Interface for speech-to-text providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for transcription.
 *
 * @example
 * ```typescript
 * class SpeachesProvider implements ISTTProvider {
 *   readonly name = "speaches";
 *
 *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
 *     // Call speaches API via OpenAI SDK
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ISTTProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /**
   * Transcribe audio data to text.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters
   * @returns Transcription result with text and metadata
   * @throws {Error} If transcription fails
   */
  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;

  /**
   * Check if the provider is healthy and available.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}
|
||||
68
apps/api/src/speech/interfaces/tts-provider.interface.ts
Normal file
68
apps/api/src/speech/interfaces/tts-provider.interface.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
/**
|
||||
* TTS Provider Interface
|
||||
*
|
||||
* Defines the contract for text-to-speech provider implementations.
|
||||
* All TTS providers (e.g., Kokoro, Chatterbox, Piper/OpenedAI) must implement this interface.
|
||||
*
|
||||
* Issue #389
|
||||
*/
|
||||
|
||||
import type { SynthesizeOptions, SynthesisResult, VoiceInfo, SpeechTier } from "./speech-types";
|
||||
|
||||
/**
 * Interface for text-to-speech providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for speech synthesis.
 * Each provider is associated with a SpeechTier (default, premium, fallback).
 *
 * @example
 * ```typescript
 * class KokoroProvider implements ITTSProvider {
 *   readonly name = "kokoro";
 *   readonly tier = "default";
 *
 *   async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
 *     // Call Kokoro API via OpenAI SDK
 *   }
 *
 *   async listVoices(): Promise<VoiceInfo[]> {
 *     // Return available voices
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ITTSProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /** Tier this provider serves (default, premium, fallback) */
  readonly tier: SpeechTier;

  /**
   * Synthesize text to audio.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, speed, preferred tier)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails
   */
  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;

  /**
   * List available voices for this provider.
   *
   * @returns Array of voice information objects
   */
  listVoices(): Promise<VoiceInfo[]>;

  /**
   * Check if the provider is healthy and available.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}
|
||||
Reference in New Issue
Block a user