/**
 * Chatterbox TTS Provider
 *
 * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
 * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
 * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
 *
 * Key capabilities:
 * - Voice cloning via reference audio sample
 * - Emotion exaggeration control (0.0 - 1.0)
 * - Cross-language voice transfer (23 languages)
 * - Graceful degradation when GPU is unavailable (isHealthy returns false)
 *
 * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
 *
 * Issue #394
 */

import type { SpeechCreateParams } from "openai/resources/audio/speech";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";

/** Default voice for Chatterbox */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Default audio format for Chatterbox (WAV for highest quality) */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Default TTS model identifier */
const DEFAULT_MODEL = "tts-1";

/** Default speech speed multiplier */
const DEFAULT_SPEED = 1.0;

/** Lower bound of the emotion exaggeration range sent to the server */
const MIN_EXAGGERATION = 0.0;

/** Upper bound of the emotion exaggeration range sent to the server */
const MAX_EXAGGERATION = 1.0;

/**
 * Languages supported by Chatterbox for cross-language voice transfer.
 * Chatterbox supports 23 languages for voice cloning and synthesis.
 */
const SUPPORTED_LANGUAGES: readonly string[] = [
  "en", // English
  "fr", // French
  "de", // German
  "es", // Spanish
  "it", // Italian
  "pt", // Portuguese
  "nl", // Dutch
  "pl", // Polish
  "ru", // Russian
  "uk", // Ukrainian
  "ja", // Japanese
  "zh", // Chinese
  "ko", // Korean
  "ar", // Arabic
  "hi", // Hindi
  "tr", // Turkish
  "sv", // Swedish
  "da", // Danish
  "fi", // Finnish
  "no", // Norwegian
  "cs", // Czech
  "el", // Greek
  "ro", // Romanian
] as const;

/**
 * Clamp an emotion exaggeration factor into [MIN_EXAGGERATION, MAX_EXAGGERATION].
 *
 * @param value - Caller-supplied exaggeration factor
 * @returns The clamped factor, or `undefined` when `value` is NaN. NaN survives
 *   `Math.min`/`Math.max` unchanged and would be JSON-serialized as `null`, so
 *   the caller should omit the parameter instead of sending it.
 */
function clampExaggeration(value: number): number | undefined {
  if (Number.isNaN(value)) {
    return undefined;
  }
  return Math.max(MIN_EXAGGERATION, Math.min(MAX_EXAGGERATION, value));
}

/**
 * Chatterbox TTS provider (premium tier).
 *
 * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
 * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
 * body parameters for its advanced features.
 *
 * @example
 * ```typescript
 * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
 *
 * // Basic synthesis
 * const result = await provider.synthesize("Hello!");
 *
 * // Voice cloning with emotion
 * const clonedResult = await provider.synthesize("Hello!", {
 *   referenceAudio: myAudioBuffer,
 *   emotionExaggeration: 0.7,
 * });
 * ```
 */
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  /**
   * @param baseURL - Base URL of the Chatterbox server's OpenAI-compatible API
   */
  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
   * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped;
   *   a NaN value is ignored and the parameter is not sent)
   *
   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
   * which Chatterbox's API accepts alongside the standard parameters.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Add Chatterbox-specific params if provided. Plain SynthesizeOptions simply
    // lack these fields, so viewing them through the wider type is safe.
    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;

    if (chatterboxOptions?.referenceAudio) {
      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
    }

    const requestedExaggeration = chatterboxOptions?.emotionExaggeration;
    if (requestedExaggeration !== undefined) {
      const exaggeration = clampExaggeration(requestedExaggeration);
      // Only send a usable number; a NaN input would otherwise go out as `null`.
      if (exaggeration !== undefined) {
        requestBody.exaggeration = exaggeration;
      }
    }

    try {
      // Use the OpenAI SDK's create method, passing extra params.
      // The SDK's parameter type does not declare the Chatterbox extras, hence
      // the cast; the body object itself is forwarded as-is.
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );

      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);

      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}