Files
stack/apps/api/src/speech/providers/chatterbox-tts.provider.ts
Jason Woltje 79b1d81d27
Some checks failed
ci/woodpecker/push/api Pipeline failed
feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog
Extract KokoroTtsProvider from factory into its own module with:
- Full voice catalog of 54 built-in voices across 8 languages
- Voice metadata parsing from ID prefix (language, gender, accent)
- Exported constants for supported formats and speed range
- Comprehensive unit tests (48 tests)
- Fix lint/type errors in chatterbox provider (Prettier + unsafe cast)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:27:47 -06:00

170 lines
5.3 KiB
TypeScript

/**
* Chatterbox TTS Provider
*
* Premium-tier TTS provider with voice cloning and emotion exaggeration support.
* Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
* parameters for voice cloning (reference_audio) and emotion control (exaggeration).
*
* Key capabilities:
* - Voice cloning via reference audio sample
* - Emotion exaggeration control (0.0 - 1.0)
* - Cross-language voice transfer (23 languages)
* - Graceful degradation when GPU is unavailable (isHealthy returns false)
*
* The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
*
* Issue #394
*/
import type { SpeechCreateParams } from "openai/resources/audio/speech";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
/** Voice identifier used when the caller does not select one. */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Audio container used by default — WAV, the lossless/highest-quality option. */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Model name sent to the OpenAI-compatible endpoint. */
const DEFAULT_MODEL = "tts-1";

/** Playback-speed multiplier applied when none is requested (1.0 = normal). */
const DEFAULT_SPEED = 1.0;
/**
 * Languages supported by Chatterbox for cross-language voice transfer.
 * Chatterbox supports 23 languages for voice cloning and synthesis.
 *
 * Note: `satisfies readonly string[]` validates the shape without widening,
 * so the element type stays the literal union (the previous explicit
 * `readonly string[]` annotation widened the variable and made the trailing
 * `as const` ineffective for consumers).
 */
const SUPPORTED_LANGUAGES = [
  "en", // English
  "fr", // French
  "de", // German
  "es", // Spanish
  "it", // Italian
  "pt", // Portuguese
  "nl", // Dutch
  "pl", // Polish
  "ru", // Russian
  "uk", // Ukrainian
  "ja", // Japanese
  "zh", // Chinese
  "ko", // Korean
  "ar", // Arabic
  "hi", // Hindi
  "tr", // Turkish
  "sv", // Swedish
  "da", // Danish
  "fi", // Finnish
  "no", // Norwegian
  "cs", // Czech
  "el", // Greek
  "ro", // Romanian
] as const satisfies readonly string[];
/**
* Chatterbox TTS provider (premium tier).
*
* Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
* The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
* body parameters for its advanced features.
*
* @example
* ```typescript
* const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
*
* // Basic synthesis
* const result = await provider.synthesize("Hello!");
*
* // Voice cloning with emotion
* const clonedResult = await provider.synthesize("Hello!", {
* referenceAudio: myAudioBuffer,
* emotionExaggeration: 0.7,
* });
* ```
*/
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
   * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
   *
   * Chatterbox-specific options are detected with `in`-operator narrowing
   * rather than an unchecked `as` cast, so the compiler verifies the
   * properties actually exist on the narrowed option type.
   *
   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
   * which Chatterbox's API accepts alongside the standard parameters.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Add Chatterbox-specific params if provided. `in` checks narrow the
    // union to ChatterboxSynthesizeOptions, replacing the previous unsafe
    // `options as ChatterboxSynthesizeOptions` assertion.
    if (options && "referenceAudio" in options && options.referenceAudio) {
      requestBody.reference_audio = options.referenceAudio.toString("base64");
    }
    if (
      options &&
      "emotionExaggeration" in options &&
      options.emotionExaggeration !== undefined
    ) {
      // Clamp to valid range [0.0, 1.0]
      requestBody.exaggeration = Math.max(
        0.0,
        Math.min(1.0, options.emotionExaggeration)
      );
    }

    try {
      // Use the OpenAI SDK's create method, passing extra params.
      // SpeechCreateParams does not declare the Chatterbox extras, so the
      // double cast is deliberate: the SDK passes unknown body fields through.
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );
      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);
      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}