feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog
Some checks failed: ci/woodpecker/push/api pipeline failed.
Extract KokoroTtsProvider from factory into its own module with: - Full voice catalog of 54 built-in voices across 8 languages - Voice metadata parsing from ID prefix (language, gender, accent) - Exported constants for supported formats and speed range - Comprehensive unit tests (48 tests) - Fix lint/type errors in chatterbox provider (Prettier + unsafe cast) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
@@ -0,0 +1,169 @@
/**
 * Chatterbox TTS Provider
 *
 * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
 * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
 * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
 *
 * Key capabilities:
 * - Voice cloning via reference audio sample
 * - Emotion exaggeration control (0.0 - 1.0)
 * - Cross-language voice transfer (23 languages)
 * - Graceful degradation when GPU is unavailable (isHealthy returns false)
 *
 * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
 *
 * Issue #394
 */
import type { SpeechCreateParams } from "openai/resources/audio/speech";

import { BaseTTSProvider } from "./base-tts.provider";
import type {
  ChatterboxSynthesizeOptions,
  SpeechTier,
  SynthesisResult,
  SynthesizeOptions,
} from "../interfaces/speech-types";
/** Default voice identifier sent when the caller does not specify one. */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Default audio format for Chatterbox (WAV for highest quality). */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Model identifier passed to the OpenAI-compatible /audio/speech endpoint. */
const DEFAULT_MODEL = "tts-1";

/** Speech speed multiplier used when options.speed is absent (1.0 = normal). */
const DEFAULT_SPEED = 1.0;
/**
|
||||
* Languages supported by Chatterbox for cross-language voice transfer.
|
||||
* Chatterbox supports 23 languages for voice cloning and synthesis.
|
||||
*/
|
||||
const SUPPORTED_LANGUAGES: readonly string[] = [
|
||||
"en", // English
|
||||
"fr", // French
|
||||
"de", // German
|
||||
"es", // Spanish
|
||||
"it", // Italian
|
||||
"pt", // Portuguese
|
||||
"nl", // Dutch
|
||||
"pl", // Polish
|
||||
"ru", // Russian
|
||||
"uk", // Ukrainian
|
||||
"ja", // Japanese
|
||||
"zh", // Chinese
|
||||
"ko", // Korean
|
||||
"ar", // Arabic
|
||||
"hi", // Hindi
|
||||
"tr", // Turkish
|
||||
"sv", // Swedish
|
||||
"da", // Danish
|
||||
"fi", // Finnish
|
||||
"no", // Norwegian
|
||||
"cs", // Czech
|
||||
"el", // Greek
|
||||
"ro", // Romanian
|
||||
] as const;
|
||||
|
||||
/**
|
||||
* Chatterbox TTS provider (premium tier).
|
||||
*
|
||||
* Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
|
||||
* The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
|
||||
* body parameters for its advanced features.
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
|
||||
*
|
||||
* // Basic synthesis
|
||||
* const result = await provider.synthesize("Hello!");
|
||||
*
|
||||
* // Voice cloning with emotion
|
||||
* const clonedResult = await provider.synthesize("Hello!", {
|
||||
* referenceAudio: myAudioBuffer,
|
||||
* emotionExaggeration: 0.7,
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export class ChatterboxTTSProvider extends BaseTTSProvider {
|
||||
readonly name = "chatterbox";
|
||||
readonly tier: SpeechTier = "premium";
|
||||
|
||||
/**
|
||||
* Languages supported for cross-language voice transfer.
|
||||
*/
|
||||
readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;
|
||||
|
||||
constructor(baseURL: string) {
|
||||
super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Synthesize text to audio with optional voice cloning and emotion control.
|
||||
*
|
||||
* Overrides the base synthesize() to support Chatterbox-specific options:
|
||||
* - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
|
||||
* - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
|
||||
*
|
||||
* These are passed as extra body parameters to the OpenAI-compatible endpoint,
|
||||
* which Chatterbox's API accepts alongside the standard parameters.
|
||||
*
|
||||
* @param text - Text to convert to speech
|
||||
* @param options - Synthesis options, optionally including Chatterbox-specific params
|
||||
* @returns Synthesis result with audio buffer and metadata
|
||||
* @throws {Error} If synthesis fails (e.g., GPU unavailable)
|
||||
*/
|
||||
async synthesize(
|
||||
text: string,
|
||||
options?: SynthesizeOptions | ChatterboxSynthesizeOptions
|
||||
): Promise<SynthesisResult> {
|
||||
const voice = options?.voice ?? this.defaultVoice;
|
||||
const format = options?.format ?? this.defaultFormat;
|
||||
const speed = options?.speed ?? DEFAULT_SPEED;
|
||||
|
||||
// Build the request body with standard OpenAI-compatible params
|
||||
const requestBody: Record<string, unknown> = {
|
||||
model: DEFAULT_MODEL,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: format,
|
||||
speed,
|
||||
};
|
||||
|
||||
// Add Chatterbox-specific params if provided
|
||||
const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;
|
||||
|
||||
if (chatterboxOptions?.referenceAudio) {
|
||||
requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
|
||||
}
|
||||
|
||||
if (chatterboxOptions?.emotionExaggeration !== undefined) {
|
||||
// Clamp to valid range [0.0, 1.0]
|
||||
requestBody.exaggeration = Math.max(
|
||||
0.0,
|
||||
Math.min(1.0, chatterboxOptions.emotionExaggeration)
|
||||
);
|
||||
}
|
||||
|
||||
try {
|
||||
// Use the OpenAI SDK's create method, passing extra params
|
||||
// The OpenAI SDK allows additional body params to be passed through
|
||||
const response = await this.client.audio.speech.create(
|
||||
requestBody as unknown as SpeechCreateParams
|
||||
);
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
const audio = Buffer.from(arrayBuffer);
|
||||
|
||||
return {
|
||||
audio,
|
||||
format,
|
||||
voice,
|
||||
tier: this.tier,
|
||||
};
|
||||
} catch (error: unknown) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
this.logger.error(`TTS synthesis failed: ${message}`);
|
||||
throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user