Some checks failed
ci/woodpecker/push/api Pipeline failed
Extract KokoroTtsProvider from factory into its own module with: - Full voice catalog of 54 built-in voices across 8 languages - Voice metadata parsing from ID prefix (language, gender, accent) - Exported constants for supported formats and speed range - Comprehensive unit tests (48 tests) - Fix lint/type errors in chatterbox provider (Prettier + unsafe cast) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
170 lines
5.3 KiB
TypeScript
170 lines
5.3 KiB
TypeScript
/**
 * Chatterbox TTS Provider
 *
 * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
 * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
 * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
 *
 * Key capabilities:
 * - Voice cloning via reference audio sample
 * - Emotion exaggeration control (0.0 - 1.0)
 * - Cross-language voice transfer (23 languages)
 * - Graceful degradation when GPU is unavailable (isHealthy returns false)
 *
 * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
 *
 * Issue #394
 */
|
|
|
|
import type { SpeechCreateParams } from "openai/resources/audio/speech";
|
|
import { BaseTTSProvider } from "./base-tts.provider";
|
|
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
|
|
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
|
|
|
|
/** Voice identifier handed to the base provider as the Chatterbox default. */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Audio format handed to the base provider as the Chatterbox default (WAV for highest quality). */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Model identifier placed in the `model` field of synthesis requests. */
const DEFAULT_MODEL = "tts-1";

/** Speech speed multiplier used when the caller does not supply one. */
const DEFAULT_SPEED = 1.0;
|
|
|
|
/**
 * Language codes Chatterbox accepts for voice cloning, synthesis and
 * cross-language voice transfer (23 entries).
 */
const SUPPORTED_LANGUAGES: readonly string[] = [
  "en", // English
  "fr", // French
  "de", // German
  "es", // Spanish
  "it", // Italian
  "pt", // Portuguese
  "nl", // Dutch
  "pl", // Polish
  "ru", // Russian
  "uk", // Ukrainian
  "ja", // Japanese
  "zh", // Chinese
  "ko", // Korean
  "ar", // Arabic
  "hi", // Hindi
  "tr", // Turkish
  "sv", // Swedish
  "da", // Danish
  "fi", // Finnish
  "no", // Norwegian
  "cs", // Czech
  "el", // Greek
  "ro", // Romanian
];
|
|
|
|
/**
 * Chatterbox TTS provider (premium tier).
 *
 * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
 * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
 * body parameters for its advanced features.
 *
 * @example
 * ```typescript
 * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
 *
 * // Basic synthesis
 * const result = await provider.synthesize("Hello!");
 *
 * // Voice cloning with emotion
 * const clonedResult = await provider.synthesize("Hello!", {
 *   referenceAudio: myAudioBuffer,
 *   emotionExaggeration: 0.7,
 * });
 * ```
 */
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  /**
   * @param baseURL - Base URL forwarded to BaseTTSProvider together with the
   *   Chatterbox default voice and default format
   */
  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64
   *   in the `reference_audio` body field)
   * - `emotionExaggeration`: Emotion intensity factor, clamped to [0.0, 1.0] and
   *   sent as the `exaggeration` body field. A NaN value is treated as "not
   *   provided" and the field is omitted.
   *
   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
   * alongside the standard `model`, `input`, `voice`, `response_format` and
   * `speed` fields.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with the audio buffer plus the format, voice and tier used
   * @throws {Error} If the request or reading the response fails; the failure is
   *   logged and rethrown with the provider name and the original message
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    // Fall back to the provider defaults for anything the caller left out.
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Add Chatterbox-specific params if provided
    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;

    if (chatterboxOptions?.referenceAudio) {
      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
    }

    const exaggeration = chatterboxOptions?.emotionExaggeration;

    // NaN survives Math.min/Math.max unchanged and would then be serialized as
    // JSON `null`, so it is skipped instead of being sent as a bogus value.
    if (exaggeration !== undefined && !Number.isNaN(exaggeration)) {
      // Clamp to valid range [0.0, 1.0]
      requestBody.exaggeration = Math.max(0.0, Math.min(1.0, exaggeration));
    }

    try {
      // Use the OpenAI SDK's create method, passing extra params.
      // The cast is needed because the Chatterbox-only fields are not part of
      // SpeechCreateParams; this relies on the SDK passing them through.
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );

      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);

      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}
|