feat(#394): implement Chatterbox TTS provider with voice cloning

Add ChatterboxSynthesizeOptions interface with referenceAudio and emotionExaggeration fields, and comprehensive unit tests (26 tests) covering voice cloning, emotion control, clamping, graceful degradation, and cross-language support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:29:38 -06:00
parent 79b1d81d27
commit d37c78f503
2 changed files with 463 additions and 0 deletions
--- a/apps/api/src/speech/interfaces/speech-types.ts
+++ b/apps/api/src/speech/interfaces/speech-types.ts
@@ -128,6 +128,33 @@ export interface SynthesisResult {
  durationSeconds?: number;
 }

+/**
+ * Extended options for Chatterbox TTS synthesis.
+ *
+ * Adds two optional fields to the inherited SynthesizeOptions: a reference
+ * audio buffer for voice cloning and an emotion exaggeration factor. Both
+ * are passed as extra body parameters to the OpenAI-compatible API.
+ *
+ * Issue #394
+ */
+export interface ChatterboxSynthesizeOptions extends SynthesizeOptions {
+  /**
+   * Optional reference audio buffer for voice cloning.
+   * When provided, Chatterbox will clone the voice from this audio sample.
+   * Should hold a WAV or MP3 recording of 5-30 seconds for best results.
+   */
+  referenceAudio?: Buffer;
+
+  /**
+   * Optional emotion exaggeration factor; expected range 0.0 to 1.0 (not type-enforced).
+   * Controls how much emotional expression is applied to the synthesized speech.
+   * - 0.0: Neutral, minimal emotion
+   * - 0.5: Moderate emotion (default when not specified)
+   * - 1.0: Maximum emotion exaggeration
+   */
+  emotionExaggeration?: number;
+}
+
 /**
 * Information about an available TTS voice.
 */