feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog
Some checks failed: ci/woodpecker/push/api pipeline failed.
Extract KokoroTtsProvider from factory into its own module with: - Full voice catalog of 54 built-in voices across 8 languages - Voice metadata parsing from ID prefix (language, gender, accent) - Exported constants for supported formats and speed range - Comprehensive unit tests (48 tests) - Fix lint/type errors in chatterbox provider (Prettier + unsafe cast) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
@@ -0,0 +1,169 @@
/**
 * Chatterbox TTS Provider
 *
 * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
 * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
 * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
 *
 * Key capabilities:
 * - Voice cloning via reference audio sample
 * - Emotion exaggeration control (0.0 - 1.0)
 * - Cross-language voice transfer (23 languages)
 * - Graceful degradation when GPU is unavailable (isHealthy returns false)
 *
 * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
 *
 * Issue #394
 */
import type { SpeechCreateParams } from "openai/resources/audio/speech";

import { BaseTTSProvider } from "./base-tts.provider";
import type {
  ChatterboxSynthesizeOptions,
  SpeechTier,
  SynthesisResult,
  SynthesizeOptions,
} from "../interfaces/speech-types";
/** Default voice identifier sent when the caller does not specify one. */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Default audio format for Chatterbox (WAV for highest quality). */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Model identifier passed to the OpenAI-compatible /audio/speech endpoint. */
const DEFAULT_MODEL = "tts-1";

/** Speech speed multiplier used when options.speed is absent (1.0 = normal). */
const DEFAULT_SPEED = 1.0;
/**
|
||||
* Languages supported by Chatterbox for cross-language voice transfer.
|
||||
* Chatterbox supports 23 languages for voice cloning and synthesis.
|
||||
*/
|
||||
const SUPPORTED_LANGUAGES: readonly string[] = [
|
||||
"en", // English
|
||||
"fr", // French
|
||||
"de", // German
|
||||
"es", // Spanish
|
||||
"it", // Italian
|
||||
"pt", // Portuguese
|
||||
"nl", // Dutch
|
||||
"pl", // Polish
|
||||
"ru", // Russian
|
||||
"uk", // Ukrainian
|
||||
"ja", // Japanese
|
||||
"zh", // Chinese
|
||||
"ko", // Korean
|
||||
"ar", // Arabic
|
||||
"hi", // Hindi
|
||||
"tr", // Turkish
|
||||
"sv", // Swedish
|
||||
"da", // Danish
|
||||
"fi", // Finnish
|
||||
"no", // Norwegian
|
||||
"cs", // Czech
|
||||
"el", // Greek
|
||||
"ro", // Romanian
|
||||
] as const;
|
||||
|
||||
/**
|
||||
* Chatterbox TTS provider (premium tier).
|
||||
*
|
||||
* Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
|
||||
* The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
|
||||
* body parameters for its advanced features.
|
||||
*
|
||||
* @example
|
||||
* ```typescript
|
||||
* const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
|
||||
*
|
||||
* // Basic synthesis
|
||||
* const result = await provider.synthesize("Hello!");
|
||||
*
|
||||
* // Voice cloning with emotion
|
||||
* const clonedResult = await provider.synthesize("Hello!", {
|
||||
* referenceAudio: myAudioBuffer,
|
||||
* emotionExaggeration: 0.7,
|
||||
* });
|
||||
* ```
|
||||
*/
|
||||
export class ChatterboxTTSProvider extends BaseTTSProvider {
|
||||
readonly name = "chatterbox";
|
||||
readonly tier: SpeechTier = "premium";
|
||||
|
||||
/**
|
||||
* Languages supported for cross-language voice transfer.
|
||||
*/
|
||||
readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;
|
||||
|
||||
constructor(baseURL: string) {
|
||||
super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Synthesize text to audio with optional voice cloning and emotion control.
|
||||
*
|
||||
* Overrides the base synthesize() to support Chatterbox-specific options:
|
||||
* - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
|
||||
* - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
|
||||
*
|
||||
* These are passed as extra body parameters to the OpenAI-compatible endpoint,
|
||||
* which Chatterbox's API accepts alongside the standard parameters.
|
||||
*
|
||||
* @param text - Text to convert to speech
|
||||
* @param options - Synthesis options, optionally including Chatterbox-specific params
|
||||
* @returns Synthesis result with audio buffer and metadata
|
||||
* @throws {Error} If synthesis fails (e.g., GPU unavailable)
|
||||
*/
|
||||
async synthesize(
|
||||
text: string,
|
||||
options?: SynthesizeOptions | ChatterboxSynthesizeOptions
|
||||
): Promise<SynthesisResult> {
|
||||
const voice = options?.voice ?? this.defaultVoice;
|
||||
const format = options?.format ?? this.defaultFormat;
|
||||
const speed = options?.speed ?? DEFAULT_SPEED;
|
||||
|
||||
// Build the request body with standard OpenAI-compatible params
|
||||
const requestBody: Record<string, unknown> = {
|
||||
model: DEFAULT_MODEL,
|
||||
input: text,
|
||||
voice,
|
||||
response_format: format,
|
||||
speed,
|
||||
};
|
||||
|
||||
// Add Chatterbox-specific params if provided
|
||||
const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;
|
||||
|
||||
if (chatterboxOptions?.referenceAudio) {
|
||||
requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
|
||||
}
|
||||
|
||||
if (chatterboxOptions?.emotionExaggeration !== undefined) {
|
||||
// Clamp to valid range [0.0, 1.0]
|
||||
requestBody.exaggeration = Math.max(
|
||||
0.0,
|
||||
Math.min(1.0, chatterboxOptions.emotionExaggeration)
|
||||
);
|
||||
}
|
||||
|
||||
try {
|
||||
// Use the OpenAI SDK's create method, passing extra params
|
||||
// The OpenAI SDK allows additional body params to be passed through
|
||||
const response = await this.client.audio.speech.create(
|
||||
requestBody as unknown as SpeechCreateParams
|
||||
);
|
||||
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
const audio = Buffer.from(arrayBuffer);
|
||||
|
||||
return {
|
||||
audio,
|
||||
format,
|
||||
voice,
|
||||
tier: this.tier,
|
||||
};
|
||||
} catch (error: unknown) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
this.logger.error(`TTS synthesis failed: ${message}`);
|
||||
throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user