Files
stack/apps/api/src/speech/providers/chatterbox-tts.provider.ts
Jason Woltje 79b1d81d27
Some checks failed
ci/woodpecker/push/api Pipeline failed
feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog
Extract KokoroTtsProvider from factory into its own module with:
- Full voice catalog of 54 built-in voices across 8 languages
- Voice metadata parsing from ID prefix (language, gender, accent)
- Exported constants for supported formats and speed range
- Comprehensive unit tests (48 tests)
- Fix lint/type errors in chatterbox provider (Prettier + unsafe cast)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:27:47 -06:00

170 lines
5.3 KiB
TypeScript

/**
* Chatterbox TTS Provider
*
* Premium-tier TTS provider with voice cloning and emotion exaggeration support.
* Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
* parameters for voice cloning (reference_audio) and emotion control (exaggeration).
*
* Key capabilities:
* - Voice cloning via reference audio sample
* - Emotion exaggeration control (0.0 - 1.0)
* - Cross-language voice transfer (23 languages)
* - Graceful degradation when GPU is unavailable (isHealthy returns false)
*
* The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
*
* Issue #394
*/
import type { SpeechCreateParams } from "openai/resources/audio/speech";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
/** Voice identifier used when the caller does not select one. */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Audio container used by default — WAV, the lossless/highest-quality option. */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Model name sent to the OpenAI-compatible endpoint. */
const DEFAULT_MODEL = "tts-1";

/** Playback-speed multiplier applied when none is requested (1.0 = normal). */
const DEFAULT_SPEED = 1.0;
/**
 * Languages supported by Chatterbox for cross-language voice transfer.
 * Chatterbox supports 23 languages for voice cloning and synthesis.
 *
 * Note: `satisfies readonly string[]` validates the shape without widening,
 * so the element type stays the literal union (the previous explicit
 * `readonly string[]` annotation widened the variable and made the trailing
 * `as const` ineffective for consumers).
 */
const SUPPORTED_LANGUAGES = [
  "en", // English
  "fr", // French
  "de", // German
  "es", // Spanish
  "it", // Italian
  "pt", // Portuguese
  "nl", // Dutch
  "pl", // Polish
  "ru", // Russian
  "uk", // Ukrainian
  "ja", // Japanese
  "zh", // Chinese
  "ko", // Korean
  "ar", // Arabic
  "hi", // Hindi
  "tr", // Turkish
  "sv", // Swedish
  "da", // Danish
  "fi", // Finnish
  "no", // Norwegian
  "cs", // Czech
  "el", // Greek
  "ro", // Romanian
] as const satisfies readonly string[];
/**
* Chatterbox TTS provider (premium tier).
*
* Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
* The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
* body parameters for its advanced features.
*
* @example
* ```typescript
* const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
*
* // Basic synthesis
* const result = await provider.synthesize("Hello!");
*
* // Voice cloning with emotion
* const clonedResult = await provider.synthesize("Hello!", {
* referenceAudio: myAudioBuffer,
* emotionExaggeration: 0.7,
* });
* ```
*/
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
   * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
   *
   * Chatterbox-specific options are detected with `in`-operator narrowing
   * rather than an unchecked `as` cast, so the compiler verifies the
   * properties actually exist on the narrowed option type.
   *
   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
   * which Chatterbox's API accepts alongside the standard parameters.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Add Chatterbox-specific params if provided. `in` checks narrow the
    // union to ChatterboxSynthesizeOptions, replacing the previous unsafe
    // `options as ChatterboxSynthesizeOptions` assertion.
    if (options && "referenceAudio" in options && options.referenceAudio) {
      requestBody.reference_audio = options.referenceAudio.toString("base64");
    }
    if (
      options &&
      "emotionExaggeration" in options &&
      options.emotionExaggeration !== undefined
    ) {
      // Clamp to valid range [0.0, 1.0]
      requestBody.exaggeration = Math.max(
        0.0,
        Math.min(1.0, options.emotionExaggeration)
      );
    }

    try {
      // Use the OpenAI SDK's create method, passing extra params.
      // SpeechCreateParams does not declare the Chatterbox extras, so the
      // double cast is deliberate: the SDK passes unknown body fields through.
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );
      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);
      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}