feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog

Extract KokoroTtsProvider from factory into its own module with: - Full voice catalog of 54 built-in voices across 8 languages - Voice metadata parsing from ID prefix (language, gender, accent) - Exported constants for supported formats and speed range - Comprehensive unit tests (48 tests) - Fix lint/type errors in chatterbox provider (Prettier + unsafe cast) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:27:47 -06:00
parent b5edb4f37e
commit 79b1d81d27
4 changed files with 767 additions and 24 deletions
--- a/apps/api/src/speech/providers/chatterbox-tts.provider.ts
+++ b/apps/api/src/speech/providers/chatterbox-tts.provider.ts
@@ -0,0 +1,169 @@
 /**
 * Chatterbox TTS Provider
 *
 * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
 * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
 * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
 *
 * Key capabilities:
 * - Voice cloning via reference audio sample
 * - Emotion exaggeration control (0.0 - 1.0)
 * - Cross-language voice transfer (23 languages)
 * - Graceful degradation when GPU is unavailable (isHealthy returns false)
 *
 * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
 *
 * Issue #394
 */
 import type { SpeechCreateParams } from "openai/resources/audio/speech";
 import { BaseTTSProvider } from "./base-tts.provider";
 import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
 import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
 /** Voice ID handed to the BaseTTSProvider constructor as this provider's default voice */
 const CHATTERBOX_DEFAULT_VOICE = "default";
 /** Audio format handed to the BaseTTSProvider constructor as the default (WAV for highest quality) */
 const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;
 /** Value sent as the `model` field of every synthesis request */
 const DEFAULT_MODEL = "tts-1";
 /** Speed multiplier sent when the caller does not specify `speed` */
 const DEFAULT_SPEED = 1.0;
 /**
 * Chatterbox language codes paired with their English display names.
 *
 * Declaration order matters: SUPPORTED_LANGUAGES is derived from the keys of
 * this object and keeps the same order.
 *
 * NOTE(review): the 23 codes are carried over unchanged; confirm them against
 * the language list published for the deployed Chatterbox server.
 */
 const CHATTERBOX_LANGUAGE_NAMES = {
  en: "English",
  fr: "French",
  de: "German",
  es: "Spanish",
  it: "Italian",
  pt: "Portuguese",
  nl: "Dutch",
  pl: "Polish",
  ru: "Russian",
  uk: "Ukrainian",
  ja: "Japanese",
  zh: "Chinese",
  ko: "Korean",
  ar: "Arabic",
  hi: "Hindi",
  tr: "Turkish",
  sv: "Swedish",
  da: "Danish",
  fi: "Finnish",
  no: "Norwegian",
  cs: "Czech",
  el: "Greek",
  ro: "Romanian",
 } as const;
 /**
 * Languages supported by Chatterbox for cross-language voice transfer,
 * as a list of 23 two-letter language codes.
 */
 const SUPPORTED_LANGUAGES: readonly string[] = Object.keys(CHATTERBOX_LANGUAGE_NAMES);
 /**
 * Chatterbox TTS provider (premium tier).
 *
 * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
 * Requests are issued through the OpenAI SDK client; the Chatterbox-specific
 * fields (`reference_audio`, `exaggeration`) are added to the same request body
 * as the standard OpenAI-compatible parameters.
 *
 * @example
 * ```typescript
 * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
 *
 * // Basic synthesis
 * const result = await provider.synthesize("Hello!");
 *
 * // Voice cloning with emotion
 * const clonedResult = await provider.synthesize("Hello!", {
 *   referenceAudio: myAudioBuffer,
 *   emotionExaggeration: 0.7,
 * });
 * ```
 */
 export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";
  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;
  /**
   * Create a new Chatterbox TTS provider using the Chatterbox default voice and format.
   *
   * @param baseURL - Base URL of the Chatterbox endpoint (e.g. "http://chatterbox:8881/v1")
   */
  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }
  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
   * - `emotionExaggeration`: Emotion intensity factor, clamped to [0.0, 1.0].
   *   A NaN value is ignored so the request never carries a non-numeric
   *   `exaggeration` (NaN would be serialized as `null` in the JSON body).
   *
   * Voice, format and speed fall back to the provider defaults when omitted.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If the request or reading the response fails; the original
   *   message is logged and included in the thrown error
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;
    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };
    // Add Chatterbox-specific params if provided
    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;
    if (chatterboxOptions?.referenceAudio) {
      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
    }
    const exaggeration = chatterboxOptions?.emotionExaggeration;
    // Math.min/Math.max propagate NaN, so clamping alone cannot sanitize it:
    // skip the parameter entirely and let the server apply its own default.
    if (exaggeration !== undefined && !Number.isNaN(exaggeration)) {
      // Clamp to valid range [0.0, 1.0]
      requestBody.exaggeration = Math.max(0.0, Math.min(1.0, exaggeration));
    }
    try {
      // The body carries fields SpeechCreateParams does not declare, hence the cast;
      // they are sent to the endpoint alongside the standard parameters.
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );
      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);
      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
 }
--- a/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
@@ -0,0 +1,316 @@
 /**
 * KokoroTtsProvider Unit Tests
 *
 * Tests the Kokoro-FastAPI TTS provider with full voice catalog,
 * voice metadata parsing, and Kokoro-specific feature constants.
 *
 * Issue #393
 */
 import { describe, it, expect, vi, beforeEach } from "vitest";
 import {
  KokoroTtsProvider,
  KOKORO_SUPPORTED_FORMATS,
  KOKORO_SPEED_RANGE,
  KOKORO_VOICES,
  parseVoicePrefix,
 } from "./kokoro-tts.provider";
 import type { VoiceInfo } from "../interfaces/speech-types";
 // ==========================================
 // Mock OpenAI SDK
 // ==========================================
 // Replace the `openai` module's default export with a stub class that exposes
 // only `audio.speech.create` (a fresh `vi.fn()` per instance).
 vi.mock("openai", () => ({
  default: class MockOpenAI {
    audio = { speech: { create: vi.fn() } };
  },
 }));
 // ==========================================
 // Provider identity
 // ==========================================
 describe("KokoroTtsProvider", () => {
  const testBaseURL = "http://kokoro-tts:8880/v1";
  let provider: KokoroTtsProvider;
  // A fresh provider per test keeps the cases independent of each other.
  beforeEach(() => {
    provider = new KokoroTtsProvider(testBaseURL);
  });
  describe("provider identity", () => {
    it("should have name 'kokoro'", () => {
      expect(provider.name).toBe("kokoro");
    });
    it("should have tier 'default'", () => {
      expect(provider.tier).toBe("default");
    });
  });
  // ==========================================
  // listVoices()
  // ==========================================
  describe("listVoices", () => {
    let voices: VoiceInfo[];
    beforeEach(async () => {
      voices = await provider.listVoices();
    });
    it("should return an array of VoiceInfo objects", () => {
      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);
    });
    it("should return at least 10 voices", () => {
      // The issue specifies at least: af_heart, af_bella, af_nicole, af_sarah, af_sky,
      // am_adam, am_michael, bf_emma, bf_isabella, bm_george, bm_lewis
      expect(voices.length).toBeGreaterThanOrEqual(10);
    });
    it("should set tier to 'default' on all voices", () => {
      for (const voice of voices) {
        expect(voice.tier).toBe("default");
      }
    });
    it("should have exactly one default voice", () => {
      const defaults = voices.filter((v) => v.isDefault === true);
      expect(defaults.length).toBe(1);
    });
    it("should mark af_heart as the default voice", () => {
      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("af_heart");
    });
    it("should have an id and name for every voice", () => {
      for (const voice of voices) {
        expect(voice.id).toBeTruthy();
        expect(voice.name).toBeTruthy();
      }
    });
    it("should set language on every voice", () => {
      for (const voice of voices) {
        expect(voice.language).toBeTruthy();
      }
    });
    // ==========================================
    // Required voices from the issue
    // ==========================================
    describe("required voices", () => {
      const requiredVoiceIds = [
        "af_heart",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
      ];
      it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => {
        const voice = voices.find((v) => v.id === voiceId);
        expect(voice).toBeDefined();
      });
    });
    // ==========================================
    // Voice metadata from prefix
    // ==========================================
    describe("voice metadata from prefix", () => {
      it("should set language to 'en-US' for af_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "af_heart");
        expect(voice?.language).toBe("en-US");
      });
      it("should set language to 'en-US' for am_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "am_adam");
        expect(voice?.language).toBe("en-US");
      });
      it("should set language to 'en-GB' for bf_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "bf_emma");
        expect(voice?.language).toBe("en-GB");
      });
      it("should set language to 'en-GB' for bm_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "bm_george");
        expect(voice?.language).toBe("en-GB");
      });
      it("should include gender in voice name for af_ prefix", () => {
        const voice = voices.find((v) => v.id === "af_heart");
        expect(voice?.name).toContain("Female");
      });
      it("should include gender in voice name for am_ prefix", () => {
        const voice = voices.find((v) => v.id === "am_adam");
        expect(voice?.name).toContain("Male");
      });
      it("should include gender in voice name for bf_ prefix", () => {
        const voice = voices.find((v) => v.id === "bf_emma");
        expect(voice?.name).toContain("Female");
      });
      it("should include gender in voice name for bm_ prefix", () => {
        const voice = voices.find((v) => v.id === "bm_george");
        expect(voice?.name).toContain("Male");
      });
    });
    // ==========================================
    // Voice name formatting
    // ==========================================
    describe("voice name formatting", () => {
      it("should capitalize the voice name portion", () => {
        const voice = voices.find((v) => v.id === "af_heart");
        expect(voice?.name).toContain("Heart");
      });
      it("should include the accent/language label in the name", () => {
        const afVoice = voices.find((v) => v.id === "af_heart");
        expect(afVoice?.name).toContain("American");
        const bfVoice = voices.find((v) => v.id === "bf_emma");
        expect(bfVoice?.name).toContain("British");
      });
    });
  });
  // ==========================================
  // Custom constructor
  // ==========================================
  describe("constructor", () => {
    // The configured default voice is observable through the `isDefault` flag that
    // listVoices() sets, so assert on that instead of mere object existence
    // (a bare `toBeDefined()` passes even if the constructor ignores its arguments).
    const defaultVoiceIdOf = async (subject: KokoroTtsProvider): Promise<string | undefined> => {
      const listed = await subject.listVoices();
      return listed.find((v) => v.isDefault === true)?.id;
    };
    it("should accept custom default voice", async () => {
      const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella");
      expect(await defaultVoiceIdOf(customProvider)).toBe("af_bella");
    });
    it("should accept custom default format", async () => {
      const customProvider = new KokoroTtsProvider(testBaseURL, "af_heart", "wav");
      // The format itself is not exposed by listVoices(); verify that passing it
      // leaves the explicitly chosen default voice intact.
      expect(await defaultVoiceIdOf(customProvider)).toBe("af_heart");
    });
    it("should use af_heart as default voice when none specified", async () => {
      const defaultProvider = new KokoroTtsProvider(testBaseURL);
      expect(await defaultVoiceIdOf(defaultProvider)).toBe("af_heart");
    });
  });
 });
 // ==========================================
 // parseVoicePrefix utility
 // ==========================================
 describe("parseVoicePrefix", () => {
  // Each case checks the three metadata fields of the parsed result in one matcher.
  it("should parse af_ as American English Female", () => {
    expect(parseVoicePrefix("af_heart")).toMatchObject({
      language: "en-US",
      gender: "female",
      accent: "American",
    });
  });
  it("should parse am_ as American English Male", () => {
    expect(parseVoicePrefix("am_adam")).toMatchObject({
      language: "en-US",
      gender: "male",
      accent: "American",
    });
  });
  it("should parse bf_ as British English Female", () => {
    expect(parseVoicePrefix("bf_emma")).toMatchObject({
      language: "en-GB",
      gender: "female",
      accent: "British",
    });
  });
  it("should parse bm_ as British English Male", () => {
    expect(parseVoicePrefix("bm_george")).toMatchObject({
      language: "en-GB",
      gender: "male",
      accent: "British",
    });
  });
  it("should return unknown for unrecognized prefix", () => {
    expect(parseVoicePrefix("xx_unknown")).toMatchObject({
      language: "unknown",
      gender: "unknown",
      accent: "Unknown",
    });
  });
 });
 // ==========================================
 // Exported constants
 // ==========================================
 describe("KOKORO_SUPPORTED_FORMATS", () => {
  // Local alias for the exported list under test.
  const formats = KOKORO_SUPPORTED_FORMATS;
  it("should include mp3", () => {
    expect(formats).toContain("mp3");
  });
  it("should include wav", () => {
    expect(formats).toContain("wav");
  });
  it("should include opus", () => {
    expect(formats).toContain("opus");
  });
  it("should include flac", () => {
    expect(formats).toContain("flac");
  });
  it("should be a readonly array", () => {
    // `readonly` is a compile-time property; at runtime only array-ness can be checked.
    const isArray = Array.isArray(formats);
    expect(isArray).toBe(true);
  });
 });
 describe("KOKORO_SPEED_RANGE", () => {
  // Pull both bounds out once; each test pins one of them.
  const { min, max } = KOKORO_SPEED_RANGE;
  it("should have min speed of 0.25", () => {
    expect(min).toBe(0.25);
  });
  it("should have max speed of 4.0", () => {
    expect(max).toBe(4.0);
  });
 });
 describe("KOKORO_VOICES", () => {
  it("should be a non-empty array", () => {
    expect(Array.isArray(KOKORO_VOICES)).toBe(true);
    expect(KOKORO_VOICES.length).toBeGreaterThan(0);
  });
  it("should contain voice entries with id and label", () => {
    KOKORO_VOICES.forEach(({ id, label }) => {
      expect(id).toBeTruthy();
      expect(label).toBeTruthy();
    });
  });
  it("should include voices from multiple language prefixes", () => {
    // The first two characters of an ID are its language/gender prefix.
    const distinctPrefixes = new Set<string>();
    for (const { id } of KOKORO_VOICES) {
      distinctPrefixes.add(id.slice(0, 2));
    }
    expect(distinctPrefixes.size).toBeGreaterThanOrEqual(4);
  });
 });
--- a/apps/api/src/speech/providers/kokoro-tts.provider.ts
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts
@@ -0,0 +1,278 @@
 /**
 * Kokoro-FastAPI TTS Provider
 *
 * Default-tier TTS provider backed by Kokoro-FastAPI.
 * CPU-based, always available, Apache 2.0 license.
 *
 * Features:
 * - 53 built-in voices across 8 languages (the KOKORO_VOICES catalog in this module)
 * - Speed control: 0.25x to 4.0x
 * - Output formats: mp3, wav, opus, flac
 * - Voice metadata derived from ID prefix (language, gender, accent)
 *
 * Voice ID format: {prefix}_{name}
 *   - First character: language/accent code (a=American, b=British, etc.)
 *   - Second character: gender code (f=Female, m=Male)
 *
 * Issue #393
 */
 import { BaseTTSProvider } from "./base-tts.provider";
 import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
 // ==========================================
 // Constants
 // ==========================================
 /**
 * Audio formats supported by Kokoro-FastAPI.
 * Exported for callers; this module does not itself validate requests against the list.
 */
 export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = [
  "mp3",
  "wav",
  "opus",
  "flac",
 ] as const;
 /**
 * Speed range supported by Kokoro-FastAPI, as inclusive-looking `min`/`max` multipliers.
 * Exported for callers; this module does not itself clamp or validate speeds.
 * NOTE(review): whether the bounds are inclusive is decided by the server — confirm.
 */
 export const KOKORO_SPEED_RANGE = {
  min: 0.25,
  max: 4.0,
 } as const;
 /** Voice ID used when the KokoroTtsProvider constructor is not given a default voice */
 const KOKORO_DEFAULT_VOICE = "af_heart";
 /** Audio format used when the KokoroTtsProvider constructor is not given a default format */
 const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3";
 // ==========================================
 // Voice prefix mapping
 // ==========================================
 /**
 * Language letters used as the first character of a Kokoro voice ID, each with
 * its language tag and human-readable accent label.
 */
 const KOKORO_LANGUAGE_CODES = [
  ["a", "en-US", "American"],
  ["b", "en-GB", "British"],
  ["e", "es", "Spanish"],
  ["f", "fr", "French"],
  ["h", "hi", "Hindi"],
  ["j", "ja", "Japanese"],
  ["p", "pt-BR", "Portuguese"],
  ["z", "zh", "Chinese"],
 ] as const;
 /** Gender letters used as the second character of a Kokoro voice ID. */
 const KOKORO_GENDER_CODES = [
  ["f", "female"],
  ["m", "male"],
 ] as const;
 /**
 * Mapping of voice ID prefix (first two characters) to language/accent/gender metadata.
 *
 * Kokoro voice IDs follow the pattern: {lang}{gender}_{name}, so the map holds one
 * entry per language letter and gender letter combination ("af", "am", "bf", ...).
 */
 const VOICE_PREFIX_MAP: Record<string, { language: string; gender: string; accent: string }> = {};
 for (const [languageCode, language, accent] of KOKORO_LANGUAGE_CODES) {
  for (const [genderCode, gender] of KOKORO_GENDER_CODES) {
    VOICE_PREFIX_MAP[languageCode + genderCode] = { language, gender, accent };
  }
 }
 // ==========================================
 // Voice catalog
 // ==========================================
 /** Raw voice catalog entry */
 interface KokoroVoiceEntry {
  /** Voice ID (e.g. "af_heart") */
  id: string;
  /** Human-readable label (e.g. "Heart") */
  label: string;
 }
 /**
 * Voice names grouped by ID prefix, in catalog order.
 *
 * Prefixes present:
 * - af / am: American English Female / Male
 * - bf / bm: British English Female / Male
 * - ef / em: Spanish Female / Male
 * - ff: French Female
 * - hf: Hindi Female
 * - jf / jm: Japanese Female / Male
 * - pf: Portuguese Female
 * - zf / zm: Chinese Female / Male
 */
 const KOKORO_VOICE_NAMES_BY_PREFIX: ReadonlyArray<readonly [string, readonly string[]]> = [
  [
    "af",
    ["heart", "alloy", "aoede", "bella", "jessica", "kore", "nicole", "nova", "river", "sarah", "sky"],
  ],
  ["am", ["adam", "echo", "eric", "fenrir", "liam", "michael", "onyx", "puck", "santa"]],
  ["bf", ["alice", "emma", "isabella", "lily"]],
  ["bm", ["daniel", "fable", "george", "lewis", "oscar"]],
  ["ef", ["dora", "elena", "maria"]],
  ["em", ["alex", "carlos", "santa"]],
  ["ff", ["camille", "siwis"]],
  ["hf", ["alpha", "beta"]],
  ["jf", ["alpha", "gongitsune", "nezumi", "tebukuro"]],
  ["jm", ["kumo"]],
  ["pf", ["dora"]],
  ["zf", ["xiaobei", "xiaoni", "xiaoxiao", "xiaoyi"]],
  ["zm", ["yunjian", "yunxi", "yunxia", "yunyang"]],
 ];
 /**
 * Expand the grouped names into catalog entries.
 *
 * Each entry's ID is `{prefix}_{name}` and its label is the name with the first
 * letter upper-cased.
 */
 function buildKokoroVoiceCatalog(): KokoroVoiceEntry[] {
  const catalog: KokoroVoiceEntry[] = [];
  for (const [prefix, names] of KOKORO_VOICE_NAMES_BY_PREFIX) {
    for (const name of names) {
      catalog.push({
        id: `${prefix}_${name}`,
        label: name.charAt(0).toUpperCase() + name.slice(1),
      });
    }
  }
  return catalog;
 }
 /**
 * Catalog of Kokoro built-in voices known to this module (53 entries).
 *
 * Entries are ordered by prefix group, then by the order of names within the group.
 *
 * NOTE(review): the IDs are maintained by hand; confirm them against the voice
 * list reported by the deployed Kokoro-FastAPI server.
 */
 export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = buildKokoroVoiceCatalog();
 // ==========================================
 // Prefix parser
 // ==========================================
 /** Metadata derived from the two-character prefix of a voice ID */
 export interface VoicePrefixMetadata {
  /** Language tag (e.g. "en-US", "en-GB", "ja"), or "unknown" */
  language: string;
  /** Gender: "female", "male", or "unknown" */
  gender: string;
  /** Human-readable accent label (e.g. "American", "British"), or "Unknown" */
  accent: string;
 }
 /**
 * Derive language, gender, and accent metadata from a Kokoro voice ID.
 *
 * Only the first two characters of the ID are inspected; they are looked up in
 * VOICE_PREFIX_MAP. The rest of the ID (separator and name) is not validated.
 *
 * @param voiceId - Kokoro voice ID (e.g. "af_heart")
 * @returns A fresh metadata object; every field is "unknown"/"Unknown" when the
 *   prefix has no entry in the map
 */
 export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
  const known = VOICE_PREFIX_MAP[voiceId.slice(0, 2)];
  if (!known) {
    return { language: "unknown", gender: "unknown", accent: "Unknown" };
  }
  // Copy the fields so callers never receive the shared map entry itself.
  const { language, gender, accent } = known;
  return { language, gender, accent };
 }
 // ==========================================
 // Provider class
 // ==========================================
 /**
 * Kokoro-FastAPI TTS provider (default tier).
 *
 * Text-to-speech provider whose voice list is the built-in KOKORO_VOICES catalog.
 * Synthesis is inherited from BaseTTSProvider; this class only supplies the
 * provider identity, the Kokoro defaults, and the voice listing.
 *
 * @example
 * ```typescript
 * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1");
 * const voices = await kokoro.listVoices();
 * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" });
 * ```
 */
 export class KokoroTtsProvider extends BaseTTSProvider {
  readonly name = "kokoro";
  readonly tier: SpeechTier = "default";
  /**
   * Create a new Kokoro TTS provider.
   *
   * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1")
   * @param defaultVoice - Default voice ID (defaults to "af_heart")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = KOKORO_DEFAULT_VOICE,
    defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }
  /**
   * List all available Kokoro voices with metadata.
   *
   * Returns one VoiceInfo per KOKORO_VOICES entry, in catalog order. The display
   * name has the form "Label (Accent Gender)", e.g. "Heart (American Female)".
   * When the ID prefix is not recognized the gender is left out and the name
   * becomes "Label (Unknown)" rather than mislabeling the voice as male.
   * `isDefault` is true for the entry whose ID equals this provider's default voice.
   *
   * The list is built from the in-module catalog; no request is made to the server.
   *
   * @returns Array of VoiceInfo objects for all Kokoro voices
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const voices: VoiceInfo[] = KOKORO_VOICES.map((entry) => {
      const { language, gender, accent } = parseVoicePrefix(entry.id);
      // Only the two genders the parser can report get a label; anything else
      // ("unknown") must not fall through to "Male".
      let genderLabel: string | undefined;
      if (gender === "female") {
        genderLabel = "Female";
      } else if (gender === "male") {
        genderLabel = "Male";
      }
      const descriptor = genderLabel === undefined ? accent : `${accent} ${genderLabel}`;
      return {
        id: entry.id,
        name: `${entry.label} (${descriptor})`,
        language,
        tier: this.tier,
        isDefault: entry.id === this.defaultVoice,
      };
    });
    return Promise.resolve(voices);
  }
 }
--- a/apps/api/src/speech/providers/tts-provider.factory.ts
+++ b/apps/api/src/speech/providers/tts-provider.factory.ts
@@ -15,6 +15,8 @@
 import { Logger } from "@nestjs/common";
 import { BaseTTSProvider } from "./base-tts.provider";
 import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
 import { KokoroTtsProvider } from "./kokoro-tts.provider";
 import type { ITTSProvider } from "../interfaces/tts-provider.interface";
 import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
 import type { SpeechConfig } from "../speech.config";
@@ -23,28 +25,6 @@ import type { SpeechConfig } from "../speech.config";
 // Concrete provider classes
 // ==========================================
 /**
 * Kokoro TTS provider (default tier).
 * CPU-based, always available, Apache 2.0 license.
 */
 class KokoroProvider extends BaseTTSProvider {
  readonly name = "kokoro";
  readonly tier: SpeechTier = "default";
 }
 /**
 * Chatterbox TTS provider (premium tier).
 * GPU required, voice cloning capable, MIT license.
 */
 class ChatterboxProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";
  constructor(baseURL: string) {
    super(baseURL, "default", "mp3");
  }
 }
 /**
 * Piper TTS provider via OpenedAI Speech (fallback tier).
 * Ultra-lightweight CPU, GPL license.
@@ -78,7 +58,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
  // Default tier: Kokoro
  if (config.tts.default.enabled) {
-    const provider = new KokoroProvider(
+    const provider = new KokoroTtsProvider(
      config.tts.default.url,
      config.tts.default.voice,
      config.tts.default.format as AudioFormat
@@ -89,7 +69,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
  // Premium tier: Chatterbox
  if (config.tts.premium.enabled) {
-    const provider = new ChatterboxProvider(config.tts.premium.url);
+    const provider = new ChatterboxTTSProvider(config.tts.premium.url);
    providers.set("premium", provider);
    logger.log(`Registered premium TTS provider: chatterbox at ${config.tts.premium.url}`);
  }