From 79b1d81d27aafa93cd6ae0e9ceadda33477dc0e1 Mon Sep 17 00:00:00 2001
From: Jason Woltje
Date: Sun, 15 Feb 2026 02:27:47 -0600
Subject: [PATCH] feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog

Extract KokoroTtsProvider from factory into its own module with:
- Full voice catalog of 53 built-in voices across 8 languages
- Voice metadata parsing from ID prefix (language, gender, accent)
- Exported constants for supported formats and speed range
- Comprehensive unit tests (48 tests)
- Fix lint/type errors in chatterbox provider (Prettier + unsafe cast)

Co-Authored-By: Claude Opus 4.6
---
 .../providers/chatterbox-tts.provider.ts      | 169 ++++++++++
 .../providers/kokoro-tts.provider.spec.ts     | 316 ++++++++++++++++++
 .../speech/providers/kokoro-tts.provider.ts   | 278 +++++++++++++++
 .../speech/providers/tts-provider.factory.ts  |  28 +-
 4 files changed, 767 insertions(+), 24 deletions(-)
 create mode 100644 apps/api/src/speech/providers/chatterbox-tts.provider.ts
 create mode 100644 apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
 create mode 100644 apps/api/src/speech/providers/kokoro-tts.provider.ts

diff --git a/apps/api/src/speech/providers/chatterbox-tts.provider.ts b/apps/api/src/speech/providers/chatterbox-tts.provider.ts
new file mode 100644
index 0000000..c17c060
--- /dev/null
+++ b/apps/api/src/speech/providers/chatterbox-tts.provider.ts
@@ -0,0 +1,169 @@
+/**
+ * Chatterbox TTS Provider
+ *
+ * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
+ * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
+ * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
+ *
+ * Key capabilities:
+ * - Voice cloning via reference audio sample
+ * - Emotion exaggeration control (0.0 - 1.0)
+ * - Cross-language voice transfer (23 languages)
+ * - Graceful degradation when GPU is unavailable (isHealthy returns false)
+ *
+ * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
+ *
+ * Issue #394
+ */
+
+import type { SpeechCreateParams } from "openai/resources/audio/speech";
+import { BaseTTSProvider } from "./base-tts.provider";
+import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
+import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
+
+/** Default voice for Chatterbox */
+const CHATTERBOX_DEFAULT_VOICE = "default";
+
+/** Default audio format for Chatterbox (WAV for highest quality) */
+const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;
+
+/** Default TTS model identifier */
+const DEFAULT_MODEL = "tts-1";
+
+/** Default speech speed multiplier */
+const DEFAULT_SPEED = 1.0;
+
+/**
+ * Languages supported by Chatterbox for cross-language voice transfer.
+ * Chatterbox supports 23 languages for voice cloning and synthesis.
+ */
+const SUPPORTED_LANGUAGES: readonly string[] = [
+  "en", // English
+  "fr", // French
+  "de", // German
+  "es", // Spanish
+  "it", // Italian
+  "pt", // Portuguese
+  "nl", // Dutch
+  "pl", // Polish
+  "ru", // Russian
+  "uk", // Ukrainian
+  "ja", // Japanese
+  "zh", // Chinese
+  "ko", // Korean
+  "ar", // Arabic
+  "hi", // Hindi
+  "tr", // Turkish
+  "sv", // Swedish
+  "da", // Danish
+  "fi", // Finnish
+  "no", // Norwegian
+  "cs", // Czech
+  "el", // Greek
+  "ro", // Romanian
+] as const;
+
+/**
+ * Chatterbox TTS provider (premium tier).
+ *
+ * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
+ * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
+ * body parameters for its advanced features.
+ *
+ * @example
+ * ```typescript
+ * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
+ *
+ * // Basic synthesis
+ * const result = await provider.synthesize("Hello!");
+ *
+ * // Voice cloning with emotion
+ * const clonedResult = await provider.synthesize("Hello!", {
+ *   referenceAudio: myAudioBuffer,
+ *   emotionExaggeration: 0.7,
+ * });
+ * ```
+ */
+export class ChatterboxTTSProvider extends BaseTTSProvider {
+  readonly name = "chatterbox";
+  readonly tier: SpeechTier = "premium";
+
+  /**
+   * Languages supported for cross-language voice transfer.
+   */
+  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;
+
+  constructor(baseURL: string) {
+    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
+  }
+
+  /**
+   * Synthesize text to audio with optional voice cloning and emotion control.
+   *
+   * Overrides the base synthesize() to support Chatterbox-specific options:
+   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
+   * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
+   *
+   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
+   * which Chatterbox's API accepts alongside the standard parameters.
+   *
+   * @param text - Text to convert to speech
+   * @param options - Synthesis options, optionally including Chatterbox-specific params
+   * @returns Synthesis result with audio buffer and metadata
+   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
+   */
+  async synthesize(
+    text: string,
+    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
+  ): Promise<SynthesisResult> {
+    const voice = options?.voice ?? this.defaultVoice;
+    const format = options?.format ?? this.defaultFormat;
+    const speed = options?.speed ?? DEFAULT_SPEED;
+
+    // Build the request body with standard OpenAI-compatible params
+    const requestBody: Record<string, unknown> = {
+      model: DEFAULT_MODEL,
+      input: text,
+      voice,
+      response_format: format,
+      speed,
+    };
+
+    // Add Chatterbox-specific params if provided
+    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;
+
+    if (chatterboxOptions?.referenceAudio) {
+      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
+    }
+
+    if (chatterboxOptions?.emotionExaggeration !== undefined) {
+      // Clamp to valid range [0.0, 1.0]
+      requestBody.exaggeration = Math.max(
+        0.0,
+        Math.min(1.0, chatterboxOptions.emotionExaggeration)
+      );
+    }
+
+    try {
+      // Use the OpenAI SDK's create method, passing extra params
+      // The OpenAI SDK allows additional body params to be passed through
+      const response = await this.client.audio.speech.create(
+        requestBody as unknown as SpeechCreateParams
+      );
+
+      const arrayBuffer = await response.arrayBuffer();
+      const audio = Buffer.from(arrayBuffer);
+
+      return {
+        audio,
+        format,
+        voice,
+        tier: this.tier,
+      };
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      this.logger.error(`TTS synthesis failed: ${message}`);
+      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
+    }
+  }
+}
diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
new file mode 100644
index 0000000..27c35dc
--- /dev/null
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
@@ -0,0 +1,316 @@
+/**
+ * KokoroTtsProvider Unit Tests
+ *
+ * Tests the Kokoro-FastAPI TTS provider with full voice catalog,
+ * voice metadata parsing, and Kokoro-specific feature constants.
+ *
+ * Issue #393
+ */
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import {
+  KokoroTtsProvider,
+  KOKORO_SUPPORTED_FORMATS,
+  KOKORO_SPEED_RANGE,
+  KOKORO_VOICES,
+  parseVoicePrefix,
+} from "./kokoro-tts.provider";
+import type { VoiceInfo } from "../interfaces/speech-types";
+
+// ==========================================
+// Mock OpenAI SDK
+// ==========================================
+
+vi.mock("openai", () => {
+  class MockOpenAI {
+    audio = {
+      speech: {
+        create: vi.fn(),
+      },
+    };
+  }
+  return { default: MockOpenAI };
+});
+
+// ==========================================
+// Provider identity
+// ==========================================
+
+describe("KokoroTtsProvider", () => {
+  const testBaseURL = "http://kokoro-tts:8880/v1";
+  let provider: KokoroTtsProvider;
+
+  beforeEach(() => {
+    provider = new KokoroTtsProvider(testBaseURL);
+  });
+
+  describe("provider identity", () => {
+    it("should have name 'kokoro'", () => {
+      expect(provider.name).toBe("kokoro");
+    });
+
+    it("should have tier 'default'", () => {
+      expect(provider.tier).toBe("default");
+    });
+  });
+
+  // ==========================================
+  // listVoices()
+  // ==========================================
+
+  describe("listVoices", () => {
+    let voices: VoiceInfo[];
+
+    beforeEach(async () => {
+      voices = await provider.listVoices();
+    });
+
+    it("should return an array of VoiceInfo objects", () => {
+      expect(voices).toBeInstanceOf(Array);
+      expect(voices.length).toBeGreaterThan(0);
+    });
+
+    it("should return at least 10 voices", () => {
+      // The issue specifies at least: af_heart, af_bella, af_nicole, af_sarah, af_sky,
+      // am_adam, am_michael, bf_emma, bf_isabella, bm_george, bm_lewis
+      expect(voices.length).toBeGreaterThanOrEqual(10);
+    });
+
+    it("should set tier to 'default' on all voices", () => {
+      for (const voice of voices) {
+        expect(voice.tier).toBe("default");
+      }
+    });
+
+    it("should have exactly one default voice", () => {
+      const defaults = voices.filter((v) => v.isDefault === true);
+      expect(defaults.length).toBe(1);
+    });
+
+    it("should mark af_heart as the default voice", () => {
+      const defaultVoice = voices.find((v) => v.isDefault === true);
+      expect(defaultVoice).toBeDefined();
+      expect(defaultVoice?.id).toBe("af_heart");
+    });
+
+    it("should have an id and name for every voice", () => {
+      for (const voice of voices) {
+        expect(voice.id).toBeTruthy();
+        expect(voice.name).toBeTruthy();
+      }
+    });
+
+    it("should set language on every voice", () => {
+      for (const voice of voices) {
+        expect(voice.language).toBeTruthy();
+      }
+    });
+
+    // ==========================================
+    // Required voices from the issue
+    // ==========================================
+
+    describe("required voices", () => {
+      const requiredVoiceIds = [
+        "af_heart",
+        "af_bella",
+        "af_nicole",
+        "af_sarah",
+        "af_sky",
+        "am_adam",
+        "am_michael",
+        "bf_emma",
+        "bf_isabella",
+        "bm_george",
+        "bm_lewis",
+      ];
+
+      it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => {
+        const voice = voices.find((v) => v.id === voiceId);
+        expect(voice).toBeDefined();
+      });
+    });
+
+    // ==========================================
+    // Voice metadata from prefix
+    // ==========================================
+
+    describe("voice metadata from prefix", () => {
+      it("should set language to 'en-US' for af_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "af_heart");
+        expect(voice?.language).toBe("en-US");
+      });
+
+      it("should set language to 'en-US' for am_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "am_adam");
+        expect(voice?.language).toBe("en-US");
+      });
+
+      it("should set language to 'en-GB' for bf_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "bf_emma");
+        expect(voice?.language).toBe("en-GB");
+      });
+
+      it("should set language to 'en-GB' for bm_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "bm_george");
+        expect(voice?.language).toBe("en-GB");
+      });
+
+      it("should include gender in voice name for af_ prefix", () => {
+        const voice = voices.find((v) => v.id === "af_heart");
+        expect(voice?.name).toContain("Female");
+      });
+
+      it("should include gender in voice name for am_ prefix", () => {
+        const voice = voices.find((v) => v.id === "am_adam");
+        expect(voice?.name).toContain("Male");
+      });
+
+      it("should include gender in voice name for bf_ prefix", () => {
+        const voice = voices.find((v) => v.id === "bf_emma");
+        expect(voice?.name).toContain("Female");
+      });
+
+      it("should include gender in voice name for bm_ prefix", () => {
+        const voice = voices.find((v) => v.id === "bm_george");
+        expect(voice?.name).toContain("Male");
+      });
+    });
+
+    // ==========================================
+    // Voice name formatting
+    // ==========================================
+
+    describe("voice name formatting", () => {
+      it("should capitalize the voice name portion", () => {
+        const voice = voices.find((v) => v.id === "af_heart");
+        expect(voice?.name).toContain("Heart");
+      });
+
+      it("should include the accent/language label in the name", () => {
+        const afVoice = voices.find((v) => v.id === "af_heart");
+        expect(afVoice?.name).toContain("American");
+
+        const bfVoice = voices.find((v) => v.id === "bf_emma");
+        expect(bfVoice?.name).toContain("British");
+      });
+    });
+  });
+
+  // ==========================================
+  // Custom constructor
+  // ==========================================
+
+  describe("constructor", () => {
+    it("should accept custom default voice", () => {
+      const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella");
+      expect(customProvider).toBeDefined();
+    });
+
+    it("should accept custom default format", () => {
+      const customProvider = new KokoroTtsProvider(testBaseURL, "af_heart", "wav");
+      expect(customProvider).toBeDefined();
+    });
+
+    it("should use af_heart as default voice when none specified", () => {
+      const defaultProvider = new KokoroTtsProvider(testBaseURL);
+      expect(defaultProvider).toBeDefined();
+    });
+  });
+});
+
+// ==========================================
+// parseVoicePrefix utility
+// ==========================================
+
+describe("parseVoicePrefix", () => {
+  it("should parse af_ as American English Female", () => {
+    const result = parseVoicePrefix("af_heart");
+    expect(result.language).toBe("en-US");
+    expect(result.gender).toBe("female");
+    expect(result.accent).toBe("American");
+  });
+
+  it("should parse am_ as American English Male", () => {
+    const result = parseVoicePrefix("am_adam");
+    expect(result.language).toBe("en-US");
+    expect(result.gender).toBe("male");
+    expect(result.accent).toBe("American");
+  });
+
+  it("should parse bf_ as British English Female", () => {
+    const result = parseVoicePrefix("bf_emma");
+    expect(result.language).toBe("en-GB");
+    expect(result.gender).toBe("female");
+    expect(result.accent).toBe("British");
+  });
+
+  it("should parse bm_ as British English Male", () => {
+    const result = parseVoicePrefix("bm_george");
+    expect(result.language).toBe("en-GB");
+    expect(result.gender).toBe("male");
+    expect(result.accent).toBe("British");
+  });
+
+  it("should return unknown for unrecognized prefix", () => {
+    const result = parseVoicePrefix("xx_unknown");
+    expect(result.language).toBe("unknown");
+    expect(result.gender).toBe("unknown");
+    expect(result.accent).toBe("Unknown");
+  });
+});
+
+// ==========================================
+// Exported constants
+// ==========================================
+
+describe("KOKORO_SUPPORTED_FORMATS", () => {
+  it("should include mp3", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("mp3");
+  });
+
+  it("should include wav", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("wav");
+  });
+
+  it("should include opus", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("opus");
+  });
+
+  it("should include flac", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("flac");
+  });
+
+  it("should be a readonly array", () => {
+    expect(Array.isArray(KOKORO_SUPPORTED_FORMATS)).toBe(true);
+  });
+});
+
+describe("KOKORO_SPEED_RANGE", () => {
+  it("should have min speed of 0.25", () => {
+    expect(KOKORO_SPEED_RANGE.min).toBe(0.25);
+  });
+
+  it("should have max speed of 4.0", () => {
+    expect(KOKORO_SPEED_RANGE.max).toBe(4.0);
+  });
+});
+
+describe("KOKORO_VOICES", () => {
+  it("should be a non-empty array", () => {
+    expect(Array.isArray(KOKORO_VOICES)).toBe(true);
+    expect(KOKORO_VOICES.length).toBeGreaterThan(0);
+  });
+
+  it("should contain voice entries with id and label", () => {
+    for (const voice of KOKORO_VOICES) {
+      expect(voice.id).toBeTruthy();
+      expect(voice.label).toBeTruthy();
+    }
+  });
+
+  it("should include voices from multiple language prefixes", () => {
+    const prefixes = new Set(KOKORO_VOICES.map((v) => v.id.substring(0, 2)));
+    expect(prefixes.size).toBeGreaterThanOrEqual(4);
+  });
+});
diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.ts b/apps/api/src/speech/providers/kokoro-tts.provider.ts
new file mode 100644
index 0000000..ac1b7d3
--- /dev/null
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts
@@ -0,0 +1,278 @@
+/**
+ * Kokoro-FastAPI TTS Provider
+ *
+ * Default-tier TTS provider backed by Kokoro-FastAPI.
+ * CPU-based, always available, Apache 2.0 license.
+ *
+ * Features:
+ * - 53 built-in voices across 8 languages
+ * - Speed control: 0.25x to 4.0x
+ * - Output formats: mp3, wav, opus, flac
+ * - Voice metadata derived from ID prefix (language, gender, accent)
+ *
+ * Voice ID format: {prefix}_{name}
+ * - First character: language/accent code (a=American, b=British, etc.)
+ * - Second character: gender code (f=Female, m=Male)
+ *
+ * Issue #393
+ */
+
+import { BaseTTSProvider } from "./base-tts.provider";
+import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
+
+// ==========================================
+// Constants
+// ==========================================
+
+/** Audio formats supported by Kokoro-FastAPI */
+export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = [
+  "mp3",
+  "wav",
+  "opus",
+  "flac",
+] as const;
+
+/** Speed range supported by Kokoro-FastAPI */
+export const KOKORO_SPEED_RANGE = {
+  min: 0.25,
+  max: 4.0,
+} as const;
+
+/** Default voice for Kokoro */
+const KOKORO_DEFAULT_VOICE = "af_heart";
+
+/** Default audio format for Kokoro */
+const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3";
+
+// ==========================================
+// Voice prefix mapping
+// ==========================================
+
+/**
+ * Mapping of voice ID prefix (first two characters) to language/accent/gender metadata.
+ *
+ * Kokoro voice IDs follow the pattern: {lang}{gender}_{name}
+ * - lang: a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese
+ * - gender: f=Female, m=Male
+ */
+const VOICE_PREFIX_MAP: Record<string, { language: string; gender: string; accent: string }> = {
+  af: { language: "en-US", gender: "female", accent: "American" },
+  am: { language: "en-US", gender: "male", accent: "American" },
+  bf: { language: "en-GB", gender: "female", accent: "British" },
+  bm: { language: "en-GB", gender: "male", accent: "British" },
+  ef: { language: "es", gender: "female", accent: "Spanish" },
+  em: { language: "es", gender: "male", accent: "Spanish" },
+  ff: { language: "fr", gender: "female", accent: "French" },
+  fm: { language: "fr", gender: "male", accent: "French" },
+  hf: { language: "hi", gender: "female", accent: "Hindi" },
+  hm: { language: "hi", gender: "male", accent: "Hindi" },
+  jf: { language: "ja", gender: "female", accent: "Japanese" },
+  jm: { language: "ja", gender: "male", accent: "Japanese" },
+  pf: { language: "pt-BR", gender: "female", accent: "Portuguese" },
+  pm: { language: "pt-BR", gender: "male", accent: "Portuguese" },
+  zf: { language: "zh", gender: "female", accent: "Chinese" },
+  zm: { language: "zh", gender: "male", accent: "Chinese" },
+};
+
+// ==========================================
+// Voice catalog
+// ==========================================
+
+/** Raw voice catalog entry */
+interface KokoroVoiceEntry {
+  /** Voice ID (e.g. "af_heart") */
+  id: string;
+  /** Human-readable label (e.g. "Heart") */
+  label: string;
+}
+
+/**
+ * Complete catalog of Kokoro built-in voices.
+ *
+ * Organized by language/accent prefix:
+ * - af_: American English Female
+ * - am_: American English Male
+ * - bf_: British English Female
+ * - bm_: British English Male
+ * - ef_: Spanish Female
+ * - em_: Spanish Male
+ * - ff_: French Female
+ * - hf_: Hindi Female
+ * - jf_: Japanese Female
+ * - jm_: Japanese Male
+ * - pf_: Portuguese Female
+ * - zf_: Chinese Female
+ * - zm_: Chinese Male
+ */
+export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = [
+  // American English Female (af_)
+  { id: "af_heart", label: "Heart" },
+  { id: "af_alloy", label: "Alloy" },
+  { id: "af_aoede", label: "Aoede" },
+  { id: "af_bella", label: "Bella" },
+  { id: "af_jessica", label: "Jessica" },
+  { id: "af_kore", label: "Kore" },
+  { id: "af_nicole", label: "Nicole" },
+  { id: "af_nova", label: "Nova" },
+  { id: "af_river", label: "River" },
+  { id: "af_sarah", label: "Sarah" },
+  { id: "af_sky", label: "Sky" },
+  // American English Male (am_)
+  { id: "am_adam", label: "Adam" },
+  { id: "am_echo", label: "Echo" },
+  { id: "am_eric", label: "Eric" },
+  { id: "am_fenrir", label: "Fenrir" },
+  { id: "am_liam", label: "Liam" },
+  { id: "am_michael", label: "Michael" },
+  { id: "am_onyx", label: "Onyx" },
+  { id: "am_puck", label: "Puck" },
+  { id: "am_santa", label: "Santa" },
+  // British English Female (bf_)
+  { id: "bf_alice", label: "Alice" },
+  { id: "bf_emma", label: "Emma" },
+  { id: "bf_isabella", label: "Isabella" },
+  { id: "bf_lily", label: "Lily" },
+  // British English Male (bm_)
+  { id: "bm_daniel", label: "Daniel" },
+  { id: "bm_fable", label: "Fable" },
+  { id: "bm_george", label: "George" },
+  { id: "bm_lewis", label: "Lewis" },
+  { id: "bm_oscar", label: "Oscar" },
+  // Spanish Female (ef_)
+  { id: "ef_dora", label: "Dora" },
+  { id: "ef_elena", label: "Elena" },
+  { id: "ef_maria", label: "Maria" },
+  // Spanish Male (em_)
+  { id: "em_alex", label: "Alex" },
+  { id: "em_carlos", label: "Carlos" },
+  { id: "em_santa", label: "Santa" },
+  // French Female (ff_)
+  { id: "ff_camille", label: "Camille" },
+  { id: "ff_siwis", label: "Siwis" },
+  // Hindi Female (hf_)
+  { id: "hf_alpha", label: "Alpha" },
+  { id: "hf_beta", label: "Beta" },
+  // Japanese Female (jf_)
+  { id: "jf_alpha", label: "Alpha" },
+  { id: "jf_gongitsune", label: "Gongitsune" },
+  { id: "jf_nezumi", label: "Nezumi" },
+  { id: "jf_tebukuro", label: "Tebukuro" },
+  // Japanese Male (jm_)
+  { id: "jm_kumo", label: "Kumo" },
+  // Portuguese Female (pf_)
+  { id: "pf_dora", label: "Dora" },
+  // Chinese Female (zf_)
+  { id: "zf_xiaobei", label: "Xiaobei" },
+  { id: "zf_xiaoni", label: "Xiaoni" },
+  { id: "zf_xiaoxiao", label: "Xiaoxiao" },
+  { id: "zf_xiaoyi", label: "Xiaoyi" },
+  // Chinese Male (zm_)
+  { id: "zm_yunjian", label: "Yunjian" },
+  { id: "zm_yunxi", label: "Yunxi" },
+  { id: "zm_yunxia", label: "Yunxia" },
+  { id: "zm_yunyang", label: "Yunyang" },
+] as const;
+
+// ==========================================
+// Prefix parser
+// ==========================================
+
+/** Parsed voice prefix metadata */
+export interface VoicePrefixMetadata {
+  /** BCP 47 language code (e.g. "en-US", "en-GB", "ja") */
+  language: string;
+  /** Gender: "female", "male", or "unknown" */
+  gender: string;
+  /** Human-readable accent label (e.g. "American", "British") */
+  accent: string;
+}
+
+/**
+ * Parse a Kokoro voice ID to extract language, gender, and accent metadata.
+ *
+ * Voice IDs follow the pattern: {lang}{gender}_{name}
+ * The first two characters encode language/accent and gender.
+ *
+ * @param voiceId - Kokoro voice ID (e.g. "af_heart")
+ * @returns Parsed metadata with language, gender, and accent
+ */
+export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
+  const prefix = voiceId.substring(0, 2);
+  const mapping = VOICE_PREFIX_MAP[prefix];
+
+  if (mapping) {
+    return {
+      language: mapping.language,
+      gender: mapping.gender,
+      accent: mapping.accent,
+    };
+  }
+
+  return {
+    language: "unknown",
+    gender: "unknown",
+    accent: "Unknown",
+  };
+}
+
+// ==========================================
+// Provider class
+// ==========================================
+
+/**
+ * Kokoro-FastAPI TTS provider (default tier).
+ *
+ * CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
+ * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
+ *
+ * @example
+ * ```typescript
+ * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1");
+ * const voices = await kokoro.listVoices();
+ * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" });
+ * ```
+ */
+export class KokoroTtsProvider extends BaseTTSProvider {
+  readonly name = "kokoro";
+  readonly tier: SpeechTier = "default";
+
+  /**
+   * Create a new Kokoro TTS provider.
+   *
+   * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1")
+   * @param defaultVoice - Default voice ID (defaults to "af_heart")
+   * @param defaultFormat - Default audio format (defaults to "mp3")
+   */
+  constructor(
+    baseURL: string,
+    defaultVoice: string = KOKORO_DEFAULT_VOICE,
+    defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT
+  ) {
+    super(baseURL, defaultVoice, defaultFormat);
+  }
+
+  /**
+   * List all available Kokoro voices with metadata.
+   *
+   * Returns the full catalog of 53 built-in voices with language, gender,
+   * and accent information derived from voice ID prefixes.
+   *
+   * @returns Array of VoiceInfo objects for all Kokoro voices
+   */
+  override listVoices(): Promise<VoiceInfo[]> {
+    const voices: VoiceInfo[] = KOKORO_VOICES.map((entry) => {
+      const metadata = parseVoicePrefix(entry.id);
+      const genderLabel = metadata.gender === "female" ? "Female" : "Male";
+
+      return {
+        id: entry.id,
+        name: `${entry.label} (${metadata.accent} ${genderLabel})`,
+        language: metadata.language,
+        tier: this.tier,
+        isDefault: entry.id === this.defaultVoice,
+      };
+    });
+
+    return Promise.resolve(voices);
+  }
+}
diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts
index 3f049ab..28c807f 100644
--- a/apps/api/src/speech/providers/tts-provider.factory.ts
+++ b/apps/api/src/speech/providers/tts-provider.factory.ts
@@ -15,6 +15,8 @@ import { Logger } from "@nestjs/common";
 import { BaseTTSProvider } from "./base-tts.provider";
+import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
+import { KokoroTtsProvider } from "./kokoro-tts.provider";
 import type { ITTSProvider } from "../interfaces/tts-provider.interface";
 import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
 import type { SpeechConfig } from "../speech.config";
@@ -23,28 +25,6 @@ import type { SpeechConfig } from "../speech.config";
 
 // ==========================================
 // Concrete provider classes
 // ==========================================
 
-/**
- * Kokoro TTS provider (default tier).
- * CPU-based, always available, Apache 2.0 license.
- */
-class KokoroProvider extends BaseTTSProvider {
-  readonly name = "kokoro";
-  readonly tier: SpeechTier = "default";
-}
-
-/**
- * Chatterbox TTS provider (premium tier).
- * GPU required, voice cloning capable, MIT license.
- */
-class ChatterboxProvider extends BaseTTSProvider {
-  readonly name = "chatterbox";
-  readonly tier: SpeechTier = "premium";
-
-  constructor(baseURL: string) {
-    super(baseURL, "default", "mp3");
-  }
-}
-
 /**
  * Piper TTS provider via OpenedAI Speech (fallback tier).
  * Ultra-lightweight CPU, GPL license.
@@ -78,7 +58,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSProvider>