From 79b1d81d27aafa93cd6ae0e9ceadda33477dc0e1 Mon Sep 17 00:00:00 2001
From: Jason Woltje
Date: Sun, 15 Feb 2026 02:27:47 -0600
Subject: [PATCH] feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog

Extract KokoroTtsProvider from factory into its own module with:
- Full voice catalog of 53 built-in voices across 8 languages
- Voice metadata parsing from ID prefix (language, gender, accent)
- Exported constants for supported formats and speed range
- Comprehensive unit tests (48 tests)
- Fix lint/type errors in chatterbox provider (Prettier + unsafe cast)

Co-Authored-By: Claude Opus 4.6
---
 .../providers/chatterbox-tts.provider.ts      | 169 ++++++++++
 .../providers/kokoro-tts.provider.spec.ts     | 316 ++++++++++++++++++
 .../speech/providers/kokoro-tts.provider.ts   | 278 +++++++++++++++
 .../speech/providers/tts-provider.factory.ts  |  28 +-
 4 files changed, 767 insertions(+), 24 deletions(-)
 create mode 100644 apps/api/src/speech/providers/chatterbox-tts.provider.ts
 create mode 100644 apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
 create mode 100644 apps/api/src/speech/providers/kokoro-tts.provider.ts

diff --git a/apps/api/src/speech/providers/chatterbox-tts.provider.ts b/apps/api/src/speech/providers/chatterbox-tts.provider.ts
new file mode 100644
index 0000000..c17c060
--- /dev/null
+++ b/apps/api/src/speech/providers/chatterbox-tts.provider.ts
@@ -0,0 +1,169 @@
+/**
+ * Chatterbox TTS Provider
+ *
+ * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
+ * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
+ * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
+ *
+ * Key capabilities:
+ * - Voice cloning via reference audio sample
+ * - Emotion exaggeration control (0.0 - 1.0)
+ * - Cross-language voice transfer (23 languages)
+ * - Graceful degradation when GPU is unavailable (isHealthy returns false)
+ *
+ * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
+ *
+ * Issue #394
+ */
+
+import type { SpeechCreateParams } from "openai/resources/audio/speech";
+import { BaseTTSProvider } from "./base-tts.provider";
+import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
+import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
+
+/** Default voice for Chatterbox */
+const CHATTERBOX_DEFAULT_VOICE = "default";
+
+/** Default audio format for Chatterbox (WAV for highest quality) */
+const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;
+
+/** Default TTS model identifier */
+const DEFAULT_MODEL = "tts-1";
+
+/** Default speech speed multiplier */
+const DEFAULT_SPEED = 1.0;
+
+/**
+ * Languages supported by Chatterbox for cross-language voice transfer.
+ * Chatterbox supports 23 languages for voice cloning and synthesis.
+ */
+const SUPPORTED_LANGUAGES: readonly string[] = [
+  "en", // English
+  "fr", // French
+  "de", // German
+  "es", // Spanish
+  "it", // Italian
+  "pt", // Portuguese
+  "nl", // Dutch
+  "pl", // Polish
+  "ru", // Russian
+  "uk", // Ukrainian
+  "ja", // Japanese
+  "zh", // Chinese
+  "ko", // Korean
+  "ar", // Arabic
+  "hi", // Hindi
+  "tr", // Turkish
+  "sv", // Swedish
+  "da", // Danish
+  "fi", // Finnish
+  "no", // Norwegian
+  "cs", // Czech
+  "el", // Greek
+  "ro", // Romanian
+] as const;
+
+/**
+ * Chatterbox TTS provider (premium tier).
+ *
+ * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
+ * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
+ * body parameters for its advanced features.
+ *
+ * @example
+ * ```typescript
+ * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
+ *
+ * // Basic synthesis
+ * const result = await provider.synthesize("Hello!");
+ *
+ * // Voice cloning with emotion
+ * const clonedResult = await provider.synthesize("Hello!", {
+ *   referenceAudio: myAudioBuffer,
+ *   emotionExaggeration: 0.7,
+ * });
+ * ```
+ */
+export class ChatterboxTTSProvider extends BaseTTSProvider {
+  readonly name = "chatterbox";
+  readonly tier: SpeechTier = "premium";
+
+  /**
+   * Languages supported for cross-language voice transfer.
+   */
+  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;
+
+  constructor(baseURL: string) {
+    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
+  }
+
+  /**
+   * Synthesize text to audio with optional voice cloning and emotion control.
+   *
+   * Overrides the base synthesize() to support Chatterbox-specific options:
+   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
+   * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
+   *
+   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
+   * which Chatterbox's API accepts alongside the standard parameters.
+   *
+   * @param text - Text to convert to speech
+   * @param options - Synthesis options, optionally including Chatterbox-specific params
+   * @returns Synthesis result with audio buffer and metadata
+   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
+   */
+  async synthesize(
+    text: string,
+    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
+  ): Promise<SynthesisResult> {
+    const voice = options?.voice ?? this.defaultVoice;
+    const format = options?.format ?? this.defaultFormat;
+    const speed = options?.speed ?? DEFAULT_SPEED;
+
+    // Build the request body with standard OpenAI-compatible params
+    const requestBody: Record<string, unknown> = {
+      model: DEFAULT_MODEL,
+      input: text,
+      voice,
+      response_format: format,
+      speed,
+    };
+
+    // Add Chatterbox-specific params if provided
+    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;
+
+    if (chatterboxOptions?.referenceAudio) {
+      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
+    }
+
+    if (chatterboxOptions?.emotionExaggeration !== undefined) {
+      // Clamp to valid range [0.0, 1.0]
+      requestBody.exaggeration = Math.max(
+        0.0,
+        Math.min(1.0, chatterboxOptions.emotionExaggeration)
+      );
+    }
+
+    try {
+      // Use the OpenAI SDK's create method, passing extra params
+      // The OpenAI SDK allows additional body params to be passed through
+      const response = await this.client.audio.speech.create(
+        requestBody as unknown as SpeechCreateParams
+      );
+
+      const arrayBuffer = await response.arrayBuffer();
+      const audio = Buffer.from(arrayBuffer);
+
+      return {
+        audio,
+        format,
+        voice,
+        tier: this.tier,
+      };
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      this.logger.error(`TTS synthesis failed: ${message}`);
+      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
+    }
+  }
+}
diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
new file mode 100644
index 0000000..27c35dc
--- /dev/null
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
@@ -0,0 +1,316 @@
+/**
+ * KokoroTtsProvider Unit Tests
+ *
+ * Tests the Kokoro-FastAPI TTS provider with full voice catalog,
+ * voice metadata parsing, and Kokoro-specific feature constants.
+ *
+ * Issue #393
+ */
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import {
+  KokoroTtsProvider,
+  KOKORO_SUPPORTED_FORMATS,
+  KOKORO_SPEED_RANGE,
+  KOKORO_VOICES,
+  parseVoicePrefix,
+} from "./kokoro-tts.provider";
+import type { VoiceInfo } from "../interfaces/speech-types";
+
+// ==========================================
+// Mock OpenAI SDK
+// ==========================================
+
+vi.mock("openai", () => {
+  class MockOpenAI {
+    audio = {
+      speech: {
+        create: vi.fn(),
+      },
+    };
+  }
+  return { default: MockOpenAI };
+});
+
+// ==========================================
+// Provider identity
+// ==========================================
+
+describe("KokoroTtsProvider", () => {
+  const testBaseURL = "http://kokoro-tts:8880/v1";
+  let provider: KokoroTtsProvider;
+
+  beforeEach(() => {
+    provider = new KokoroTtsProvider(testBaseURL);
+  });
+
+  describe("provider identity", () => {
+    it("should have name 'kokoro'", () => {
+      expect(provider.name).toBe("kokoro");
+    });
+
+    it("should have tier 'default'", () => {
+      expect(provider.tier).toBe("default");
+    });
+  });
+
+  // ==========================================
+  // listVoices()
+  // ==========================================
+
+  describe("listVoices", () => {
+    let voices: VoiceInfo[];
+
+    beforeEach(async () => {
+      voices = await provider.listVoices();
+    });
+
+    it("should return an array of VoiceInfo objects", () => {
+      expect(voices).toBeInstanceOf(Array);
+      expect(voices.length).toBeGreaterThan(0);
+    });
+
+    it("should return at least 10 voices", () => {
+      // The issue specifies at least: af_heart, af_bella, af_nicole, af_sarah, af_sky,
+      // am_adam, am_michael, bf_emma, bf_isabella, bm_george, bm_lewis
+      expect(voices.length).toBeGreaterThanOrEqual(10);
+    });
+
+    it("should set tier to 'default' on all voices", () => {
+      for (const voice of voices) {
+        expect(voice.tier).toBe("default");
+      }
+    });
+
+    it("should have exactly one default voice", () => {
+      const defaults = voices.filter((v) => v.isDefault === true);
+      expect(defaults.length).toBe(1);
+    });
+
+    it("should mark af_heart as the default voice", () => {
+      const defaultVoice = voices.find((v) => v.isDefault === true);
+      expect(defaultVoice).toBeDefined();
+      expect(defaultVoice?.id).toBe("af_heart");
+    });
+
+    it("should have an id and name for every voice", () => {
+      for (const voice of voices) {
+        expect(voice.id).toBeTruthy();
+        expect(voice.name).toBeTruthy();
+      }
+    });
+
+    it("should set language on every voice", () => {
+      for (const voice of voices) {
+        expect(voice.language).toBeTruthy();
+      }
+    });
+
+    // ==========================================
+    // Required voices from the issue
+    // ==========================================
+
+    describe("required voices", () => {
+      const requiredVoiceIds = [
+        "af_heart",
+        "af_bella",
+        "af_nicole",
+        "af_sarah",
+        "af_sky",
+        "am_adam",
+        "am_michael",
+        "bf_emma",
+        "bf_isabella",
+        "bm_george",
+        "bm_lewis",
+      ];
+
+      it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => {
+        const voice = voices.find((v) => v.id === voiceId);
+        expect(voice).toBeDefined();
+      });
+    });
+
+    // ==========================================
+    // Voice metadata from prefix
+    // ==========================================
+
+    describe("voice metadata from prefix", () => {
+      it("should set language to 'en-US' for af_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "af_heart");
+        expect(voice?.language).toBe("en-US");
+      });
+
+      it("should set language to 'en-US' for am_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "am_adam");
+        expect(voice?.language).toBe("en-US");
+      });
+
+      it("should set language to 'en-GB' for bf_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "bf_emma");
+        expect(voice?.language).toBe("en-GB");
+      });
+
+      it("should set language to 'en-GB' for bm_ prefix voices", () => {
+        const voice = voices.find((v) => v.id === "bm_george");
+        expect(voice?.language).toBe("en-GB");
+      });
+
+      it("should include gender in voice name for af_ prefix", () => {
+        const voice = voices.find((v) => v.id === "af_heart");
+        expect(voice?.name).toContain("Female");
+      });
+
+      it("should include gender in voice name for am_ prefix", () => {
+        const voice = voices.find((v) => v.id === "am_adam");
+        expect(voice?.name).toContain("Male");
+      });
+
+      it("should include gender in voice name for bf_ prefix", () => {
+        const voice = voices.find((v) => v.id === "bf_emma");
+        expect(voice?.name).toContain("Female");
+      });
+
+      it("should include gender in voice name for bm_ prefix", () => {
+        const voice = voices.find((v) => v.id === "bm_george");
+        expect(voice?.name).toContain("Male");
+      });
+    });
+
+    // ==========================================
+    // Voice name formatting
+    // ==========================================
+
+    describe("voice name formatting", () => {
+      it("should capitalize the voice name portion", () => {
+        const voice = voices.find((v) => v.id === "af_heart");
+        expect(voice?.name).toContain("Heart");
+      });
+
+      it("should include the accent/language label in the name", () => {
+        const afVoice = voices.find((v) => v.id === "af_heart");
+        expect(afVoice?.name).toContain("American");
+
+        const bfVoice = voices.find((v) => v.id === "bf_emma");
+        expect(bfVoice?.name).toContain("British");
+      });
+    });
+  });
+
+  // ==========================================
+  // Custom constructor
+  // ==========================================
+
+  describe("constructor", () => {
+    it("should accept custom default voice", () => {
+      const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella");
+      expect(customProvider).toBeDefined();
+    });
+
+    it("should accept custom default format", () => {
+      const customProvider = new KokoroTtsProvider(testBaseURL, "af_heart", "wav");
+      expect(customProvider).toBeDefined();
+    });
+
+    it("should use af_heart as default voice when none specified", () => {
+      const defaultProvider = new KokoroTtsProvider(testBaseURL);
+      expect(defaultProvider).toBeDefined();
+    });
+  });
+});
+
+// ==========================================
+// parseVoicePrefix utility
+// ==========================================
+
+describe("parseVoicePrefix", () => {
+  it("should parse af_ as American English Female", () => {
+    const result = parseVoicePrefix("af_heart");
+    expect(result.language).toBe("en-US");
+    expect(result.gender).toBe("female");
+    expect(result.accent).toBe("American");
+  });
+
+  it("should parse am_ as American English Male", () => {
+    const result = parseVoicePrefix("am_adam");
+    expect(result.language).toBe("en-US");
+    expect(result.gender).toBe("male");
+    expect(result.accent).toBe("American");
+  });
+
+  it("should parse bf_ as British English Female", () => {
+    const result = parseVoicePrefix("bf_emma");
+    expect(result.language).toBe("en-GB");
+    expect(result.gender).toBe("female");
+    expect(result.accent).toBe("British");
+  });
+
+  it("should parse bm_ as British English Male", () => {
+    const result = parseVoicePrefix("bm_george");
+    expect(result.language).toBe("en-GB");
+    expect(result.gender).toBe("male");
+    expect(result.accent).toBe("British");
+  });
+
+  it("should return unknown for unrecognized prefix", () => {
+    const result = parseVoicePrefix("xx_unknown");
+    expect(result.language).toBe("unknown");
+    expect(result.gender).toBe("unknown");
+    expect(result.accent).toBe("Unknown");
+  });
+});
+
+// ==========================================
+// Exported constants
+// ==========================================
+
+describe("KOKORO_SUPPORTED_FORMATS", () => {
+  it("should include mp3", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("mp3");
+  });
+
+  it("should include wav", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("wav");
+  });
+
+  it("should include opus", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("opus");
+  });
+
+  it("should include flac", () => {
+    expect(KOKORO_SUPPORTED_FORMATS).toContain("flac");
+  });
+
+  it("should be a readonly array", () => {
+    expect(Array.isArray(KOKORO_SUPPORTED_FORMATS)).toBe(true);
+  });
+});
+
+describe("KOKORO_SPEED_RANGE", () => {
+  it("should have min speed of 0.25", () => {
+    expect(KOKORO_SPEED_RANGE.min).toBe(0.25);
+  });
+
+  it("should have max speed of 4.0", () => {
+    expect(KOKORO_SPEED_RANGE.max).toBe(4.0);
+  });
+});
+
+describe("KOKORO_VOICES", () => {
+  it("should be a non-empty array", () => {
+    expect(Array.isArray(KOKORO_VOICES)).toBe(true);
+    expect(KOKORO_VOICES.length).toBeGreaterThan(0);
+  });
+
+  it("should contain voice entries with id and label", () => {
+    for (const voice of KOKORO_VOICES) {
+      expect(voice.id).toBeTruthy();
+      expect(voice.label).toBeTruthy();
+    }
+  });
+
+  it("should include voices from multiple language prefixes", () => {
+    const prefixes = new Set(KOKORO_VOICES.map((v) => v.id.substring(0, 2)));
+    expect(prefixes.size).toBeGreaterThanOrEqual(4);
+  });
+});
diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.ts b/apps/api/src/speech/providers/kokoro-tts.provider.ts
new file mode 100644
index 0000000..ac1b7d3
--- /dev/null
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts
@@ -0,0 +1,278 @@
+/**
+ * Kokoro-FastAPI TTS Provider
+ *
+ * Default-tier TTS provider backed by Kokoro-FastAPI.
+ * CPU-based, always available, Apache 2.0 license.
+ *
+ * Features:
+ * - 53 built-in voices across 8 languages
+ * - Speed control: 0.25x to 4.0x
+ * - Output formats: mp3, wav, opus, flac
+ * - Voice metadata derived from ID prefix (language, gender, accent)
+ *
+ * Voice ID format: {prefix}_{name}
+ * - First character: language/accent code (a=American, b=British, etc.)
+ * - Second character: gender code (f=Female, m=Male)
+ *
+ * Issue #393
+ */
+
+import { BaseTTSProvider } from "./base-tts.provider";
+import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
+
+// ==========================================
+// Constants
+// ==========================================
+
+/** Audio formats supported by Kokoro-FastAPI */
+export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = [
+  "mp3",
+  "wav",
+  "opus",
+  "flac",
+] as const;
+
+/** Speed range supported by Kokoro-FastAPI */
+export const KOKORO_SPEED_RANGE = {
+  min: 0.25,
+  max: 4.0,
+} as const;
+
+/** Default voice for Kokoro */
+const KOKORO_DEFAULT_VOICE = "af_heart";
+
+/** Default audio format for Kokoro */
+const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3";
+
+// ==========================================
+// Voice prefix mapping
+// ==========================================
+
+/**
+ * Mapping of voice ID prefix (first two characters) to language/accent/gender metadata.
+ *
+ * Kokoro voice IDs follow the pattern: {lang}{gender}_{name}
+ * - lang: a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese
+ * - gender: f=Female, m=Male
+ */
+const VOICE_PREFIX_MAP: Record<string, { language: string; gender: string; accent: string }> = {
+  af: { language: "en-US", gender: "female", accent: "American" },
+  am: { language: "en-US", gender: "male", accent: "American" },
+  bf: { language: "en-GB", gender: "female", accent: "British" },
+  bm: { language: "en-GB", gender: "male", accent: "British" },
+  ef: { language: "es", gender: "female", accent: "Spanish" },
+  em: { language: "es", gender: "male", accent: "Spanish" },
+  ff: { language: "fr", gender: "female", accent: "French" },
+  fm: { language: "fr", gender: "male", accent: "French" },
+  hf: { language: "hi", gender: "female", accent: "Hindi" },
+  hm: { language: "hi", gender: "male", accent: "Hindi" },
+  jf: { language: "ja", gender: "female", accent: "Japanese" },
+  jm: { language: "ja", gender: "male", accent: "Japanese" },
+  pf: { language: "pt-BR", gender: "female", accent: "Portuguese" },
+  pm: { language: "pt-BR", gender: "male", accent: "Portuguese" },
+  zf: { language: "zh", gender: "female", accent: "Chinese" },
+  zm: { language: "zh", gender: "male", accent: "Chinese" },
+};
+
+// ==========================================
+// Voice catalog
+// ==========================================
+
+/** Raw voice catalog entry */
+interface KokoroVoiceEntry {
+  /** Voice ID (e.g. "af_heart") */
+  id: string;
+  /** Human-readable label (e.g. "Heart") */
+  label: string;
+}
+
+/**
+ * Complete catalog of Kokoro built-in voices.
+ *
+ * Organized by language/accent prefix:
+ * - af_: American English Female
+ * - am_: American English Male
+ * - bf_: British English Female
+ * - bm_: British English Male
+ * - ef_: Spanish Female
+ * - em_: Spanish Male
+ * - ff_: French Female
+ * - hf_: Hindi Female
+ * - jf_: Japanese Female
+ * - jm_: Japanese Male
+ * - pf_: Portuguese Female
+ * - zf_: Chinese Female
+ * - zm_: Chinese Male
+ */
+export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = [
+  // American English Female (af_)
+  { id: "af_heart", label: "Heart" },
+  { id: "af_alloy", label: "Alloy" },
+  { id: "af_aoede", label: "Aoede" },
+  { id: "af_bella", label: "Bella" },
+  { id: "af_jessica", label: "Jessica" },
+  { id: "af_kore", label: "Kore" },
+  { id: "af_nicole", label: "Nicole" },
+  { id: "af_nova", label: "Nova" },
+  { id: "af_river", label: "River" },
+  { id: "af_sarah", label: "Sarah" },
+  { id: "af_sky", label: "Sky" },
+  // American English Male (am_)
+  { id: "am_adam", label: "Adam" },
+  { id: "am_echo", label: "Echo" },
+  { id: "am_eric", label: "Eric" },
+  { id: "am_fenrir", label: "Fenrir" },
+  { id: "am_liam", label: "Liam" },
+  { id: "am_michael", label: "Michael" },
+  { id: "am_onyx", label: "Onyx" },
+  { id: "am_puck", label: "Puck" },
+  { id: "am_santa", label: "Santa" },
+  // British English Female (bf_)
+  { id: "bf_alice", label: "Alice" },
+  { id: "bf_emma", label: "Emma" },
+  { id: "bf_isabella", label: "Isabella" },
+  { id: "bf_lily", label: "Lily" },
+  // British English Male (bm_)
+  { id: "bm_daniel", label: "Daniel" },
+  { id: "bm_fable", label: "Fable" },
+  { id: "bm_george", label: "George" },
+  { id: "bm_lewis", label: "Lewis" },
+  { id: "bm_oscar", label: "Oscar" },
+  // Spanish Female (ef_)
+  { id: "ef_dora", label: "Dora" },
+  { id: "ef_elena", label: "Elena" },
+  { id: "ef_maria", label: "Maria" },
+  // Spanish Male (em_)
+  { id: "em_alex", label: "Alex" },
+  { id: "em_carlos", label: "Carlos" },
+  { id: "em_santa", label: "Santa" },
+  // French Female (ff_)
+  { id: "ff_camille", label: "Camille" },
+  { id: "ff_siwis", label: "Siwis" },
+  // Hindi Female (hf_)
+  { id: "hf_alpha", label: "Alpha" },
+  { id: "hf_beta", label: "Beta" },
+  // Japanese Female (jf_)
+  { id: "jf_alpha", label: "Alpha" },
+  { id: "jf_gongitsune", label: "Gongitsune" },
+  { id: "jf_nezumi", label: "Nezumi" },
+  { id: "jf_tebukuro", label: "Tebukuro" },
+  // Japanese Male (jm_)
+  { id: "jm_kumo", label: "Kumo" },
+  // Portuguese Female (pf_)
+  { id: "pf_dora", label: "Dora" },
+  // Chinese Female (zf_)
+  { id: "zf_xiaobei", label: "Xiaobei" },
+  { id: "zf_xiaoni", label: "Xiaoni" },
+  { id: "zf_xiaoxiao", label: "Xiaoxiao" },
+  { id: "zf_xiaoyi", label: "Xiaoyi" },
+  // Chinese Male (zm_)
+  { id: "zm_yunjian", label: "Yunjian" },
+  { id: "zm_yunxi", label: "Yunxi" },
+  { id: "zm_yunxia", label: "Yunxia" },
+  { id: "zm_yunyang", label: "Yunyang" },
+] as const;
+
+// ==========================================
+// Prefix parser
+// ==========================================
+
+/** Parsed voice prefix metadata */
+export interface VoicePrefixMetadata {
+  /** BCP 47 language code (e.g. "en-US", "en-GB", "ja") */
+  language: string;
+  /** Gender: "female", "male", or "unknown" */
+  gender: string;
+  /** Human-readable accent label (e.g. "American", "British") */
+  accent: string;
+}
+
+/**
+ * Parse a Kokoro voice ID to extract language, gender, and accent metadata.
+ *
+ * Voice IDs follow the pattern: {lang}{gender}_{name}
+ * The first two characters encode language/accent and gender.
+ *
+ * @param voiceId - Kokoro voice ID (e.g. "af_heart")
+ * @returns Parsed metadata with language, gender, and accent
+ */
+export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
+  const prefix = voiceId.substring(0, 2);
+  const mapping = VOICE_PREFIX_MAP[prefix];
+
+  if (mapping) {
+    return {
+      language: mapping.language,
+      gender: mapping.gender,
+      accent: mapping.accent,
+    };
+  }
+
+  return {
+    language: "unknown",
+    gender: "unknown",
+    accent: "Unknown",
+  };
+}
+
+// ==========================================
+// Provider class
+// ==========================================
+
+/**
+ * Kokoro-FastAPI TTS provider (default tier).
+ *
+ * CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
+ * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
+ *
+ * @example
+ * ```typescript
+ * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1");
+ * const voices = await kokoro.listVoices();
+ * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" });
+ * ```
+ */
+export class KokoroTtsProvider extends BaseTTSProvider {
+  readonly name = "kokoro";
+  readonly tier: SpeechTier = "default";
+
+  /**
+   * Create a new Kokoro TTS provider.
+   *
+   * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1")
+   * @param defaultVoice - Default voice ID (defaults to "af_heart")
+   * @param defaultFormat - Default audio format (defaults to "mp3")
+   */
+  constructor(
+    baseURL: string,
+    defaultVoice: string = KOKORO_DEFAULT_VOICE,
+    defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT
+  ) {
+    super(baseURL, defaultVoice, defaultFormat);
+  }
+
+  /**
+   * List all available Kokoro voices with metadata.
+   *
+   * Returns the full catalog of 53 built-in voices with language, gender,
+   * and accent information derived from voice ID prefixes.
+   *
+   * @returns Array of VoiceInfo objects for all Kokoro voices
+   */
+  override listVoices(): Promise<VoiceInfo[]> {
+    const voices: VoiceInfo[] = KOKORO_VOICES.map((entry) => {
+      const metadata = parseVoicePrefix(entry.id);
+      const genderLabel = metadata.gender === "female" ? "Female" : "Male";
+
+      return {
+        id: entry.id,
+        name: `${entry.label} (${metadata.accent} ${genderLabel})`,
+        language: metadata.language,
+        tier: this.tier,
+        isDefault: entry.id === this.defaultVoice,
+      };
+    });
+
+    return Promise.resolve(voices);
+  }
+}
diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts
index 3f049ab..28c807f 100644
--- a/apps/api/src/speech/providers/tts-provider.factory.ts
+++ b/apps/api/src/speech/providers/tts-provider.factory.ts
@@ -15,6 +15,8 @@ import { Logger } from "@nestjs/common";
 import { BaseTTSProvider } from "./base-tts.provider";
+import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
+import { KokoroTtsProvider } from "./kokoro-tts.provider";
 import type { ITTSProvider } from "../interfaces/tts-provider.interface";
 import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
 import type { SpeechConfig } from "../speech.config";
@@ -23,28 +25,6 @@ import type { SpeechConfig } from "../speech.config";
 
 // ==========================================
 // Concrete provider classes
 // ==========================================
 
-/**
- * Kokoro TTS provider (default tier).
- * CPU-based, always available, Apache 2.0 license.
- */
-class KokoroProvider extends BaseTTSProvider {
-  readonly name = "kokoro";
-  readonly tier: SpeechTier = "default";
-}
-
-/**
- * Chatterbox TTS provider (premium tier).
- * GPU required, voice cloning capable, MIT license.
- */
-class ChatterboxProvider extends BaseTTSProvider {
-  readonly name = "chatterbox";
-  readonly tier: SpeechTier = "premium";
-
-  constructor(baseURL: string) {
-    super(baseURL, "default", "mp3");
-  }
-}
-
 /**
  * Piper TTS provider via OpenedAI Speech (fallback tier).
  * Ultra-lightweight CPU, GPL license.
@@ -78,7 +58,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSProvider>