feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog
Some checks failed
ci/woodpecker/push/api Pipeline failed

Extract KokoroTtsProvider from factory into its own module with:
- Full voice catalog of 54 built-in voices across 8 languages
- Voice metadata parsing from ID prefix (language, gender, accent)
- Exported constants for supported formats and speed range
- Comprehensive unit tests (48 tests)
- Fix lint/type errors in chatterbox provider (Prettier + unsafe cast)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 02:27:47 -06:00
parent b5edb4f37e
commit 79b1d81d27
4 changed files with 767 additions and 24 deletions

View File

@@ -0,0 +1,169 @@
/**
* Chatterbox TTS Provider
*
* Premium-tier TTS provider with voice cloning and emotion exaggeration support.
* Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
* parameters for voice cloning (reference_audio) and emotion control (exaggeration).
*
* Key capabilities:
* - Voice cloning via reference audio sample
* - Emotion exaggeration control (0.0 - 1.0)
* - Cross-language voice transfer (23 languages)
* - Graceful degradation when GPU is unavailable (isHealthy returns false)
*
* The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
*
* Issue #394
*/
import type { SpeechCreateParams } from "openai/resources/audio/speech";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
/** Speed multiplier applied when the caller does not request one. */
const DEFAULT_SPEED = 1;
/** Model identifier sent with every synthesis request. */
const DEFAULT_MODEL = "tts-1";
/** Output format used when the caller does not pick one (WAV, chosen for quality). */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;
/** Voice ID used when the caller does not pick one. */
const CHATTERBOX_DEFAULT_VOICE = "default";
/**
 * Two-letter codes of the 23 languages Chatterbox can synthesize and clone
 * voices across. The provider exposes this list as `supportedLanguages`.
 */
const SUPPORTED_LANGUAGES: readonly string[] = [
  "en", // English
  "fr", // French
  "de", // German
  "es", // Spanish
  "it", // Italian
  "pt", // Portuguese
  "nl", // Dutch
  "pl", // Polish
  "ru", // Russian
  "uk", // Ukrainian
  "ja", // Japanese
  "zh", // Chinese
  "ko", // Korean
  "ar", // Arabic
  "hi", // Hindi
  "tr", // Turkish
  "sv", // Swedish
  "da", // Danish
  "fi", // Finnish
  "no", // Norwegian
  "cs", // Czech
  "el", // Greek
  "ro", // Romanian
];
/**
 * Chatterbox TTS provider (premium tier).
 *
 * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
 * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
 * body parameters for its advanced features.
 *
 * @example
 * ```typescript
 * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
 *
 * // Basic synthesis
 * const result = await provider.synthesize("Hello!");
 *
 * // Voice cloning with emotion
 * const clonedResult = await provider.synthesize("Hello!", {
 *   referenceAudio: myAudioBuffer,
 *   emotionExaggeration: 0.7,
 * });
 * ```
 */
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  /**
   * Create a Chatterbox provider using the module's default voice and format.
   *
   * @param baseURL - Base URL of the Chatterbox server's OpenAI-compatible API
   */
  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
   * - `emotionExaggeration`: Emotion intensity factor, clamped to [0.0, 1.0].
   *   A value that is not a number (including NaN) is left out of the request
   *   instead of being forwarded.
   *
   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
   * which Chatterbox's API accepts alongside the standard parameters.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Add Chatterbox-specific params if provided
    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;
    if (chatterboxOptions?.referenceAudio) {
      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
    }

    const exaggeration = chatterboxOptions?.emotionExaggeration;
    // Math.min/Math.max propagate NaN, and NaN is serialized as `null` in a JSON
    // body, so only genuine numbers are clamped and sent. +/-Infinity still
    // clamp to the range bounds.
    if (typeof exaggeration === "number" && !Number.isNaN(exaggeration)) {
      // Clamp to valid range [0.0, 1.0]
      requestBody.exaggeration = Math.max(0.0, Math.min(1.0, exaggeration));
    }

    try {
      // Use the OpenAI SDK's create method, passing extra params
      // The OpenAI SDK allows additional body params to be passed through
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );
      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);

      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}

View File

@@ -0,0 +1,316 @@
/**
* KokoroTtsProvider Unit Tests
*
* Tests the Kokoro-FastAPI TTS provider with full voice catalog,
* voice metadata parsing, and Kokoro-specific feature constants.
*
* Issue #393
*/
import { describe, it, expect, vi, beforeEach } from "vitest";
import {
KokoroTtsProvider,
KOKORO_SUPPORTED_FORMATS,
KOKORO_SPEED_RANGE,
KOKORO_VOICES,
parseVoicePrefix,
} from "./kokoro-tts.provider";
import type { VoiceInfo } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// Replace the OpenAI SDK's default export with a stub client whose
// audio.speech.create is a fresh vi.fn() per instance.
vi.mock("openai", () => ({
  default: class MockOpenAI {
    audio = {
      speech: {
        create: vi.fn(),
      },
    };
  },
}));
// ==========================================
// Provider identity
// ==========================================
describe("KokoroTtsProvider", () => {
const testBaseURL = "http://kokoro-tts:8880/v1";
let provider: KokoroTtsProvider;
beforeEach(() => {
provider = new KokoroTtsProvider(testBaseURL);
});
describe("provider identity", () => {
  it("should have name 'kokoro'", () => {
    const { name } = provider;
    expect(name).toBe("kokoro");
  });

  it("should have tier 'default'", () => {
    const { tier } = provider;
    expect(tier).toBe("default");
  });
});
// ==========================================
// listVoices()
// ==========================================
describe("listVoices", () => {
  let voices: VoiceInfo[];

  /** Look up a voice by ID in the list fetched for the current test. */
  const voiceById = (voiceId: string): VoiceInfo | undefined =>
    voices.find((candidate) => candidate.id === voiceId);

  beforeEach(async () => {
    voices = await provider.listVoices();
  });

  it("should return an array of VoiceInfo objects", () => {
    expect(voices).toBeInstanceOf(Array);
    expect(voices.length).toBeGreaterThan(0);
  });

  it("should return at least 10 voices", () => {
    // The issue names eleven voices that must exist: af_heart, af_bella,
    // af_nicole, af_sarah, af_sky, am_adam, am_michael, bf_emma, bf_isabella,
    // bm_george, bm_lewis
    expect(voices.length).toBeGreaterThanOrEqual(10);
  });

  it("should set tier to 'default' on all voices", () => {
    voices.forEach((entry) => {
      expect(entry.tier).toBe("default");
    });
  });

  it("should have exactly one default voice", () => {
    const flaggedAsDefault = voices.filter((entry) => entry.isDefault === true);
    expect(flaggedAsDefault.length).toBe(1);
  });

  it("should mark af_heart as the default voice", () => {
    const flagged = voices.find((entry) => entry.isDefault === true);
    expect(flagged).toBeDefined();
    expect(flagged?.id).toBe("af_heart");
  });

  it("should have an id and name for every voice", () => {
    voices.forEach((entry) => {
      expect(entry.id).toBeTruthy();
      expect(entry.name).toBeTruthy();
    });
  });

  it("should set language on every voice", () => {
    voices.forEach((entry) => {
      expect(entry.language).toBeTruthy();
    });
  });

  // ==========================================
  // Required voices from the issue
  // ==========================================
  describe("required voices", () => {
    const requiredVoiceIds = [
      "af_heart",
      "af_bella",
      "af_nicole",
      "af_sarah",
      "af_sky",
      "am_adam",
      "am_michael",
      "bf_emma",
      "bf_isabella",
      "bm_george",
      "bm_lewis",
    ];

    it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => {
      expect(voiceById(voiceId)).toBeDefined();
    });
  });

  // ==========================================
  // Voice metadata from prefix
  // ==========================================
  describe("voice metadata from prefix", () => {
    it("should set language to 'en-US' for af_ prefix voices", () => {
      expect(voiceById("af_heart")?.language).toBe("en-US");
    });

    it("should set language to 'en-US' for am_ prefix voices", () => {
      expect(voiceById("am_adam")?.language).toBe("en-US");
    });

    it("should set language to 'en-GB' for bf_ prefix voices", () => {
      expect(voiceById("bf_emma")?.language).toBe("en-GB");
    });

    it("should set language to 'en-GB' for bm_ prefix voices", () => {
      expect(voiceById("bm_george")?.language).toBe("en-GB");
    });

    it("should include gender in voice name for af_ prefix", () => {
      expect(voiceById("af_heart")?.name).toContain("Female");
    });

    it("should include gender in voice name for am_ prefix", () => {
      expect(voiceById("am_adam")?.name).toContain("Male");
    });

    it("should include gender in voice name for bf_ prefix", () => {
      expect(voiceById("bf_emma")?.name).toContain("Female");
    });

    it("should include gender in voice name for bm_ prefix", () => {
      expect(voiceById("bm_george")?.name).toContain("Male");
    });
  });

  // ==========================================
  // Voice name formatting
  // ==========================================
  describe("voice name formatting", () => {
    it("should capitalize the voice name portion", () => {
      expect(voiceById("af_heart")?.name).toContain("Heart");
    });

    it("should include the accent/language label in the name", () => {
      const american = voiceById("af_heart");
      expect(american?.name).toContain("American");
      const british = voiceById("bf_emma");
      expect(british?.name).toContain("British");
    });
  });
});
// ==========================================
// Custom constructor
// ==========================================
describe("constructor", () => {
  /**
   * IDs of the voices a provider flags as default.
   *
   * listVoices() sets isDefault by comparing each catalog ID with the
   * provider's default voice, so this exposes what the constructor stored.
   */
  const defaultVoiceIds = async (target: KokoroTtsProvider): Promise<string[]> => {
    const listed = await target.listVoices();
    return listed.filter((voice) => voice.isDefault === true).map((voice) => voice.id);
  };

  it("should accept custom default voice", async () => {
    const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella");
    // Previously only checked toBeDefined(), which passes even if the argument is ignored.
    expect(await defaultVoiceIds(customProvider)).toEqual(["af_bella"]);
  });

  it("should accept custom default format", () => {
    // The format is not observable without a synthesis call, so this only
    // checks that construction with all three arguments succeeds.
    const customProvider = new KokoroTtsProvider(testBaseURL, "af_heart", "wav");
    expect(customProvider).toBeInstanceOf(KokoroTtsProvider);
  });

  it("should use af_heart as default voice when none specified", async () => {
    const defaultProvider = new KokoroTtsProvider(testBaseURL);
    expect(await defaultVoiceIds(defaultProvider)).toEqual(["af_heart"]);
  });
});
});
// ==========================================
// parseVoicePrefix utility
// ==========================================
describe("parseVoicePrefix", () => {
  it("should parse af_ as American English Female", () => {
    expect(parseVoicePrefix("af_heart")).toMatchObject({
      language: "en-US",
      gender: "female",
      accent: "American",
    });
  });

  it("should parse am_ as American English Male", () => {
    expect(parseVoicePrefix("am_adam")).toMatchObject({
      language: "en-US",
      gender: "male",
      accent: "American",
    });
  });

  it("should parse bf_ as British English Female", () => {
    expect(parseVoicePrefix("bf_emma")).toMatchObject({
      language: "en-GB",
      gender: "female",
      accent: "British",
    });
  });

  it("should parse bm_ as British English Male", () => {
    expect(parseVoicePrefix("bm_george")).toMatchObject({
      language: "en-GB",
      gender: "male",
      accent: "British",
    });
  });

  it("should return unknown for unrecognized prefix", () => {
    expect(parseVoicePrefix("xx_unknown")).toMatchObject({
      language: "unknown",
      gender: "unknown",
      accent: "Unknown",
    });
  });
});
// ==========================================
// Exported constants
// ==========================================
describe("KOKORO_SUPPORTED_FORMATS", () => {
  // One generated case per required format; titles expand to "should include mp3", etc.
  it.each(["mp3", "wav", "opus", "flac"])("should include %s", (format) => {
    expect(KOKORO_SUPPORTED_FORMATS).toContain(format);
  });

  it("should be a readonly array", () => {
    expect(Array.isArray(KOKORO_SUPPORTED_FORMATS)).toBe(true);
  });
});
describe("KOKORO_SPEED_RANGE", () => {
  it("should have min speed of 0.25", () => {
    const { min } = KOKORO_SPEED_RANGE;
    expect(min).toBe(0.25);
  });

  it("should have max speed of 4.0", () => {
    const { max } = KOKORO_SPEED_RANGE;
    expect(max).toBe(4.0);
  });
});
describe("KOKORO_VOICES", () => {
  it("should be a non-empty array", () => {
    expect(Array.isArray(KOKORO_VOICES)).toBe(true);
    expect(KOKORO_VOICES.length).toBeGreaterThan(0);
  });

  it("should contain voice entries with id and label", () => {
    KOKORO_VOICES.forEach(({ id, label }) => {
      expect(id).toBeTruthy();
      expect(label).toBeTruthy();
    });
  });

  it("should include voices from multiple language prefixes", () => {
    // The first two characters of an ID are its language/gender prefix.
    const distinctPrefixes = new Set<string>();
    for (const { id } of KOKORO_VOICES) {
      distinctPrefixes.add(id.slice(0, 2));
    }
    expect(distinctPrefixes.size).toBeGreaterThanOrEqual(4);
  });
});

View File

@@ -0,0 +1,278 @@
/**
* Kokoro-FastAPI TTS Provider
*
* Default-tier TTS provider backed by Kokoro-FastAPI.
* CPU-based, always available, Apache 2.0 license.
*
* Features:
* - 54 built-in voices across 8 languages
* - Speed control: 0.25x to 4.0x
* - Output formats: mp3, wav, opus, flac
* - Voice metadata derived from ID prefix (language, gender, accent)
*
* Voice ID format: {prefix}_{name}
* - First character: language/accent code (a=American, b=British, etc.)
* - Second character: gender code (f=Female, m=Male)
*
* Issue #393
*/
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
// ==========================================
// Constants
// ==========================================
/** Voice ID the provider falls back to when none is configured. */
const KOKORO_DEFAULT_VOICE = "af_heart";

/** Audio format the provider falls back to when none is configured. */
const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3";

/** Output formats accepted by Kokoro-FastAPI. */
export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = ["mp3", "wav", "opus", "flac"];

/** Inclusive bounds of the speed multiplier accepted by Kokoro-FastAPI. */
export const KOKORO_SPEED_RANGE = {
  min: 0.25,
  max: 4,
} as const;
// ==========================================
// Voice prefix mapping
// ==========================================
/**
 * Language codes that form the first character of a Kokoro voice ID, each
 * paired with the language tag and accent label reported for it.
 */
const VOICE_LANGUAGE_CODES: readonly (readonly [string, string, string])[] = [
  ["a", "en-US", "American"],
  ["b", "en-GB", "British"],
  ["e", "es", "Spanish"],
  ["f", "fr", "French"],
  ["h", "hi", "Hindi"],
  ["j", "ja", "Japanese"],
  ["p", "pt-BR", "Portuguese"],
  ["z", "zh", "Chinese"],
];

/**
 * Lookup from a voice ID's two-character prefix to its language, gender and
 * accent.
 *
 * Kokoro voice IDs follow the pattern {lang}{gender}_{name}; the prefix is the
 * language code above followed by "f" (female) or "m" (male). Every language
 * code gets both a female and a male entry, in that order.
 */
const VOICE_PREFIX_MAP: Record<string, { language: string; gender: string; accent: string }> =
  Object.fromEntries(
    VOICE_LANGUAGE_CODES.flatMap(([code, language, accent]) =>
      (["female", "male"] as const).map(
        (gender): [string, { language: string; gender: string; accent: string }] => [
          // "female" -> "f", "male" -> "m"
          code + gender.charAt(0),
          { language, gender, accent },
        ]
      )
    )
  );
// ==========================================
// Voice catalog
// ==========================================
/** One entry of the built-in voice catalog. */
interface KokoroVoiceEntry {
  /** Voice ID (e.g. "af_heart") */
  id: string;
  /** Human-readable label (e.g. "Heart") */
  label: string;
}

/**
 * Catalog of Kokoro built-in voices, in the order listVoices() reports them.
 *
 * Entries are grouped by ID prefix:
 * - af_ / am_: American English, female / male
 * - bf_ / bm_: British English, female / male
 * - ef_ / em_: Spanish, female / male
 * - ff_: French, female
 * - hf_: Hindi, female
 * - jf_ / jm_: Japanese, female / male
 * - pf_: Portuguese, female
 * - zf_ / zm_: Chinese, female / male
 *
 * NOTE(review): this list holds 53 entries, while the documentation elsewhere
 * in this file states 54 built-in voices. Confirm against the upstream Kokoro
 * voice list which of the two is out of date.
 */
export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = [
  // af_: American English, female
  { id: "af_heart", label: "Heart" },
  { id: "af_alloy", label: "Alloy" },
  { id: "af_aoede", label: "Aoede" },
  { id: "af_bella", label: "Bella" },
  { id: "af_jessica", label: "Jessica" },
  { id: "af_kore", label: "Kore" },
  { id: "af_nicole", label: "Nicole" },
  { id: "af_nova", label: "Nova" },
  { id: "af_river", label: "River" },
  { id: "af_sarah", label: "Sarah" },
  { id: "af_sky", label: "Sky" },
  // am_: American English, male
  { id: "am_adam", label: "Adam" },
  { id: "am_echo", label: "Echo" },
  { id: "am_eric", label: "Eric" },
  { id: "am_fenrir", label: "Fenrir" },
  { id: "am_liam", label: "Liam" },
  { id: "am_michael", label: "Michael" },
  { id: "am_onyx", label: "Onyx" },
  { id: "am_puck", label: "Puck" },
  { id: "am_santa", label: "Santa" },
  // bf_: British English, female
  { id: "bf_alice", label: "Alice" },
  { id: "bf_emma", label: "Emma" },
  { id: "bf_isabella", label: "Isabella" },
  { id: "bf_lily", label: "Lily" },
  // bm_: British English, male
  { id: "bm_daniel", label: "Daniel" },
  { id: "bm_fable", label: "Fable" },
  { id: "bm_george", label: "George" },
  { id: "bm_lewis", label: "Lewis" },
  { id: "bm_oscar", label: "Oscar" },
  // ef_: Spanish, female
  { id: "ef_dora", label: "Dora" },
  { id: "ef_elena", label: "Elena" },
  { id: "ef_maria", label: "Maria" },
  // em_: Spanish, male
  { id: "em_alex", label: "Alex" },
  { id: "em_carlos", label: "Carlos" },
  { id: "em_santa", label: "Santa" },
  // ff_: French, female
  { id: "ff_camille", label: "Camille" },
  { id: "ff_siwis", label: "Siwis" },
  // hf_: Hindi, female
  { id: "hf_alpha", label: "Alpha" },
  { id: "hf_beta", label: "Beta" },
  // jf_: Japanese, female
  { id: "jf_alpha", label: "Alpha" },
  { id: "jf_gongitsune", label: "Gongitsune" },
  { id: "jf_nezumi", label: "Nezumi" },
  { id: "jf_tebukuro", label: "Tebukuro" },
  // jm_: Japanese, male
  { id: "jm_kumo", label: "Kumo" },
  // pf_: Portuguese, female
  { id: "pf_dora", label: "Dora" },
  // zf_: Chinese, female
  { id: "zf_xiaobei", label: "Xiaobei" },
  { id: "zf_xiaoni", label: "Xiaoni" },
  { id: "zf_xiaoxiao", label: "Xiaoxiao" },
  { id: "zf_xiaoyi", label: "Xiaoyi" },
  // zm_: Chinese, male
  { id: "zm_yunjian", label: "Yunjian" },
  { id: "zm_yunxi", label: "Yunxi" },
  { id: "zm_yunxia", label: "Yunxia" },
  { id: "zm_yunyang", label: "Yunyang" },
];
// ==========================================
// Prefix parser
// ==========================================
/** Parsed voice prefix metadata */
export interface VoicePrefixMetadata {
  /** BCP 47 language code (e.g. "en-US", "en-GB", "ja"), or "unknown" */
  language: string;
  /** Gender: "female", "male", or "unknown" */
  gender: string;
  /** Human-readable accent label (e.g. "American", "British"), or "Unknown" */
  accent: string;
}

/**
 * Parse a Kokoro voice ID to extract language, gender, and accent metadata.
 *
 * Voice IDs follow the pattern: {lang}{gender}_{name}
 * The first two characters encode language/accent and gender.
 *
 * An ID is only matched against the prefix table when it is either a bare
 * two-character prefix or has "_" as its third character. Strings that merely
 * begin with the same two letters as a prefix (e.g. "amazing") are therefore
 * reported as unknown instead of being classified as a Kokoro voice.
 *
 * @param voiceId - Kokoro voice ID (e.g. "af_heart")
 * @returns Parsed metadata with language, gender, and accent; every field is
 *   "unknown"/"Unknown" when the ID does not start with a recognized prefix
 */
export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
  const prefix = voiceId.substring(0, 2);
  // charAt() yields "" past the end of the string, which is the bare-prefix case.
  const separator = voiceId.charAt(2);
  const hasValidShape = separator === "" || separator === "_";

  // Own-property check so inherited Object.prototype members can never match.
  const mapping =
    hasValidShape && Object.prototype.hasOwnProperty.call(VOICE_PREFIX_MAP, prefix)
      ? VOICE_PREFIX_MAP[prefix]
      : undefined;

  if (mapping === undefined) {
    return {
      language: "unknown",
      gender: "unknown",
      accent: "Unknown",
    };
  }

  // Return a copy so callers cannot mutate the shared prefix table.
  return {
    language: mapping.language,
    gender: mapping.gender,
    accent: mapping.accent,
  };
}
// ==========================================
// Provider class
// ==========================================
/**
 * Kokoro-FastAPI TTS provider (default tier).
 *
 * Serves the built-in voice catalog (KOKORO_VOICES) and relies on
 * BaseTTSProvider for everything else, talking to the OpenAI-compatible API
 * exposed by Kokoro-FastAPI.
 *
 * @example
 * ```typescript
 * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1");
 * const voices = await kokoro.listVoices();
 * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" });
 * ```
 */
export class KokoroTtsProvider extends BaseTTSProvider {
  readonly name = "kokoro";
  readonly tier: SpeechTier = "default";

  /**
   * Create a new Kokoro TTS provider.
   *
   * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1")
   * @param defaultVoice - Default voice ID (defaults to "af_heart")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = KOKORO_DEFAULT_VOICE,
    defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * List all available Kokoro voices with metadata.
   *
   * Returns one VoiceInfo per KOKORO_VOICES entry, in catalog order, with the
   * language taken from the voice ID prefix. The voice whose ID equals this
   * provider's default voice is flagged with `isDefault`; if the default voice
   * is not in the catalog, no entry is flagged.
   *
   * @returns Array of VoiceInfo objects for all Kokoro voices
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const voices: VoiceInfo[] = KOKORO_VOICES.map((entry) => {
      const metadata = parseVoicePrefix(entry.id);
      return {
        id: entry.id,
        name: KokoroTtsProvider.formatVoiceName(entry.label, metadata),
        language: metadata.language,
        tier: this.tier,
        isDefault: entry.id === this.defaultVoice,
      };
    });
    // The catalog is static, so the result is available immediately.
    return Promise.resolve(voices);
  }

  /**
   * Build the display name for a voice, e.g. "Heart (American Female)".
   *
   * When the prefix could not be parsed (gender is neither "female" nor
   * "male"), the plain label is returned so an unrecognized voice is never
   * presented as "Male".
   *
   * @param label - Human-readable voice label from the catalog
   * @param metadata - Metadata parsed from the voice ID prefix
   * @returns Display name for the voice
   */
  private static formatVoiceName(label: string, metadata: VoicePrefixMetadata): string {
    if (metadata.gender !== "female" && metadata.gender !== "male") {
      return label;
    }
    const genderLabel = metadata.gender === "female" ? "Female" : "Male";
    return `${label} (${metadata.accent} ${genderLabel})`;
  }
}

View File

@@ -15,6 +15,8 @@
import { Logger } from "@nestjs/common"; import { Logger } from "@nestjs/common";
import { BaseTTSProvider } from "./base-tts.provider"; import { BaseTTSProvider } from "./base-tts.provider";
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
import { KokoroTtsProvider } from "./kokoro-tts.provider";
import type { ITTSProvider } from "../interfaces/tts-provider.interface"; import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
import type { SpeechConfig } from "../speech.config"; import type { SpeechConfig } from "../speech.config";
@@ -23,28 +25,6 @@ import type { SpeechConfig } from "../speech.config";
// Concrete provider classes // Concrete provider classes
// ========================================== // ==========================================
/**
* Kokoro TTS provider (default tier).
* CPU-based, always available, Apache 2.0 license.
*/
class KokoroProvider extends BaseTTSProvider {
readonly name = "kokoro";
readonly tier: SpeechTier = "default";
}
/**
* Chatterbox TTS provider (premium tier).
* GPU required, voice cloning capable, MIT license.
*/
class ChatterboxProvider extends BaseTTSProvider {
readonly name = "chatterbox";
readonly tier: SpeechTier = "premium";
constructor(baseURL: string) {
super(baseURL, "default", "mp3");
}
}
/** /**
* Piper TTS provider via OpenedAI Speech (fallback tier). * Piper TTS provider via OpenedAI Speech (fallback tier).
* Ultra-lightweight CPU, GPL license. * Ultra-lightweight CPU, GPL license.
@@ -78,7 +58,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
// Default tier: Kokoro // Default tier: Kokoro
if (config.tts.default.enabled) { if (config.tts.default.enabled) {
const provider = new KokoroProvider( const provider = new KokoroTtsProvider(
config.tts.default.url, config.tts.default.url,
config.tts.default.voice, config.tts.default.voice,
config.tts.default.format as AudioFormat config.tts.default.format as AudioFormat
@@ -89,7 +69,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
// Premium tier: Chatterbox // Premium tier: Chatterbox
if (config.tts.premium.enabled) { if (config.tts.premium.enabled) {
const provider = new ChatterboxProvider(config.tts.premium.url); const provider = new ChatterboxTTSProvider(config.tts.premium.url);
providers.set("premium", provider); providers.set("premium", provider);
logger.log(`Registered premium TTS provider: chatterbox at ${config.tts.premium.url}`); logger.log(`Registered premium TTS provider: chatterbox at ${config.tts.premium.url}`);
} }