feat: M13-SpeechServices — TTS & STT integration #409
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
@@ -0,0 +1,169 @@
|
||||
/**
|
||||
* Chatterbox TTS Provider
|
||||
*
|
||||
* Premium-tier TTS provider with voice cloning and emotion exaggeration support.
|
||||
* Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
|
||||
* parameters for voice cloning (reference_audio) and emotion control (exaggeration).
|
||||
*
|
||||
* Key capabilities:
|
||||
* - Voice cloning via reference audio sample
|
||||
* - Emotion exaggeration control (0.0 - 1.0)
|
||||
* - Cross-language voice transfer (23 languages)
|
||||
* - Graceful degradation when GPU is unavailable (isHealthy returns false)
|
||||
*
|
||||
* The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
|
||||
*
|
||||
* Issue #394
|
||||
*/
|
||||
|
||||
import type { SpeechCreateParams } from "openai/resources/audio/speech";
|
||||
import { BaseTTSProvider } from "./base-tts.provider";
|
||||
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
|
||||
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";
|
||||
|
||||
/** Voice requested from Chatterbox when the caller does not name one. */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Audio format used when the caller does not pick one (WAV for highest quality). */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Model identifier sent with every synthesis request. */
const DEFAULT_MODEL = "tts-1";

/** Speech speed multiplier used when the caller does not supply one. */
const DEFAULT_SPEED = 1.0;

/**
 * Languages Chatterbox supports for cross-language voice transfer,
 * keyed by language code with the English language name as the value.
 */
const CHATTERBOX_LANGUAGE_NAMES = {
  en: "English",
  fr: "French",
  de: "German",
  es: "Spanish",
  it: "Italian",
  pt: "Portuguese",
  nl: "Dutch",
  pl: "Polish",
  ru: "Russian",
  uk: "Ukrainian",
  ja: "Japanese",
  zh: "Chinese",
  ko: "Korean",
  ar: "Arabic",
  hi: "Hindi",
  tr: "Turkish",
  sv: "Swedish",
  da: "Danish",
  fi: "Finnish",
  no: "Norwegian",
  cs: "Czech",
  el: "Greek",
  ro: "Romanian",
} as const;

/**
 * Codes of the 23 supported languages, in the declaration order of
 * CHATTERBOX_LANGUAGE_NAMES.
 */
const SUPPORTED_LANGUAGES: readonly string[] = Object.keys(CHATTERBOX_LANGUAGE_NAMES);
|
||||
|
||||
/**
 * Chatterbox TTS provider (premium tier).
 *
 * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
 * Requests go through the OpenAI-compatible speech endpoint, with the
 * Chatterbox-specific values added as extra body parameters.
 *
 * @example
 * ```typescript
 * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
 *
 * // Basic synthesis
 * const result = await provider.synthesize("Hello!");
 *
 * // Voice cloning with emotion
 * const clonedResult = await provider.synthesize("Hello!", {
 *   referenceAudio: myAudioBuffer,
 *   emotionExaggeration: 0.7,
 * });
 * ```
 */
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  /**
   * @param baseURL - Base URL of the Chatterbox TTS server endpoint
   */
  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Chatterbox-specific options are forwarded as extra body parameters:
   * - `referenceAudio`: audio buffer to clone the voice from, sent base64-encoded
   *   as `reference_audio`. An empty buffer is ignored.
   * - `emotionExaggeration`: emotion intensity, clamped to [0.0, 1.0] and sent as
   *   `exaggeration`. `NaN` is ignored because it cannot be clamped and would be
   *   serialized as `null`.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with the audio buffer, format, voice and tier
   * @throws {Error} If the request or reading the response fails; the original
   *   message is logged and included in the thrown error
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Standard OpenAI-compatible parameters.
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Chatterbox-specific parameters are only added when usable values were given.
    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;

    const referenceAudio = chatterboxOptions?.referenceAudio;
    if (referenceAudio && referenceAudio.length > 0) {
      requestBody.reference_audio = referenceAudio.toString("base64");
    }

    const exaggeration = chatterboxOptions?.emotionExaggeration;
    if (exaggeration !== undefined && !Number.isNaN(exaggeration)) {
      // Clamp to valid range [0.0, 1.0]; +/-Infinity clamp to the nearest bound.
      requestBody.exaggeration = Math.max(0.0, Math.min(1.0, exaggeration));
    }

    try {
      // The body carries keys outside SpeechCreateParams, hence the cast.
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );

      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);

      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}
|
||||
316
apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
Normal file
316
apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
Normal file
@@ -0,0 +1,316 @@
|
||||
/**
|
||||
* KokoroTtsProvider Unit Tests
|
||||
*
|
||||
* Tests the Kokoro-FastAPI TTS provider with full voice catalog,
|
||||
* voice metadata parsing, and Kokoro-specific feature constants.
|
||||
*
|
||||
* Issue #393
|
||||
*/
|
||||
|
||||
import { describe, it, expect, vi, beforeEach } from "vitest";
|
||||
import {
|
||||
KokoroTtsProvider,
|
||||
KOKORO_SUPPORTED_FORMATS,
|
||||
KOKORO_SPEED_RANGE,
|
||||
KOKORO_VOICES,
|
||||
parseVoicePrefix,
|
||||
} from "./kokoro-tts.provider";
|
||||
import type { VoiceInfo } from "../interfaces/speech-types";
|
||||
|
||||
// ==========================================
|
||||
// Mock OpenAI SDK
|
||||
// ==========================================
|
||||
|
||||
// Replace the OpenAI SDK's default export with a stub whose
// audio.speech.create is a vitest mock function.
vi.mock("openai", () => ({
  default: class MockOpenAI {
    audio = { speech: { create: vi.fn() } };
  },
}));
|
||||
|
||||
// ==========================================
|
||||
// Provider identity
|
||||
// ==========================================
|
||||
|
||||
describe("KokoroTtsProvider", () => {
  const testBaseURL = "http://kokoro-tts:8880/v1";
  let provider: KokoroTtsProvider;

  beforeEach(() => {
    provider = new KokoroTtsProvider(testBaseURL);
  });

  /** IDs of the voices a provider flags as default in its catalog. */
  const defaultVoiceIds = async (target: KokoroTtsProvider): Promise<string[]> => {
    const catalog = await target.listVoices();
    return catalog.filter((v) => v.isDefault === true).map((v) => v.id);
  };

  describe("provider identity", () => {
    it("should have name 'kokoro'", () => {
      expect(provider.name).toBe("kokoro");
    });

    it("should have tier 'default'", () => {
      expect(provider.tier).toBe("default");
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    let voices: VoiceInfo[];

    beforeEach(async () => {
      voices = await provider.listVoices();
    });

    /** Look up a voice from the freshly listed catalog by ID. */
    const voiceById = (id: string): VoiceInfo | undefined => voices.find((v) => v.id === id);

    it("should return an array of VoiceInfo objects", () => {
      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);
    });

    it("should return at least 10 voices", () => {
      // Coarse sanity floor only; the 11 voices required by the issue are
      // checked one by one in the "required voices" block below.
      expect(voices.length).toBeGreaterThanOrEqual(10);
    });

    it("should set tier to 'default' on all voices", () => {
      for (const voice of voices) {
        expect(voice.tier).toBe("default");
      }
    });

    it("should have exactly one default voice", () => {
      const defaults = voices.filter((v) => v.isDefault === true);
      expect(defaults.length).toBe(1);
    });

    it("should mark af_heart as the default voice", () => {
      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("af_heart");
    });

    it("should have an id and name for every voice", () => {
      for (const voice of voices) {
        expect(voice.id).toBeTruthy();
        expect(voice.name).toBeTruthy();
      }
    });

    it("should set language on every voice", () => {
      for (const voice of voices) {
        expect(voice.language).toBeTruthy();
      }
    });

    // ==========================================
    // Required voices from the issue
    // ==========================================

    describe("required voices", () => {
      const requiredVoiceIds = [
        "af_heart",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
      ];

      it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => {
        expect(voiceById(voiceId)).toBeDefined();
      });
    });

    // ==========================================
    // Voice metadata from prefix
    // ==========================================

    describe("voice metadata from prefix", () => {
      // One representative voice per English prefix.
      const prefixCases = [
        { prefix: "af_", voiceId: "af_heart", language: "en-US", gender: "Female" },
        { prefix: "am_", voiceId: "am_adam", language: "en-US", gender: "Male" },
        { prefix: "bf_", voiceId: "bf_emma", language: "en-GB", gender: "Female" },
        { prefix: "bm_", voiceId: "bm_george", language: "en-GB", gender: "Male" },
      ];

      for (const { prefix, voiceId, language } of prefixCases) {
        it(`should set language to '${language}' for ${prefix} prefix voices`, () => {
          expect(voiceById(voiceId)?.language).toBe(language);
        });
      }

      for (const { prefix, voiceId, gender } of prefixCases) {
        it(`should include gender in voice name for ${prefix} prefix`, () => {
          expect(voiceById(voiceId)?.name).toContain(gender);
        });
      }
    });

    // ==========================================
    // Voice name formatting
    // ==========================================

    describe("voice name formatting", () => {
      it("should capitalize the voice name portion", () => {
        expect(voiceById("af_heart")?.name).toContain("Heart");
      });

      it("should include the accent/language label in the name", () => {
        expect(voiceById("af_heart")?.name).toContain("American");
        expect(voiceById("bf_emma")?.name).toContain("British");
      });
    });
  });

  // ==========================================
  // Custom constructor
  // ==========================================

  describe("constructor", () => {
    it("should accept custom default voice", async () => {
      const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella");
      // The configured default must be the only voice flagged as default.
      expect(await defaultVoiceIds(customProvider)).toEqual(["af_bella"]);
    });

    it("should accept custom default format", () => {
      // The default format is not observable through the API exercised in this
      // spec, so this only verifies that construction succeeds.
      expect(() => new KokoroTtsProvider(testBaseURL, "af_heart", "wav")).not.toThrow();
    });

    it("should use af_heart as default voice when none specified", async () => {
      const defaultProvider = new KokoroTtsProvider(testBaseURL);
      expect(await defaultVoiceIds(defaultProvider)).toEqual(["af_heart"]);
    });
  });
});
|
||||
|
||||
// ==========================================
|
||||
// parseVoicePrefix utility
|
||||
// ==========================================
|
||||
|
||||
describe("parseVoicePrefix", () => {
  // One sample voice ID per recognized English prefix, with the metadata
  // parseVoicePrefix is expected to report for it.
  const knownPrefixCases = [
    { prefix: "af_", voiceId: "af_heart", language: "en-US", accent: "American", gender: "Female" },
    { prefix: "am_", voiceId: "am_adam", language: "en-US", accent: "American", gender: "Male" },
    { prefix: "bf_", voiceId: "bf_emma", language: "en-GB", accent: "British", gender: "Female" },
    { prefix: "bm_", voiceId: "bm_george", language: "en-GB", accent: "British", gender: "Male" },
  ];

  for (const { prefix, voiceId, language, accent, gender } of knownPrefixCases) {
    it(`should parse ${prefix} as ${accent} English ${gender}`, () => {
      const parsed = parseVoicePrefix(voiceId);

      expect(parsed.language).toBe(language);
      // The parser reports gender in lower case.
      expect(parsed.gender).toBe(gender.toLowerCase());
      expect(parsed.accent).toBe(accent);
    });
  }

  it("should return unknown for unrecognized prefix", () => {
    const parsed = parseVoicePrefix("xx_unknown");

    expect(parsed.language).toBe("unknown");
    expect(parsed.gender).toBe("unknown");
    expect(parsed.accent).toBe("Unknown");
  });
});
|
||||
|
||||
// ==========================================
|
||||
// Exported constants
|
||||
// ==========================================
|
||||
|
||||
describe("KOKORO_SUPPORTED_FORMATS", () => {
  // Every format the provider advertises must be present.
  for (const format of ["mp3", "wav", "opus", "flac"]) {
    it(`should include ${format}`, () => {
      expect(KOKORO_SUPPORTED_FORMATS).toContain(format);
    });
  }

  it("should be a readonly array", () => {
    // Readonly-ness is compile-time only; at runtime this checks it is an array.
    expect(Array.isArray(KOKORO_SUPPORTED_FORMATS)).toBe(true);
  });
});

describe("KOKORO_SPEED_RANGE", () => {
  it("should have min speed of 0.25", () => {
    const { min } = KOKORO_SPEED_RANGE;
    expect(min).toBe(0.25);
  });

  it("should have max speed of 4.0", () => {
    const { max } = KOKORO_SPEED_RANGE;
    expect(max).toBe(4.0);
  });
});

describe("KOKORO_VOICES", () => {
  it("should be a non-empty array", () => {
    expect(Array.isArray(KOKORO_VOICES)).toBe(true);
    expect(KOKORO_VOICES.length).toBeGreaterThan(0);
  });

  it("should contain voice entries with id and label", () => {
    KOKORO_VOICES.forEach(({ id, label }) => {
      expect(id).toBeTruthy();
      expect(label).toBeTruthy();
    });
  });

  it("should include voices from multiple language prefixes", () => {
    // The first two characters of an ID form its prefix.
    const distinctPrefixes = new Set<string>();
    for (const { id } of KOKORO_VOICES) {
      distinctPrefixes.add(id.slice(0, 2));
    }
    expect(distinctPrefixes.size).toBeGreaterThanOrEqual(4);
  });
});
|
||||
278
apps/api/src/speech/providers/kokoro-tts.provider.ts
Normal file
278
apps/api/src/speech/providers/kokoro-tts.provider.ts
Normal file
@@ -0,0 +1,278 @@
|
||||
/**
|
||||
* Kokoro-FastAPI TTS Provider
|
||||
*
|
||||
* Default-tier TTS provider backed by Kokoro-FastAPI.
|
||||
* CPU-based, always available, Apache 2.0 license.
|
||||
*
|
||||
* Features:
|
||||
* - 53 built-in voices across 8 languages (see KOKORO_VOICES)
|
||||
* - Speed control: 0.25x to 4.0x
|
||||
* - Output formats: mp3, wav, opus, flac
|
||||
* - Voice metadata derived from ID prefix (language, gender, accent)
|
||||
*
|
||||
* Voice ID format: {prefix}_{name}
|
||||
* - First character: language/accent code (a=American, b=British, etc.)
|
||||
* - Second character: gender code (f=Female, m=Male)
|
||||
*
|
||||
* Issue #393
|
||||
*/
|
||||
|
||||
import { BaseTTSProvider } from "./base-tts.provider";
|
||||
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
|
||||
|
||||
// ==========================================
|
||||
// Constants
|
||||
// ==========================================
|
||||
|
||||
/** Output formats that can be requested from Kokoro-FastAPI. */
export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = ["mp3", "wav", "opus", "flac"];

/** Lowest and highest speed multipliers accepted by Kokoro-FastAPI. */
export const KOKORO_SPEED_RANGE = { min: 0.25, max: 4.0 } as const;

/** Voice used when the provider is constructed without an explicit default. */
const KOKORO_DEFAULT_VOICE = "af_heart";

/** Audio format used when the provider is constructed without an explicit default. */
const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3";
|
||||
|
||||
// ==========================================
|
||||
// Voice prefix mapping
|
||||
// ==========================================
|
||||
|
||||
/**
 * Accent codes used as the first character of a Kokoro voice ID, each with
 * its language code and human-readable accent label.
 */
const KOKORO_ACCENT_CODES: ReadonlyArray<readonly [string, string, string]> = [
  ["a", "en-US", "American"],
  ["b", "en-GB", "British"],
  ["e", "es", "Spanish"],
  ["f", "fr", "French"],
  ["h", "hi", "Hindi"],
  ["j", "ja", "Japanese"],
  ["p", "pt-BR", "Portuguese"],
  ["z", "zh", "Chinese"],
];

/**
 * Lookup from voice ID prefix (first two characters) to language, gender and
 * accent metadata.
 *
 * Kokoro voice IDs follow the pattern {lang}{gender}_{name}, where the second
 * character is the gender code (f=Female, m=Male). Each accent code therefore
 * yields a female entry followed by a male entry.
 */
const VOICE_PREFIX_MAP: Record<string, { language: string; gender: string; accent: string }> = {};

for (const [code, language, accent] of KOKORO_ACCENT_CODES) {
  VOICE_PREFIX_MAP[`${code}f`] = { language, gender: "female", accent };
  VOICE_PREFIX_MAP[`${code}m`] = { language, gender: "male", accent };
}
|
||||
|
||||
// ==========================================
|
||||
// Voice catalog
|
||||
// ==========================================
|
||||
|
||||
/** One entry of the raw voice catalog. */
interface KokoroVoiceEntry {
  /** Voice ID (e.g. "af_heart") */
  id: string;
  /** Human-readable label (e.g. "Heart") */
  label: string;
}

/**
 * Voice labels grouped by ID prefix, in catalog order.
 *
 * Each voice ID is the prefix, an underscore, and the lower-cased label
 * (e.g. prefix "af" + label "Heart" gives "af_heart").
 */
const KOKORO_VOICE_LABELS_BY_PREFIX: ReadonlyArray<readonly [string, readonly string[]]> = [
  // American English Female
  [
    "af",
    ["Heart", "Alloy", "Aoede", "Bella", "Jessica", "Kore", "Nicole", "Nova", "River", "Sarah", "Sky"],
  ],
  // American English Male
  ["am", ["Adam", "Echo", "Eric", "Fenrir", "Liam", "Michael", "Onyx", "Puck", "Santa"]],
  // British English Female
  ["bf", ["Alice", "Emma", "Isabella", "Lily"]],
  // British English Male
  ["bm", ["Daniel", "Fable", "George", "Lewis", "Oscar"]],
  // Spanish Female
  ["ef", ["Dora", "Elena", "Maria"]],
  // Spanish Male
  ["em", ["Alex", "Carlos", "Santa"]],
  // French Female
  ["ff", ["Camille", "Siwis"]],
  // Hindi Female
  ["hf", ["Alpha", "Beta"]],
  // Japanese Female
  ["jf", ["Alpha", "Gongitsune", "Nezumi", "Tebukuro"]],
  // Japanese Male
  ["jm", ["Kumo"]],
  // Portuguese Female
  ["pf", ["Dora"]],
  // Chinese Female
  ["zf", ["Xiaobei", "Xiaoni", "Xiaoxiao", "Xiaoyi"]],
  // Chinese Male
  ["zm", ["Yunjian", "Yunxi", "Yunxia", "Yunyang"]],
];

/**
 * Catalog of Kokoro built-in voices, flattened from
 * KOKORO_VOICE_LABELS_BY_PREFIX in declaration order.
 *
 * NOTE(review): this list has 53 entries; verify it against the voices
 * actually exposed by the deployed Kokoro-FastAPI server.
 */
export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = KOKORO_VOICE_LABELS_BY_PREFIX.flatMap(
  ([prefix, labels]) =>
    labels.map((label): KokoroVoiceEntry => ({ id: `${prefix}_${label.toLowerCase()}`, label }))
);
|
||||
|
||||
// ==========================================
|
||||
// Prefix parser
|
||||
// ==========================================
|
||||
|
||||
/** Metadata decoded from the prefix of a Kokoro voice ID. */
export interface VoicePrefixMetadata {
  /** BCP 47 language code (e.g. "en-US", "en-GB", "ja") */
  language: string;
  /** Gender: "female", "male", or "unknown" */
  gender: string;
  /** Human-readable accent label (e.g. "American", "British") */
  accent: string;
}

/**
 * Decode language, gender and accent from a Kokoro voice ID.
 *
 * Only the first two characters of the ID are inspected; they are looked up
 * in VOICE_PREFIX_MAP. IDs follow the pattern {lang}{gender}_{name}.
 *
 * @param voiceId - Kokoro voice ID (e.g. "af_heart")
 * @returns A fresh metadata object; all three fields read "unknown"/"Unknown"
 *   when the prefix is not in the map
 */
export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
  const known = VOICE_PREFIX_MAP[voiceId.slice(0, 2)];

  if (!known) {
    return { language: "unknown", gender: "unknown", accent: "Unknown" };
  }

  // Copy the fields so callers never hold a reference to the shared map entry.
  const { language, gender, accent } = known;
  return { language, gender, accent };
}
|
||||
|
||||
// ==========================================
|
||||
// Provider class
|
||||
// ==========================================
|
||||
|
||||
/**
 * Kokoro-FastAPI TTS provider (default tier).
 *
 * Exposes the built-in voice catalog (KOKORO_VOICES) and otherwise relies on
 * the behavior inherited from BaseTTSProvider.
 *
 * @example
 * ```typescript
 * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1");
 * const voices = await kokoro.listVoices();
 * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" });
 * ```
 */
export class KokoroTtsProvider extends BaseTTSProvider {
  readonly name = "kokoro";
  readonly tier: SpeechTier = "default";

  /**
   * Create a new Kokoro TTS provider.
   *
   * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1")
   * @param defaultVoice - Default voice ID (defaults to "af_heart")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = KOKORO_DEFAULT_VOICE,
    defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * List all catalog voices with metadata.
   *
   * Every entry of KOKORO_VOICES is returned with language, gender and accent
   * derived from its ID prefix. The display name has the form
   * "{label} ({accent} {gender})". The voice whose ID equals this provider's
   * default voice is flagged with `isDefault`.
   *
   * @returns Array of VoiceInfo objects, one per catalog entry
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const voices = KOKORO_VOICES.map((entry): VoiceInfo => {
      const { language, gender, accent } = parseVoicePrefix(entry.id);

      return {
        id: entry.id,
        name: `${entry.label} (${accent} ${KokoroTtsProvider.genderLabel(gender)})`,
        language,
        tier: this.tier,
        isDefault: entry.id === this.defaultVoice,
      };
    });

    // The catalog is static, so the promise resolves immediately.
    return Promise.resolve(voices);
  }

  /**
   * Display label for a parsed gender value.
   *
   * Anything other than "female" or "male" (e.g. the "unknown" returned for
   * an unrecognized prefix) is shown as "Unknown" rather than being reported
   * as male.
   */
  private static genderLabel(gender: string): string {
    switch (gender) {
      case "female":
        return "Female";
      case "male":
        return "Male";
      default:
        return "Unknown";
    }
  }
}
|
||||
@@ -15,6 +15,8 @@
|
||||
|
||||
import { Logger } from "@nestjs/common";
|
||||
import { BaseTTSProvider } from "./base-tts.provider";
|
||||
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
|
||||
import { KokoroTtsProvider } from "./kokoro-tts.provider";
|
||||
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
|
||||
import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
|
||||
import type { SpeechConfig } from "../speech.config";
|
||||
@@ -23,28 +25,6 @@ import type { SpeechConfig } from "../speech.config";
|
||||
// Concrete provider classes
|
||||
// ==========================================
|
||||
|
||||
/**
|
||||
* Kokoro TTS provider (default tier).
|
||||
* CPU-based, always available, Apache 2.0 license.
|
||||
*/
|
||||
class KokoroProvider extends BaseTTSProvider {
|
||||
readonly name = "kokoro";
|
||||
readonly tier: SpeechTier = "default";
|
||||
}
|
||||
|
||||
/**
|
||||
* Chatterbox TTS provider (premium tier).
|
||||
* GPU required, voice cloning capable, MIT license.
|
||||
*/
|
||||
class ChatterboxProvider extends BaseTTSProvider {
|
||||
readonly name = "chatterbox";
|
||||
readonly tier: SpeechTier = "premium";
|
||||
|
||||
constructor(baseURL: string) {
|
||||
super(baseURL, "default", "mp3");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Piper TTS provider via OpenedAI Speech (fallback tier).
|
||||
* Ultra-lightweight CPU, GPL license.
|
||||
@@ -78,7 +58,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
|
||||
|
||||
// Default tier: Kokoro
|
||||
if (config.tts.default.enabled) {
|
||||
const provider = new KokoroProvider(
|
||||
const provider = new KokoroTtsProvider(
|
||||
config.tts.default.url,
|
||||
config.tts.default.voice,
|
||||
config.tts.default.format as AudioFormat
|
||||
@@ -89,7 +69,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
|
||||
|
||||
// Premium tier: Chatterbox
|
||||
if (config.tts.premium.enabled) {
|
||||
const provider = new ChatterboxProvider(config.tts.premium.url);
|
||||
const provider = new ChatterboxTTSProvider(config.tts.premium.url);
|
||||
providers.set("premium", provider);
|
||||
logger.log(`Registered premium TTS provider: chatterbox at ${config.tts.premium.url}`);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user