// diff --git a/apps/api/src/speech/providers/piper-tts.provider.spec.ts b/apps/api/src/speech/providers/piper-tts.provider.spec.ts
// new file mode 100644
// index 0000000..c0c1661
// --- /dev/null
// +++ b/apps/api/src/speech/providers/piper-tts.provider.spec.ts
// @@ -0,0 +1,266 @@
/**
 * PiperTtsProvider Unit Tests
 *
 * Tests the Piper TTS provider via OpenedAI Speech (fallback tier).
 * Validates provider identity, constructor defaults, the voice listing
 * produced by listVoices(), and the exported voice/format constants.
 *
 * Issue #395
 */

import { describe, it, expect, vi, beforeEach } from "vitest";
import {
  PiperTtsProvider,
  PIPER_VOICE_MAP,
  PIPER_SUPPORTED_FORMATS,
  OPENAI_STANDARD_VOICES,
} from "./piper-tts.provider";
import type { VoiceInfo } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

// Stub the SDK so that constructing a provider never builds a real client.
// NOTE(review): presumably BaseTTSProvider instantiates the OpenAI client in
// its constructor — confirm against base-tts.provider.ts.
vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: vi.fn(),
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Helpers
// ==========================================

/** Collect the ids of every voice the given provider flags as default. */
async function defaultVoiceIds(target: PiperTtsProvider): Promise<string[]> {
  const listed = await target.listVoices();
  return listed.filter((v) => v.isDefault === true).map((v) => v.id);
}

// ==========================================
// Provider identity
// ==========================================

describe("PiperTtsProvider", () => {
  const testBaseURL = "http://openedai-speech:8000/v1";
  let provider: PiperTtsProvider;

  beforeEach(() => {
    provider = new PiperTtsProvider(testBaseURL);
  });

  describe("provider identity", () => {
    it("should have name 'piper'", () => {
      expect(provider.name).toBe("piper");
    });

    it("should have tier 'fallback'", () => {
      expect(provider.tier).toBe("fallback");
    });
  });

  // ==========================================
  // Constructor
  // ==========================================

  describe("constructor", () => {
    // A bare `toBeDefined()` on a freshly constructed object can never fail,
    // so the default voice is observed through the `isDefault` flag instead.
    it("should use 'alloy' as default voice", async () => {
      const newProvider = new PiperTtsProvider(testBaseURL);
      expect(await defaultVoiceIds(newProvider)).toEqual(["alloy"]);
    });

    it("should accept a custom default voice", async () => {
      const customProvider = new PiperTtsProvider(testBaseURL, "nova");
      expect(await defaultVoiceIds(customProvider)).toEqual(["nova"]);
    });

    // The default format is not observable through listVoices(); this test
    // only pins that the three-argument form constructs without throwing.
    it("should accept a custom default format", () => {
      expect(() => new PiperTtsProvider(testBaseURL, "alloy", "wav")).not.toThrow();
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    let voices: VoiceInfo[];

    beforeEach(async () => {
      voices = await provider.listVoices();
    });

    it("should return an array of VoiceInfo objects", () => {
      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);
    });

    it("should return exactly 6 voices (OpenAI standard set)", () => {
      expect(voices.length).toBe(6);
    });

    it("should set tier to 'fallback' on all voices", () => {
      for (const voice of voices) {
        expect(voice.tier).toBe("fallback");
      }
    });

    it("should have exactly one default voice", () => {
      const defaults = voices.filter((v) => v.isDefault === true);
      expect(defaults.length).toBe(1);
    });

    it("should mark 'alloy' as the default voice", () => {
      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("alloy");
    });

    it("should have an id and name for every voice", () => {
      for (const voice of voices) {
        expect(voice.id).toBeTruthy();
        expect(voice.name).toBeTruthy();
      }
    });

    it("should set language on every voice", () => {
      for (const voice of voices) {
        expect(voice.language).toBeTruthy();
      }
    });

    // ==========================================
    // All 6 OpenAI standard voices present
    // ==========================================

    describe("OpenAI standard voices", () => {
      const standardVoiceIds = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];

      it.each(standardVoiceIds)("should include voice '%s'", (voiceId) => {
        const voice = voices.find((v) => v.id === voiceId);
        expect(voice).toBeDefined();
      });
    });

    // ==========================================
    // Voice metadata
    // ==========================================

    describe("voice metadata", () => {
      it("should include gender info in voice names", () => {
        const alloy = voices.find((v) => v.id === "alloy");
        expect(alloy?.name).toMatch(/Female|Male/);
      });

      it("should map alloy to a female voice", () => {
        const alloy = voices.find((v) => v.id === "alloy");
        expect(alloy?.name).toContain("Female");
      });

      it("should map echo to a male voice", () => {
        const echo = voices.find((v) => v.id === "echo");
        expect(echo?.name).toContain("Male");
      });

      it("should map fable to a British voice", () => {
        const fable = voices.find((v) => v.id === "fable");
        expect(fable?.language).toBe("en-GB");
      });

      it("should map onyx to a male voice", () => {
        const onyx = voices.find((v) => v.id === "onyx");
        expect(onyx?.name).toContain("Male");
      });

      it("should map nova to a female voice", () => {
        const nova = voices.find((v) => v.id === "nova");
        expect(nova?.name).toContain("Female");
      });

      it("should map shimmer to a female voice", () => {
        const shimmer = voices.find((v) => v.id === "shimmer");
        expect(shimmer?.name).toContain("Female");
      });
    });
  });
});

// ==========================================
// PIPER_VOICE_MAP
// ==========================================

describe("PIPER_VOICE_MAP", () => {
  it("should contain all 6 OpenAI standard voice names", () => {
    const expectedKeys = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];
    for (const key of expectedKeys) {
      expect(PIPER_VOICE_MAP).toHaveProperty(key);
    }
  });

  // listVoices() silently substitutes a fallback mapping for ids missing from
  // the map, so a missing entry would otherwise go unnoticed.
  it("should have an entry for every id in OPENAI_STANDARD_VOICES", () => {
    for (const voiceId of OPENAI_STANDARD_VOICES) {
      expect(PIPER_VOICE_MAP).toHaveProperty(voiceId);
    }
  });

  it("should map each voice to a Piper voice ID", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.piperVoice).toBeTruthy();
      expect(typeof entry.piperVoice).toBe("string");
    }
  });

  it("should have gender for each voice entry", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.gender).toMatch(/^(female|male)$/);
    }
  });

  it("should have a language for each voice entry", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.language).toBeTruthy();
    }
  });

  it("should have a description for each voice entry", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.description).toBeTruthy();
    }
  });
});

// ==========================================
// OPENAI_STANDARD_VOICES
// ==========================================

describe("OPENAI_STANDARD_VOICES", () => {
  it("should be an array of 6 voice IDs", () => {
    expect(Array.isArray(OPENAI_STANDARD_VOICES)).toBe(true);
    expect(OPENAI_STANDARD_VOICES.length).toBe(6);
  });

  it("should contain all standard OpenAI voice names", () => {
    expect(OPENAI_STANDARD_VOICES).toContain("alloy");
    expect(OPENAI_STANDARD_VOICES).toContain("echo");
    expect(OPENAI_STANDARD_VOICES).toContain("fable");
    expect(OPENAI_STANDARD_VOICES).toContain("onyx");
    expect(OPENAI_STANDARD_VOICES).toContain("nova");
    expect(OPENAI_STANDARD_VOICES).toContain("shimmer");
  });
});

// ==========================================
// PIPER_SUPPORTED_FORMATS
// ==========================================

describe("PIPER_SUPPORTED_FORMATS", () => {
  it("should include mp3", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("mp3");
  });

  it("should include wav", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("wav");
  });

  it("should include opus", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("opus");
  });

  it("should include flac", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("flac");
  });

  // `readonly` is a compile-time property only, so the runtime check is
  // limited to the value being an array.
  it("should be an array", () => {
    expect(Array.isArray(PIPER_SUPPORTED_FORMATS)).toBe(true);
  });
});

// diff --git a/apps/api/src/speech/providers/piper-tts.provider.ts b/apps/api/src/speech/providers/piper-tts.provider.ts
// new file mode 100644
// index
// 0000000..40e4638
// --- /dev/null
// +++ b/apps/api/src/speech/providers/piper-tts.provider.ts
// @@ -0,0 +1,212 @@
/**
 * Piper TTS Provider via OpenedAI Speech
 *
 * Fallback-tier TTS provider using Piper via OpenedAI Speech for
 * ultra-lightweight CPU-only synthesis. Designed for low-resource
 * environments including Raspberry Pi.
 *
 * Features:
 * - OpenAI-compatible API via OpenedAI Speech server
 * - 100+ Piper voices across 40+ languages
 * - 6 standard OpenAI voice names mapped to Piper voices
 * - Output formats exposed here: mp3, wav, opus, flac (see PIPER_SUPPORTED_FORMATS)
 * - CPU-only, no GPU required
 * - GPL license (via OpenedAI Speech)
 *
 * Voice names use the OpenAI standard set (alloy, echo, fable, onyx,
 * nova, shimmer) which OpenedAI Speech maps to configured Piper voices.
 *
 * Issue #395
 */

import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";

// ==========================================
// Constants
// ==========================================

/** Audio formats supported by OpenedAI Speech with Piper backend */
export const PIPER_SUPPORTED_FORMATS: readonly AudioFormat[] = [
  "mp3",
  "wav",
  "opus",
  "flac",
] as const;

/** Default voice for Piper (via OpenedAI Speech) */
const PIPER_DEFAULT_VOICE = "alloy";

/** Default audio format for Piper */
const PIPER_DEFAULT_FORMAT: AudioFormat = "mp3";

// ==========================================
// OpenAI standard voice names
// ==========================================

/**
 * The 6 standard OpenAI TTS voice names.
 * OpenedAI Speech accepts these names and routes them to configured Piper voices.
 */
export const OPENAI_STANDARD_VOICES: readonly string[] = [
  "alloy",
  "echo",
  "fable",
  "onyx",
  "nova",
  "shimmer",
] as const;

// ==========================================
// Voice mapping
// ==========================================

/** Metadata for a Piper voice mapped from an OpenAI voice name */
export interface PiperVoiceMapping {
  /** The underlying Piper voice ID configured in OpenedAI Speech */
  piperVoice: string;
  /** Human-readable description of the voice character */
  description: string;
  /** Gender of the voice */
  gender: "female" | "male";
  /** BCP 47 language code */
  language: string;
}

/** Fallback mapping used when a voice ID is not found in PIPER_VOICE_MAP */
const DEFAULT_MAPPING: PiperVoiceMapping = {
  piperVoice: "en_US-amy-medium",
  description: "Default voice",
  gender: "female",
  language: "en-US",
};

/**
 * Mapping of OpenAI standard voice names to their default Piper voice
 * configuration in OpenedAI Speech.
 *
 * These are the default mappings that OpenedAI Speech uses when configured
 * with Piper as the TTS backend. The actual Piper voice used can be
 * customized in the OpenedAI Speech configuration file.
 *
 * Default Piper voice assignments:
 * - alloy: en_US-amy-medium (warm, balanced female)
 * - echo: en_US-ryan-medium (clear, articulate male)
 * - fable: en_GB-alan-medium (British male narrator)
 * - onyx: en_US-danny-low (deep, resonant male)
 * - nova: en_US-lessac-medium (expressive female)
 * - shimmer: en_US-kristin-medium (bright, energetic female)
 *
 * Keyed by plain `string` so callers can look up arbitrary voice IDs; a
 * lookup for an unknown ID yields `undefined` and must be handled.
 */
export const PIPER_VOICE_MAP: Record<string, PiperVoiceMapping> = {
  alloy: {
    piperVoice: "en_US-amy-medium",
    description: "Warm, balanced voice",
    gender: "female",
    language: "en-US",
  },
  echo: {
    piperVoice: "en_US-ryan-medium",
    description: "Clear, articulate voice",
    gender: "male",
    language: "en-US",
  },
  fable: {
    piperVoice: "en_GB-alan-medium",
    description: "British narrator voice",
    gender: "male",
    language: "en-GB",
  },
  onyx: {
    piperVoice: "en_US-danny-low",
    description: "Deep, resonant voice",
    gender: "male",
    language: "en-US",
  },
  nova: {
    piperVoice: "en_US-lessac-medium",
    description: "Expressive, versatile voice",
    gender: "female",
    language: "en-US",
  },
  shimmer: {
    piperVoice: "en_US-kristin-medium",
    description: "Bright, energetic voice",
    gender: "female",
    language: "en-US",
  },
};

// ==========================================
// Provider class
// ==========================================

/**
 * Piper TTS provider via OpenedAI Speech (fallback tier).
 *
 * Ultra-lightweight CPU-only text-to-speech engine using Piper voices
 * through the OpenedAI Speech server's OpenAI-compatible API.
 *
 * Designed for:
 * - CPU-only environments (no GPU required)
 * - Low-resource devices (Raspberry Pi, ARM SBCs)
 * - Fallback when primary TTS engines are unavailable
 * - High-volume, low-latency synthesis needs
 *
 * The provider exposes the 6 standard OpenAI voice names (alloy, echo,
 * fable, onyx, nova, shimmer) which OpenedAI Speech maps to configured
 * Piper voices. Additional Piper voices (100+ across 40+ languages)
 * can be accessed by passing the Piper voice ID directly.
 *
 * @example
 * ```typescript
 * const piper = new PiperTtsProvider("http://openedai-speech:8000/v1");
 * const voices = await piper.listVoices();
 * const result = await piper.synthesize("Hello!", { voice: "alloy" });
 * ```
 */
export class PiperTtsProvider extends BaseTTSProvider {
  readonly name = "piper";
  readonly tier: SpeechTier = "fallback";

  /**
   * Create a new Piper TTS provider.
   *
   * @param baseURL - Base URL for the OpenedAI Speech endpoint (e.g. "http://openedai-speech:8000/v1")
   * @param defaultVoice - Default OpenAI voice name (defaults to "alloy")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = PIPER_DEFAULT_VOICE,
    defaultFormat: AudioFormat = PIPER_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * List available voices with OpenAI-to-Piper mapping metadata.
   *
   * Returns one entry per name in OPENAI_STANDARD_VOICES. Each entry's
   * display name has the form "<Name> (<Female|Male> - <description>)" and
   * its language and tier come from the mapping and this provider's tier.
   * The list is built from in-module constants only; no request is made.
   *
   * @returns Array of VoiceInfo objects for all mapped Piper voices
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const voices: VoiceInfo[] = OPENAI_STANDARD_VOICES.map((voiceId) => {
      // The map is keyed by arbitrary strings, so guard against a missing entry.
      const mapping = PIPER_VOICE_MAP[voiceId] ?? DEFAULT_MAPPING;
      const genderLabel = mapping.gender === "female" ? "Female" : "Male";
      // Capitalize the voice ID for display, e.g. "alloy" -> "Alloy".
      const label = voiceId.charAt(0).toUpperCase() + voiceId.slice(1);

      return {
        id: voiceId,
        name: `${label} (${genderLabel} - ${mapping.description})`,
        language: mapping.language,
        tier: this.tier,
        // NOTE(review): `defaultVoice` is inherited from BaseTTSProvider and is
        // presumably the value passed to the constructor — confirm in the base class.
        isDefault: voiceId === this.defaultVoice,
      };
    });

    // The data is static; wrap it to satisfy the Promise-returning signature.
    return Promise.resolve(voices);
  }
}

// ------------------------------------------------------------------
// The remainder of the original view is a patch hunk for
// tts-provider.factory.ts that is cut off mid-signature. It is kept
// verbatim below (line breaks restored) and is not otherwise modified.
// ------------------------------------------------------------------
// diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts
// index 28c807f..5a1f69f 100644
// --- a/apps/api/src/speech/providers/tts-provider.factory.ts
// +++ b/apps/api/src/speech/providers/tts-provider.factory.ts
// @@ -14,30 +14,13 @@
//   */
//
//  import { Logger } from "@nestjs/common";
// -import { BaseTTSProvider } from "./base-tts.provider";
//  import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
//  import { KokoroTtsProvider } from "./kokoro-tts.provider";
// +import { PiperTtsProvider } from "./piper-tts.provider";
//  import type { ITTSProvider } from "../interfaces/tts-provider.interface";
//  import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
//  import type { SpeechConfig } from "../speech.config";
//
// -// ==========================================
// -// Concrete provider classes
// -// ==========================================
// -
// -/**
// - * Piper TTS provider via OpenedAI Speech (fallback tier).
// - * Ultra-lightweight CPU, GPL license.
// - */
// -class PiperProvider extends BaseTTSProvider {
// -  readonly name = "piper";
// -  readonly tier: SpeechTier = "fallback";
// -
// -  constructor(baseURL: string) {
// -    super(baseURL, "alloy", "mp3");
// -  }
// -}
// -
//  // ==========================================
//  // Factory function
//  // ==========================================
// @@ -76,7 +59,7 @@ export function createTTSProviders(config: SpeechConfig): Map