From 6c465566f6e14f20ac5cb462e61fa5fee5b71804 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:39:20 -0600 Subject: [PATCH] feat(#395): implement Piper TTS provider via OpenedAI Speech Add fallback-tier TTS provider using Piper via OpenedAI Speech for ultra-lightweight CPU-only synthesis. Maps 6 standard OpenAI voice names (alloy, echo, fable, onyx, nova, shimmer) to Piper voices. Update factory to use the new PiperTtsProvider class, replacing the inline stub. Includes 37 unit tests covering provider identity, voice mapping, and voice listing. Fixes #395 Co-Authored-By: Claude Opus 4.6 --- .../providers/piper-tts.provider.spec.ts | 266 ++++++++++++++++++ .../speech/providers/piper-tts.provider.ts | 212 ++++++++++++++ .../speech/providers/tts-provider.factory.ts | 21 +- 3 files changed, 480 insertions(+), 19 deletions(-) create mode 100644 apps/api/src/speech/providers/piper-tts.provider.spec.ts create mode 100644 apps/api/src/speech/providers/piper-tts.provider.ts diff --git a/apps/api/src/speech/providers/piper-tts.provider.spec.ts b/apps/api/src/speech/providers/piper-tts.provider.spec.ts new file mode 100644 index 0000000..c0c1661 --- /dev/null +++ b/apps/api/src/speech/providers/piper-tts.provider.spec.ts @@ -0,0 +1,266 @@ +/** + * PiperTtsProvider Unit Tests + * + * Tests the Piper TTS provider via OpenedAI Speech (fallback tier). + * Validates provider identity, OpenAI voice name mapping, voice listing, + * and ultra-lightweight CPU-only design characteristics. 
+ * + * Issue #395 + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { + PiperTtsProvider, + PIPER_VOICE_MAP, + PIPER_SUPPORTED_FORMATS, + OPENAI_STANDARD_VOICES, +} from "./piper-tts.provider"; +import type { VoiceInfo } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: vi.fn(), + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Provider identity +// ========================================== + +describe("PiperTtsProvider", () => { + const testBaseURL = "http://openedai-speech:8000/v1"; + let provider: PiperTtsProvider; + + beforeEach(() => { + provider = new PiperTtsProvider(testBaseURL); + }); + + describe("provider identity", () => { + it("should have name 'piper'", () => { + expect(provider.name).toBe("piper"); + }); + + it("should have tier 'fallback'", () => { + expect(provider.tier).toBe("fallback"); + }); + }); + + // ========================================== + // Constructor + // ========================================== + + describe("constructor", () => { + it("should use 'alloy' as default voice", () => { + const newProvider = new PiperTtsProvider(testBaseURL); + expect(newProvider).toBeDefined(); + }); + + it("should accept a custom default voice", () => { + const customProvider = new PiperTtsProvider(testBaseURL, "nova"); + expect(customProvider).toBeDefined(); + }); + + it("should accept a custom default format", () => { + const customProvider = new PiperTtsProvider(testBaseURL, "alloy", "wav"); + expect(customProvider).toBeDefined(); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + + describe("listVoices", () => { + let voices: VoiceInfo[]; + + beforeEach(async () => { + voices = await 
provider.listVoices(); + }); + + it("should return an array of VoiceInfo objects", () => { + expect(voices).toBeInstanceOf(Array); + expect(voices.length).toBeGreaterThan(0); + }); + + it("should return exactly 6 voices (OpenAI standard set)", () => { + expect(voices.length).toBe(6); + }); + + it("should set tier to 'fallback' on all voices", () => { + for (const voice of voices) { + expect(voice.tier).toBe("fallback"); + } + }); + + it("should have exactly one default voice", () => { + const defaults = voices.filter((v) => v.isDefault === true); + expect(defaults.length).toBe(1); + }); + + it("should mark 'alloy' as the default voice", () => { + const defaultVoice = voices.find((v) => v.isDefault === true); + expect(defaultVoice).toBeDefined(); + expect(defaultVoice?.id).toBe("alloy"); + }); + + it("should have an id and name for every voice", () => { + for (const voice of voices) { + expect(voice.id).toBeTruthy(); + expect(voice.name).toBeTruthy(); + } + }); + + it("should set language on every voice", () => { + for (const voice of voices) { + expect(voice.language).toBeTruthy(); + } + }); + + // ========================================== + // All 6 OpenAI standard voices present + // ========================================== + + describe("OpenAI standard voices", () => { + const standardVoiceIds = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]; + + it.each(standardVoiceIds)("should include voice '%s'", (voiceId) => { + const voice = voices.find((v) => v.id === voiceId); + expect(voice).toBeDefined(); + }); + }); + + // ========================================== + // Voice metadata + // ========================================== + + describe("voice metadata", () => { + it("should include gender info in voice names", () => { + const alloy = voices.find((v) => v.id === "alloy"); + expect(alloy?.name).toMatch(/Female|Male/); + }); + + it("should map alloy to a female voice", () => { + const alloy = voices.find((v) => v.id === "alloy"); + 
expect(alloy?.name).toContain("Female"); + }); + + it("should map echo to a male voice", () => { + const echo = voices.find((v) => v.id === "echo"); + expect(echo?.name).toContain("Male"); + }); + + it("should map fable to a British voice", () => { + const fable = voices.find((v) => v.id === "fable"); + expect(fable?.language).toBe("en-GB"); + }); + + it("should map onyx to a male voice", () => { + const onyx = voices.find((v) => v.id === "onyx"); + expect(onyx?.name).toContain("Male"); + }); + + it("should map nova to a female voice", () => { + const nova = voices.find((v) => v.id === "nova"); + expect(nova?.name).toContain("Female"); + }); + + it("should map shimmer to a female voice", () => { + const shimmer = voices.find((v) => v.id === "shimmer"); + expect(shimmer?.name).toContain("Female"); + }); + }); + }); +}); + +// ========================================== +// PIPER_VOICE_MAP +// ========================================== + +describe("PIPER_VOICE_MAP", () => { + it("should contain all 6 OpenAI standard voice names", () => { + const expectedKeys = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]; + for (const key of expectedKeys) { + expect(PIPER_VOICE_MAP).toHaveProperty(key); + } + }); + + it("should map each voice to a Piper voice ID", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.piperVoice).toBeTruthy(); + expect(typeof entry.piperVoice).toBe("string"); + } + }); + + it("should have gender for each voice entry", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.gender).toMatch(/^(female|male)$/); + } + }); + + it("should have a language for each voice entry", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.language).toBeTruthy(); + } + }); + + it("should have a description for each voice entry", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.description).toBeTruthy(); + } + }); +}); + +// 
========================================== +// OPENAI_STANDARD_VOICES +// ========================================== + +describe("OPENAI_STANDARD_VOICES", () => { + it("should be an array of 6 voice IDs", () => { + expect(Array.isArray(OPENAI_STANDARD_VOICES)).toBe(true); + expect(OPENAI_STANDARD_VOICES.length).toBe(6); + }); + + it("should contain all standard OpenAI voice names", () => { + expect(OPENAI_STANDARD_VOICES).toContain("alloy"); + expect(OPENAI_STANDARD_VOICES).toContain("echo"); + expect(OPENAI_STANDARD_VOICES).toContain("fable"); + expect(OPENAI_STANDARD_VOICES).toContain("onyx"); + expect(OPENAI_STANDARD_VOICES).toContain("nova"); + expect(OPENAI_STANDARD_VOICES).toContain("shimmer"); + }); +}); + +// ========================================== +// PIPER_SUPPORTED_FORMATS +// ========================================== + +describe("PIPER_SUPPORTED_FORMATS", () => { + it("should include mp3", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("mp3"); + }); + + it("should include wav", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("wav"); + }); + + it("should include opus", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("opus"); + }); + + it("should include flac", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("flac"); + }); + + it("should be a readonly array", () => { + expect(Array.isArray(PIPER_SUPPORTED_FORMATS)).toBe(true); + }); +}); diff --git a/apps/api/src/speech/providers/piper-tts.provider.ts b/apps/api/src/speech/providers/piper-tts.provider.ts new file mode 100644 index 0000000..40e4638 --- /dev/null +++ b/apps/api/src/speech/providers/piper-tts.provider.ts @@ -0,0 +1,212 @@ +/** + * Piper TTS Provider via OpenedAI Speech + * + * Fallback-tier TTS provider using Piper via OpenedAI Speech for + * ultra-lightweight CPU-only synthesis. Designed for low-resource + * environments including Raspberry Pi. 
+ * + * Features: + * - OpenAI-compatible API via OpenedAI Speech server + * - 100+ Piper voices across 40+ languages + * - 6 standard OpenAI voice names mapped to Piper voices + * - Output formats: mp3, wav, opus, flac, aac, pcm + * - CPU-only, no GPU required + * - GPL license (via OpenedAI Speech) + * + * Voice names use the OpenAI standard set (alloy, echo, fable, onyx, + * nova, shimmer) which OpenedAI Speech maps to configured Piper voices. + * + * Issue #395 + */ + +import { BaseTTSProvider } from "./base-tts.provider"; +import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types"; + +// ========================================== +// Constants +// ========================================== + +/** Audio formats supported by OpenedAI Speech with Piper backend */ +export const PIPER_SUPPORTED_FORMATS: readonly AudioFormat[] = [ + "mp3", + "wav", + "opus", + "flac", +] as const; + +/** Default voice for Piper (via OpenedAI Speech) */ +const PIPER_DEFAULT_VOICE = "alloy"; + +/** Default audio format for Piper */ +const PIPER_DEFAULT_FORMAT: AudioFormat = "mp3"; + +// ========================================== +// OpenAI standard voice names +// ========================================== + +/** + * The 6 standard OpenAI TTS voice names. + * OpenedAI Speech accepts these names and routes them to configured Piper voices. 
+ */ +export const OPENAI_STANDARD_VOICES: readonly string[] = [ + "alloy", + "echo", + "fable", + "onyx", + "nova", + "shimmer", +] as const; + +// ========================================== +// Voice mapping +// ========================================== + +/** Metadata for a Piper voice mapped from an OpenAI voice name */ +export interface PiperVoiceMapping { + /** The underlying Piper voice ID configured in OpenedAI Speech */ + piperVoice: string; + /** Human-readable description of the voice character */ + description: string; + /** Gender of the voice */ + gender: "female" | "male"; + /** BCP 47 language code */ + language: string; +} + +/** Fallback mapping used when a voice ID is not found in PIPER_VOICE_MAP */ +const DEFAULT_MAPPING: PiperVoiceMapping = { + piperVoice: "en_US-amy-medium", + description: "Default voice", + gender: "female", + language: "en-US", +}; + +/** + * Mapping of OpenAI standard voice names to their default Piper voice + * configuration in OpenedAI Speech. + * + * These are the default mappings that OpenedAI Speech uses when configured + * with Piper as the TTS backend. The actual Piper voice used can be + * customized in the OpenedAI Speech configuration file. 
+ * + * Default Piper voice assignments: + * - alloy: en_US-amy-medium (warm, balanced female) + * - echo: en_US-ryan-medium (clear, articulate male) + * - fable: en_GB-alan-medium (British male narrator) + * - onyx: en_US-danny-low (deep, resonant male) + * - nova: en_US-lessac-medium (expressive female) + * - shimmer: en_US-kristin-medium (bright, energetic female) + */ +export const PIPER_VOICE_MAP: Record<string, PiperVoiceMapping> = { + alloy: { + piperVoice: "en_US-amy-medium", + description: "Warm, balanced voice", + gender: "female", + language: "en-US", + }, + echo: { + piperVoice: "en_US-ryan-medium", + description: "Clear, articulate voice", + gender: "male", + language: "en-US", + }, + fable: { + piperVoice: "en_GB-alan-medium", + description: "British narrator voice", + gender: "male", + language: "en-GB", + }, + onyx: { + piperVoice: "en_US-danny-low", + description: "Deep, resonant voice", + gender: "male", + language: "en-US", + }, + nova: { + piperVoice: "en_US-lessac-medium", + description: "Expressive, versatile voice", + gender: "female", + language: "en-US", + }, + shimmer: { + piperVoice: "en_US-kristin-medium", + description: "Bright, energetic voice", + gender: "female", + language: "en-US", + }, +}; + +// ========================================== +// Provider class +// ========================================== + +/** + * Piper TTS provider via OpenedAI Speech (fallback tier). + * + * Ultra-lightweight CPU-only text-to-speech engine using Piper voices + * through the OpenedAI Speech server's OpenAI-compatible API. + * + * Designed for: + * - CPU-only environments (no GPU required) + * - Low-resource devices (Raspberry Pi, ARM SBCs) + * - Fallback when primary TTS engines are unavailable + * - High-volume, low-latency synthesis needs + * + * The provider exposes the 6 standard OpenAI voice names (alloy, echo, + * fable, onyx, nova, shimmer) which OpenedAI Speech maps to configured + * Piper voices. 
Additional Piper voices (100+ across 40+ languages) + * can be accessed by passing the Piper voice ID directly. + * + * @example + * ```typescript + * const piper = new PiperTtsProvider("http://openedai-speech:8000/v1"); + * const voices = await piper.listVoices(); + * const result = await piper.synthesize("Hello!", { voice: "alloy" }); + * ``` + */ +export class PiperTtsProvider extends BaseTTSProvider { + readonly name = "piper"; + readonly tier: SpeechTier = "fallback"; + + /** + * Create a new Piper TTS provider. + * + * @param baseURL - Base URL for the OpenedAI Speech endpoint (e.g. "http://openedai-speech:8000/v1") + * @param defaultVoice - Default OpenAI voice name (defaults to "alloy") + * @param defaultFormat - Default audio format (defaults to "mp3") + */ + constructor( + baseURL: string, + defaultVoice: string = PIPER_DEFAULT_VOICE, + defaultFormat: AudioFormat = PIPER_DEFAULT_FORMAT + ) { + super(baseURL, defaultVoice, defaultFormat); + } + + /** + * List available voices with OpenAI-to-Piper mapping metadata. + * + * Returns the 6 standard OpenAI voice names with information about + * the underlying Piper voice, gender, and language. These are the + * voices that can be specified in the `voice` parameter of synthesize(). + * + * @returns Array of VoiceInfo objects for all mapped Piper voices + */ + override listVoices(): Promise<VoiceInfo[]> { + const voices: VoiceInfo[] = OPENAI_STANDARD_VOICES.map((voiceId) => { + const mapping = PIPER_VOICE_MAP[voiceId] ?? DEFAULT_MAPPING; + const genderLabel = mapping.gender === "female" ? 
"Female" : "Male"; + const label = voiceId.charAt(0).toUpperCase() + voiceId.slice(1); + + return { + id: voiceId, + name: `${label} (${genderLabel} - ${mapping.description})`, + language: mapping.language, + tier: this.tier, + isDefault: voiceId === this.defaultVoice, + }; + }); + + return Promise.resolve(voices); + } +} diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts index 28c807f..5a1f69f 100644 --- a/apps/api/src/speech/providers/tts-provider.factory.ts +++ b/apps/api/src/speech/providers/tts-provider.factory.ts @@ -14,30 +14,13 @@ */ import { Logger } from "@nestjs/common"; -import { BaseTTSProvider } from "./base-tts.provider"; import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; import { KokoroTtsProvider } from "./kokoro-tts.provider"; +import { PiperTtsProvider } from "./piper-tts.provider"; import type { ITTSProvider } from "../interfaces/tts-provider.interface"; import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; import type { SpeechConfig } from "../speech.config"; -// ========================================== -// Concrete provider classes -// ========================================== - -/** - * Piper TTS provider via OpenedAI Speech (fallback tier). - * Ultra-lightweight CPU, GPL license. - */ -class PiperProvider extends BaseTTSProvider { - readonly name = "piper"; - readonly tier: SpeechTier = "fallback"; - - constructor(baseURL: string) { - super(baseURL, "alloy", "mp3"); - } -} - // ========================================== // Factory function // ========================================== @@ -76,7 +59,7 @@ export function createTTSProviders(config: SpeechConfig): Map