feat: M13-SpeechServices — TTS & STT integration #409

Merged
jason.woltje merged 20 commits from feature/m13-speech-services into develop 2026-02-15 18:37:54 +00:00
3 changed files with 480 additions and 19 deletions
Showing only changes of commit 6c465566f6 - Show all commits

View File

@@ -0,0 +1,266 @@
/**
* PiperTtsProvider Unit Tests
*
* Tests the Piper TTS provider via OpenedAI Speech (fallback tier).
* Validates provider identity, OpenAI voice name mapping, voice listing,
* and ultra-lightweight CPU-only design characteristics.
*
* Issue #395
*/
import { describe, it, expect, vi, beforeEach } from "vitest";
import {
PiperTtsProvider,
PIPER_VOICE_MAP,
PIPER_SUPPORTED_FORMATS,
OPENAI_STANDARD_VOICES,
} from "./piper-tts.provider";
import type { VoiceInfo } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// Swap the real OpenAI SDK for a stub that exposes only `audio.speech.create`.
// Every stub instance gets its own vi.fn(), so call history never leaks
// between providers created in different tests.
vi.mock("openai", () => {
  class StubOpenAIClient {
    audio: { speech: { create: ReturnType<typeof vi.fn> } };

    constructor() {
      this.audio = { speech: { create: vi.fn() } };
    }
  }

  // The SDK is consumed through its default export, so the stub is exposed there.
  return { default: StubOpenAIClient };
});
// ==========================================
// Provider identity
// ==========================================
/**
 * Behavioural suite for PiperTtsProvider.
 *
 * Covers the provider's identity fields, its constructor defaults, and the
 * voice catalogue returned by listVoices(). A fresh provider is created
 * before every test so cases stay independent.
 */
describe("PiperTtsProvider", () => {
  const testBaseURL = "http://openedai-speech:8000/v1";
  let provider: PiperTtsProvider;

  beforeEach(() => {
    provider = new PiperTtsProvider(testBaseURL);
  });

  describe("provider identity", () => {
    it("should have name 'piper'", () => {
      expect(provider.name).toBe("piper");
    });

    it("should have tier 'fallback'", () => {
      expect(provider.tier).toBe("fallback");
    });
  });

  // ==========================================
  // Constructor
  // ==========================================
  describe("constructor", () => {
    /**
     * Ids of every voice the given provider flags as default.
     * listVoices() is the only place in this file where the configured
     * default voice becomes observable, so the constructor tests assert
     * through it instead of merely checking that construction succeeded.
     */
    const defaultVoiceIds = async (target: PiperTtsProvider): Promise<string[]> => {
      const listed = await target.listVoices();
      return listed.filter((v) => v.isDefault === true).map((v) => v.id);
    };

    it("should use 'alloy' as default voice", async () => {
      const newProvider = new PiperTtsProvider(testBaseURL);
      expect(newProvider).toBeDefined();
      expect(await defaultVoiceIds(newProvider)).toEqual(["alloy"]);
    });

    it("should accept a custom default voice", async () => {
      const customProvider = new PiperTtsProvider(testBaseURL, "nova");
      expect(customProvider).toBeDefined();
      expect(await defaultVoiceIds(customProvider)).toEqual(["nova"]);
    });

    it("should accept a custom default format", async () => {
      const customProvider = new PiperTtsProvider(testBaseURL, "alloy", "wav");
      expect(customProvider).toBeDefined();
      // NOTE(review): the stored format is not exposed by any API visible in
      // this file, so only the voice default is checked here to make sure the
      // extra argument does not disturb it.
      expect(await defaultVoiceIds(customProvider)).toEqual(["alloy"]);
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================
  describe("listVoices", () => {
    let voices: VoiceInfo[];

    beforeEach(async () => {
      voices = await provider.listVoices();
    });

    it("should return an array of VoiceInfo objects", () => {
      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);
    });

    it("should return exactly 6 voices (OpenAI standard set)", () => {
      expect(voices.length).toBe(6);
    });

    it("should set tier to 'fallback' on all voices", () => {
      for (const voice of voices) {
        expect(voice.tier).toBe("fallback");
      }
    });

    it("should have exactly one default voice", () => {
      const defaults = voices.filter((v) => v.isDefault === true);
      expect(defaults.length).toBe(1);
    });

    it("should mark 'alloy' as the default voice", () => {
      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("alloy");
    });

    it("should have an id and name for every voice", () => {
      for (const voice of voices) {
        expect(voice.id).toBeTruthy();
        expect(voice.name).toBeTruthy();
      }
    });

    it("should set language on every voice", () => {
      for (const voice of voices) {
        expect(voice.language).toBeTruthy();
      }
    });

    // ==========================================
    // All 6 OpenAI standard voices present
    // ==========================================
    describe("OpenAI standard voices", () => {
      // Spelled out here rather than imported so the test fails if the
      // production list loses an entry.
      const standardVoiceIds = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];

      it.each(standardVoiceIds)("should include voice '%s'", (voiceId) => {
        const voice = voices.find((v) => v.id === voiceId);
        expect(voice).toBeDefined();
      });
    });

    // ==========================================
    // Voice metadata
    // ==========================================
    describe("voice metadata", () => {
      it("should include gender info in voice names", () => {
        const alloy = voices.find((v) => v.id === "alloy");
        expect(alloy?.name).toMatch(/Female|Male/);
      });

      it("should map alloy to a female voice", () => {
        const alloy = voices.find((v) => v.id === "alloy");
        expect(alloy?.name).toContain("Female");
      });

      it("should map echo to a male voice", () => {
        const echo = voices.find((v) => v.id === "echo");
        // Case-sensitive on purpose: "Female" does not contain "Male".
        expect(echo?.name).toContain("Male");
      });

      it("should map fable to a British voice", () => {
        const fable = voices.find((v) => v.id === "fable");
        expect(fable?.language).toBe("en-GB");
      });

      it("should map onyx to a male voice", () => {
        const onyx = voices.find((v) => v.id === "onyx");
        expect(onyx?.name).toContain("Male");
      });

      it("should map nova to a female voice", () => {
        const nova = voices.find((v) => v.id === "nova");
        expect(nova?.name).toContain("Female");
      });

      it("should map shimmer to a female voice", () => {
        const shimmer = voices.find((v) => v.id === "shimmer");
        expect(shimmer?.name).toContain("Female");
      });
    });
  });
});
// ==========================================
// PIPER_VOICE_MAP
// ==========================================
// Structural checks on the exported OpenAI-name -> Piper-voice table:
// all six keys exist and every entry carries each metadata field.
describe("PIPER_VOICE_MAP", () => {
  const mappings = Object.values(PIPER_VOICE_MAP);

  it("should contain all 6 OpenAI standard voice names", () => {
    ["alloy", "echo", "fable", "onyx", "nova", "shimmer"].forEach((voiceName) => {
      expect(PIPER_VOICE_MAP).toHaveProperty(voiceName);
    });
  });

  it("should map each voice to a Piper voice ID", () => {
    mappings.forEach(({ piperVoice }) => {
      expect(piperVoice).toBeTruthy();
      expect(typeof piperVoice).toBe("string");
    });
  });

  it("should have gender for each voice entry", () => {
    mappings.forEach(({ gender }) => {
      expect(gender).toMatch(/^(female|male)$/);
    });
  });

  it("should have a language for each voice entry", () => {
    mappings.forEach(({ language }) => {
      expect(language).toBeTruthy();
    });
  });

  it("should have a description for each voice entry", () => {
    mappings.forEach(({ description }) => {
      expect(description).toBeTruthy();
    });
  });
});
// ==========================================
// OPENAI_STANDARD_VOICES
// ==========================================
// The exported voice list must be an array holding exactly the six
// standard OpenAI voice names.
describe("OPENAI_STANDARD_VOICES", () => {
  it("should be an array of 6 voice IDs", () => {
    expect(Array.isArray(OPENAI_STANDARD_VOICES)).toBe(true);
    expect(OPENAI_STANDARD_VOICES).toHaveLength(6);
  });

  it("should contain all standard OpenAI voice names", () => {
    const requiredNames = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];
    for (const voiceName of requiredNames) {
      expect(OPENAI_STANDARD_VOICES).toContain(voiceName);
    }
  });
});
// ==========================================
// PIPER_SUPPORTED_FORMATS
// ==========================================
// Membership checks for the exported format list. The parameterised case
// expands to the titles "should include mp3", "should include wav",
// "should include opus" and "should include flac".
describe("PIPER_SUPPORTED_FORMATS", () => {
  it.each(["mp3", "wav", "opus", "flac"])("should include %s", (formatName) => {
    expect(PIPER_SUPPORTED_FORMATS).toContain(formatName);
  });

  // `readonly` is erased at compile time, so only array-ness can be checked at runtime.
  it("should be a readonly array", () => {
    const isArray = Array.isArray(PIPER_SUPPORTED_FORMATS);
    expect(isArray).toBe(true);
  });
});

View File

@@ -0,0 +1,212 @@
/**
* Piper TTS Provider via OpenedAI Speech
*
* Fallback-tier TTS provider using Piper via OpenedAI Speech for
* ultra-lightweight CPU-only synthesis. Designed for low-resource
* environments including Raspberry Pi.
*
* Features:
* - OpenAI-compatible API via OpenedAI Speech server
* - 100+ Piper voices across 40+ languages
* - 6 standard OpenAI voice names mapped to Piper voices
* - Output formats: mp3, wav, opus, flac (the set listed in PIPER_SUPPORTED_FORMATS; aac and pcm are not listed there)
* - CPU-only, no GPU required
* - GPL license (via OpenedAI Speech)
*
* Voice names use the OpenAI standard set (alloy, echo, fable, onyx,
* nova, shimmer) which OpenedAI Speech maps to configured Piper voices.
*
* Issue #395
*/
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
// ==========================================
// Constants
// ==========================================
/**
 * Output formats listed for the Piper backend behind OpenedAI Speech.
 * Exported so callers and tests can check a requested format against it.
 */
export const PIPER_SUPPORTED_FORMATS: readonly AudioFormat[] =
  ["mp3", "wav", "opus", "flac"] as const;
/** Default value for the provider constructor's `defaultFormat` parameter. */
const PIPER_DEFAULT_FORMAT: AudioFormat = "mp3";
/** Default value for the provider constructor's `defaultVoice` parameter (an OpenAI voice name). */
const PIPER_DEFAULT_VOICE = "alloy";
// ==========================================
// OpenAI standard voice names
// ==========================================
/**
 * Voice names of the standard OpenAI TTS set, in the order the provider
 * lists them. Per the file header, OpenedAI Speech accepts these names and
 * routes each one to whichever Piper voice it has been configured with.
 */
export const OPENAI_STANDARD_VOICES: readonly string[] = [
  "alloy", "echo", "fable",
  "onyx", "nova", "shimmer",
] as const;
// ==========================================
// Voice mapping
// ==========================================
/**
 * Describes the Piper voice that stands behind one OpenAI voice name.
 * Entries of this shape populate PIPER_VOICE_MAP.
 */
export interface PiperVoiceMapping {
  /** BCP 47 language tag of the voice, e.g. "en-US" */
  language: string;
  /** Voice gender; shown as "Female"/"Male" in listed voice names */
  gender: "female" | "male";
  /** Short human-readable characterisation of how the voice sounds */
  description: string;
  /** ID of the underlying Piper voice as configured in OpenedAI Speech */
  piperVoice: string;
}
/**
 * Builds one voice-table entry so each voice below fits on a single line.
 * Properties are assigned in a fixed order (piperVoice, description, gender,
 * language) so every entry has the same shape.
 */
function piperMapping(
  piperVoice: string,
  description: string,
  gender: PiperVoiceMapping["gender"],
  language: string
): PiperVoiceMapping {
  return { piperVoice, description, gender, language };
}

/** Entry substituted when a voice ID has no key in PIPER_VOICE_MAP. */
const DEFAULT_MAPPING: PiperVoiceMapping = piperMapping(
  "en_US-amy-medium",
  "Default voice",
  "female",
  "en-US"
);

/**
 * OpenAI standard voice name -> metadata for the Piper voice behind it.
 *
 * Voice assignments recorded here:
 * - alloy:   en_US-amy-medium     (warm, balanced female)
 * - echo:    en_US-ryan-medium    (clear, articulate male)
 * - fable:   en_GB-alan-medium    (British male narrator)
 * - onyx:    en_US-danny-low      (deep, resonant male)
 * - nova:    en_US-lessac-medium  (expressive female)
 * - shimmer: en_US-kristin-medium (bright, energetic female)
 *
 * The table is descriptive metadata only; the Piper voice actually used is
 * decided by the OpenedAI Speech configuration file, which can be customised.
 * NOTE(review): presumed to mirror OpenedAI Speech's default Piper
 * assignments. Confirm against the voice configuration of the deployed server.
 */
export const PIPER_VOICE_MAP: Record<string, PiperVoiceMapping> = {
  alloy: piperMapping("en_US-amy-medium", "Warm, balanced voice", "female", "en-US"),
  echo: piperMapping("en_US-ryan-medium", "Clear, articulate voice", "male", "en-US"),
  fable: piperMapping("en_GB-alan-medium", "British narrator voice", "male", "en-GB"),
  onyx: piperMapping("en_US-danny-low", "Deep, resonant voice", "male", "en-US"),
  nova: piperMapping("en_US-lessac-medium", "Expressive, versatile voice", "female", "en-US"),
  shimmer: piperMapping("en_US-kristin-medium", "Bright, energetic voice", "female", "en-US"),
};
// ==========================================
// Provider class
// ==========================================
/**
 * Fallback-tier TTS provider that talks to Piper through the OpenedAI Speech
 * server's OpenAI-compatible API.
 *
 * Intended use, per the module header:
 * - CPU-only hosts (no GPU required), including low-resource boards such as
 *   a Raspberry Pi
 * - a fallback when the primary TTS engines are unavailable
 *
 * Voices are addressed by the 6 standard OpenAI names (alloy, echo, fable,
 * onyx, nova, shimmer), which OpenedAI Speech resolves to its configured
 * Piper voices. The module header also states that other Piper voices can be
 * reached by passing a Piper voice ID directly.
 *
 * @example
 * ```typescript
 * const piper = new PiperTtsProvider("http://openedai-speech:8000/v1");
 * const voices = await piper.listVoices();
 * const result = await piper.synthesize("Hello!", { voice: "alloy" });
 * ```
 */
export class PiperTtsProvider extends BaseTTSProvider {
  readonly name = "piper";
  readonly tier: SpeechTier = "fallback";

  /**
   * Forwards the endpoint and defaults to BaseTTSProvider unchanged.
   *
   * @param baseURL - OpenedAI Speech endpoint, e.g. "http://openedai-speech:8000/v1"
   * @param defaultVoice - OpenAI voice name used as the default ("alloy" if omitted)
   * @param defaultFormat - Audio format used as the default ("mp3" if omitted)
   */
  constructor(
    baseURL: string,
    defaultVoice: string = PIPER_DEFAULT_VOICE,
    defaultFormat: AudioFormat = PIPER_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * Describe every voice in OPENAI_STANDARD_VOICES.
   *
   * Each entry pairs the OpenAI voice name with the gender, description and
   * language recorded in PIPER_VOICE_MAP (DEFAULT_MAPPING when the name has
   * no entry). A voice is flagged as default when its id equals this
   * provider's `defaultVoice`.
   *
   * @returns Promise resolving to one VoiceInfo per standard voice, in list order
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const catalogue: VoiceInfo[] = [];

    for (const id of OPENAI_STANDARD_VOICES) {
      const { gender, description, language } = PIPER_VOICE_MAP[id] ?? DEFAULT_MAPPING;

      let genderText = "Male";
      if (gender === "female") {
        genderText = "Female";
      }

      // Capitalise the first letter only: "alloy" -> "Alloy".
      const displayName = `${id.charAt(0).toUpperCase()}${id.slice(1)}`;

      catalogue.push({
        id,
        name: `${displayName} (${genderText} - ${description})`,
        language,
        tier: this.tier,
        isDefault: id === this.defaultVoice,
      });
    }

    // Nothing is awaited here; the promise only satisfies the async signature.
    return Promise.resolve(catalogue);
  }
}

View File

@@ -14,30 +14,13 @@
*/
import { Logger } from "@nestjs/common";
import { BaseTTSProvider } from "./base-tts.provider";
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
import { KokoroTtsProvider } from "./kokoro-tts.provider";
import { PiperTtsProvider } from "./piper-tts.provider";
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
import type { SpeechConfig } from "../speech.config";
// ==========================================
// Concrete provider classes
// ==========================================
/**
* Piper TTS provider via OpenedAI Speech (fallback tier).
* Ultra-lightweight CPU, GPL license.
*
* Minimal in-file provider: it only fixes the provider name and tier and
* forwards hard-coded defaults to BaseTTSProvider.
*
* NOTE(review): this listing is a diff view with its +/- markers stripped.
* The factory code further down constructs both PiperProvider and
* PiperTtsProvider for the same variable, so this class looks like the side
* of the change being replaced by the dedicated PiperTtsProvider import.
* Confirm against the real file before editing.
*/
class PiperProvider extends BaseTTSProvider {
// Identifier and tier this provider reports
readonly name = "piper";
readonly tier: SpeechTier = "fallback";
/**
* @param baseURL - passed through unchanged as the first BaseTTSProvider constructor argument
*/
constructor(baseURL: string) {
// "alloy" and "mp3" presumably fill the base class's default-voice and
// default-format slots, matching the argument order PiperTtsProvider uses. TODO confirm
super(baseURL, "alloy", "mp3");
}
}
// ==========================================
// Factory function
// ==========================================
@@ -76,7 +59,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
// Fallback tier: Piper
if (config.tts.fallback.enabled) {
const provider = new PiperProvider(config.tts.fallback.url);
const provider = new PiperTtsProvider(config.tts.fallback.url);
providers.set("fallback", provider);
logger.log(`Registered fallback TTS provider: piper at ${config.tts.fallback.url}`);
}