Critical fixes: - Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor - Add /speech namespace to WebSocket connection URL - Pass auth token in WebSocket handshake options - Wrap audio.play() in try-catch for NotAllowedError and DOMException handling - Replace bare catch block with named error parameter and descriptive message - Add connect_error and disconnect event handlers to WebSocket - Update JSDoc to accurately describe batch transcription (not real-time partial) Important fixes: - Emit transcription-error before disconnect in gateway auth failures - Capture MediaRecorder error details and clean up media tracks on error - Change TtsDefaultConfig.format type from string to AudioFormat - Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth - Fix voice count from 54 to 53 in provider, AGENTS.md, and docs - Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
213 lines
6.5 KiB
TypeScript
213 lines
6.5 KiB
TypeScript
/**
 * Piper TTS Provider via OpenedAI Speech
 *
 * Fallback-tier TTS provider using Piper via OpenedAI Speech for
 * ultra-lightweight CPU-only synthesis. Designed for low-resource
 * environments including Raspberry Pi.
 *
 * Features:
 * - OpenAI-compatible API via OpenedAI Speech server
 * - 100+ Piper voices across 40+ languages
 * - 6 standard OpenAI voice names mapped to Piper voices
 * - Output formats: mp3, wav, opus, flac
 * - CPU-only, no GPU required
 * - GPL license (via OpenedAI Speech)
 *
 * Voice names use the OpenAI standard set (alloy, echo, fable, onyx,
 * nova, shimmer) which OpenedAI Speech maps to configured Piper voices.
 *
 * Issue #395
 */
|
|
|
|
import { BaseTTSProvider } from "./base-tts.provider";
|
|
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";
|
|
|
|
// ==========================================
// Constants
// ==========================================
|
|
|
|
/**
 * Audio formats supported by OpenedAI Speech with the Piper backend.
 *
 * Typed as a read-only array so the shared list cannot be mutated
 * through this binding.
 */
export const PIPER_SUPPORTED_FORMATS: readonly AudioFormat[] = [
  "mp3",
  "wav",
  "opus",
  "flac",
] as const;
|
|
|
|
/**
 * Default voice for Piper (via OpenedAI Speech).
 *
 * Used as the default value of the provider constructor's `defaultVoice`
 * parameter.
 */
const PIPER_DEFAULT_VOICE = "alloy";
|
|
|
|
/**
 * Default audio format for Piper.
 *
 * Used as the default value of the provider constructor's `defaultFormat`
 * parameter.
 */
const PIPER_DEFAULT_FORMAT: AudioFormat = "mp3";
|
|
|
|
// ==========================================
// OpenAI standard voice names
// ==========================================
|
|
|
|
/**
 * The 6 standard OpenAI TTS voice names.
 *
 * OpenedAI Speech accepts these names and routes them to configured Piper
 * voices. The order of this list is the order in which voices are reported
 * by the provider's `listVoices()`.
 */
export const OPENAI_STANDARD_VOICES: readonly string[] = [
  "alloy",
  "echo",
  "fable",
  "onyx",
  "nova",
  "shimmer",
] as const;
|
|
|
|
// ==========================================
// Voice mapping
// ==========================================
|
|
|
|
/** Metadata for a Piper voice mapped from an OpenAI voice name. */
export interface PiperVoiceMapping {
  /** The underlying Piper voice ID configured in OpenedAI Speech */
  piperVoice: string;

  /** Human-readable description of the voice character */
  description: string;

  /** Gender of the voice */
  gender: "female" | "male";

  /** BCP 47 language code */
  language: string;
}
|
|
|
|
/**
 * Fallback mapping used when a voice ID is not found in PIPER_VOICE_MAP.
 *
 * Keeps voice lookups total: a missing key yields this generic entry
 * instead of `undefined`.
 */
const DEFAULT_MAPPING: PiperVoiceMapping = {
  piperVoice: "en_US-amy-medium",
  description: "Default voice",
  gender: "female",
  language: "en-US",
};
|
|
|
|
/**
 * Mapping of OpenAI standard voice names to their default Piper voice
 * configuration in OpenedAI Speech.
 *
 * These are the default mappings that OpenedAI Speech uses when configured
 * with Piper as the TTS backend. The actual Piper voice used can be
 * customized in the OpenedAI Speech configuration file.
 *
 * Default Piper voice assignments:
 * - alloy: en_US-amy-medium (warm, balanced female)
 * - echo: en_US-ryan-medium (clear, articulate male)
 * - fable: en_GB-alan-medium (British male narrator)
 * - onyx: en_US-danny-low (deep, resonant male)
 * - nova: en_US-lessac-medium (expressive female)
 * - shimmer: en_US-kristin-medium (bright, energetic female)
 */
export const PIPER_VOICE_MAP: Record<string, PiperVoiceMapping> = {
  alloy: {
    piperVoice: "en_US-amy-medium",
    description: "Warm, balanced voice",
    gender: "female",
    language: "en-US",
  },
  echo: {
    piperVoice: "en_US-ryan-medium",
    description: "Clear, articulate voice",
    gender: "male",
    language: "en-US",
  },
  fable: {
    piperVoice: "en_GB-alan-medium",
    description: "British narrator voice",
    gender: "male",
    language: "en-GB",
  },
  onyx: {
    piperVoice: "en_US-danny-low",
    description: "Deep, resonant voice",
    gender: "male",
    language: "en-US",
  },
  nova: {
    piperVoice: "en_US-lessac-medium",
    description: "Expressive, versatile voice",
    gender: "female",
    language: "en-US",
  },
  shimmer: {
    piperVoice: "en_US-kristin-medium",
    description: "Bright, energetic voice",
    gender: "female",
    language: "en-US",
  },
};
|
|
|
|
// ==========================================
// Provider class
// ==========================================
|
|
|
|
/**
 * Piper TTS provider via OpenedAI Speech (fallback tier).
 *
 * Ultra-lightweight CPU-only text-to-speech engine using Piper voices
 * through the OpenedAI Speech server's OpenAI-compatible API.
 *
 * Designed for:
 * - CPU-only environments (no GPU required)
 * - Low-resource devices (Raspberry Pi, ARM SBCs)
 * - Fallback when primary TTS engines are unavailable
 * - High-volume, low-latency synthesis needs
 *
 * The provider exposes the 6 standard OpenAI voice names (alloy, echo,
 * fable, onyx, nova, shimmer) which OpenedAI Speech maps to configured
 * Piper voices. Additional Piper voices (100+ across 40+ languages)
 * can be accessed by passing the Piper voice ID directly.
 *
 * @example
 * ```typescript
 * const piper = new PiperTtsProvider("http://openedai-speech:8000/v1");
 * const voices = await piper.listVoices();
 * const result = await piper.synthesize("Hello!", { voice: "alloy" });
 * ```
 */
export class PiperTtsProvider extends BaseTTSProvider {
  /** Provider identifier. */
  readonly name = "piper";

  /** Speech tier served by this provider; also stamped on every listed voice. */
  readonly tier: SpeechTier = "fallback";

  /**
   * Create a new Piper TTS provider.
   *
   * All arguments are forwarded unchanged to the base provider constructor.
   *
   * @param baseURL - Base URL for the OpenedAI Speech endpoint (e.g. "http://openedai-speech:8000/v1")
   * @param defaultVoice - Default OpenAI voice name (defaults to "alloy")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = PIPER_DEFAULT_VOICE,
    defaultFormat: AudioFormat = PIPER_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * List available voices with OpenAI-to-Piper mapping metadata.
   *
   * Returns the 6 standard OpenAI voice names with information about
   * the underlying Piper voice, gender, and language. These are the
   * voices that can be specified in the `voice` parameter of synthesize().
   *
   * The list is built from the static OPENAI_STANDARD_VOICES and
   * PIPER_VOICE_MAP tables; no request is made to the speech server.
   *
   * @returns Promise resolving to one VoiceInfo per standard OpenAI voice
   *          name, in OPENAI_STANDARD_VOICES order
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const voices: VoiceInfo[] = OPENAI_STANDARD_VOICES.map((voiceId) => {
      // Every standard voice currently has a map entry; the fallback keeps
      // the lookup total if the two tables ever drift apart.
      const mapping = PIPER_VOICE_MAP[voiceId] ?? DEFAULT_MAPPING;
      const genderLabel = mapping.gender === "female" ? "Female" : "Male";
      // Capitalize the voice ID for display, e.g. "alloy" -> "Alloy".
      const label = voiceId.charAt(0).toUpperCase() + voiceId.slice(1);

      return {
        id: voiceId,
        name: `${label} (${genderLabel} - ${mapping.description})`,
        language: mapping.language,
        tier: this.tier,
        isDefault: voiceId === this.defaultVoice,
      };
    });

    // The result is computed synchronously; wrap it to satisfy the
    // Promise-returning signature.
    return Promise.resolve(voices);
  }
}
|