fix(#388): address PR review findings — fix WebSocket/REST bugs, improve error handling, fix types and comments
All checks were successful
ci/woodpecker/push/web Pipeline was successful
ci/woodpecker/push/api Pipeline was successful

Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch for NotAllowedError and DOMException handling
- Replace bare catch block with named error parameter and descriptive message
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Commit af9c5799af (parent dcbc8d1053), authored 2026-02-15 03:44:33 -06:00.
14 changed files with 91 additions and 53 deletions.

View File

@@ -34,7 +34,7 @@ speech/
└── providers/ └── providers/
├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic) ├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic)
├── base-tts.provider.spec.ts ├── base-tts.provider.spec.ts
├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages) ├── kokoro-tts.provider.ts # Default tier (CPU, 53 voices, 8 languages)
├── kokoro-tts.provider.spec.ts ├── kokoro-tts.provider.spec.ts
├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control) ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control)
├── chatterbox-tts.provider.spec.ts ├── chatterbox-tts.provider.spec.ts

View File

@@ -2,7 +2,7 @@
* SynthesizeDto * SynthesizeDto
* *
* DTO for text-to-speech synthesis requests. * DTO for text-to-speech synthesis requests.
* The text field is validated by TextValidationPipe for length/emptiness. * Text and option fields are validated by class-validator decorators.
* Additional options control voice, speed, format, and tier selection. * Additional options control voice, speed, format, and tier selection.
* *
* Issue #398 * Issue #398
@@ -10,29 +10,13 @@
import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator"; import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
import { Type } from "class-transformer"; import { Type } from "class-transformer";
import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
import type { AudioFormat, SpeechTier } from "../interfaces/speech-types"; import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";
/**
* Valid audio output formats for TTS synthesis.
*/
const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [
"mp3",
"wav",
"opus",
"flac",
"aac",
"pcm",
] as const;
/**
* Valid TTS tiers for provider selection.
*/
const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const;
export class SynthesizeDto { export class SynthesizeDto {
/** /**
* Text to convert to speech. * Text to convert to speech.
* Validated separately by TextValidationPipe for length and emptiness. * Validated by class-validator decorators for type and maximum length.
*/ */
@IsString({ message: "text must be a string" }) @IsString({ message: "text must be a string" })
@MaxLength(4096, { message: "text must not exceed 4096 characters" }) @MaxLength(4096, { message: "text must not exceed 4096 characters" })
@@ -66,8 +50,8 @@ export class SynthesizeDto {
*/ */
@IsOptional() @IsOptional()
@IsString({ message: "format must be a string" }) @IsString({ message: "format must be a string" })
@IsIn(VALID_AUDIO_FORMATS, { @IsIn(AUDIO_FORMATS, {
message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`, message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`,
}) })
format?: AudioFormat; format?: AudioFormat;
@@ -78,8 +62,8 @@ export class SynthesizeDto {
*/ */
@IsOptional() @IsOptional()
@IsString({ message: "tier must be a string" }) @IsString({ message: "tier must be a string" })
@IsIn(VALID_SPEECH_TIERS, { @IsIn(SPEECH_TIERS, {
message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`, message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`,
}) })
tier?: SpeechTier; tier?: SpeechTier;
} }

View File

@@ -6,6 +6,7 @@
export type { ISTTProvider } from "./stt-provider.interface"; export type { ISTTProvider } from "./stt-provider.interface";
export type { ITTSProvider } from "./tts-provider.interface"; export type { ITTSProvider } from "./tts-provider.interface";
export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types";
export type { export type {
SpeechTier, SpeechTier,
AudioFormat, AudioFormat,

View File

@@ -12,19 +12,21 @@
// ========================================== // ==========================================
/** /**
* TTS provider tier. * Canonical array of TTS provider tiers.
* Determines which TTS engine is used for synthesis. * Determines which TTS engine is used for synthesis.
* *
* - default: Primary TTS engine (e.g., Kokoro) * - default: Primary TTS engine (e.g., Kokoro)
* - premium: Higher quality TTS engine (e.g., Chatterbox) * - premium: Higher quality TTS engine (e.g., Chatterbox)
* - fallback: Backup TTS engine (e.g., Piper/OpenedAI) * - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
*/ */
export type SpeechTier = "default" | "premium" | "fallback"; export const SPEECH_TIERS = ["default", "premium", "fallback"] as const;
export type SpeechTier = (typeof SPEECH_TIERS)[number];
/** /**
* Audio output format for TTS synthesis. * Canonical array of audio output formats for TTS synthesis.
*/ */
export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm"; export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const;
export type AudioFormat = (typeof AUDIO_FORMATS)[number];
// ========================================== // ==========================================
// STT Types // STT Types

View File

@@ -16,7 +16,7 @@ import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
* *
* @example * @example
* ```typescript * ```typescript
* class SpeachesProvider implements ISTTProvider { * class SpeachesSttProvider implements ISTTProvider {
* readonly name = "speaches"; * readonly name = "speaches";
* *
* async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> { * async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {

View File

@@ -5,7 +5,7 @@
* CPU-based, always available, Apache 2.0 license. * CPU-based, always available, Apache 2.0 license.
* *
* Features: * Features:
* - 54 built-in voices across 8 languages * - 53 built-in voices across 8 languages
* - Speed control: 0.25x to 4.0x * - Speed control: 0.25x to 4.0x
* - Output formats: mp3, wav, opus, flac * - Output formats: mp3, wav, opus, flac
* - Voice metadata derived from ID prefix (language, gender, accent) * - Voice metadata derived from ID prefix (language, gender, accent)
@@ -222,7 +222,7 @@ export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
/** /**
* Kokoro-FastAPI TTS provider (default tier). * Kokoro-FastAPI TTS provider (default tier).
* *
* CPU-based text-to-speech engine with 54 built-in voices across 8 languages. * CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
* Uses the OpenAI-compatible API exposed by Kokoro-FastAPI. * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
* *
* @example * @example
@@ -254,7 +254,7 @@ export class KokoroTtsProvider extends BaseTTSProvider {
/** /**
* List all available Kokoro voices with metadata. * List all available Kokoro voices with metadata.
* *
* Returns the full catalog of 54 built-in voices with language, gender, * Returns the full catalog of 53 built-in voices with language, gender,
* and accent information derived from voice ID prefixes. * and accent information derived from voice ID prefixes.
* *
* @returns Array of VoiceInfo objects for all Kokoro voices * @returns Array of VoiceInfo objects for all Kokoro voices

View File

@@ -9,7 +9,7 @@
* - OpenAI-compatible API via OpenedAI Speech server * - OpenAI-compatible API via OpenedAI Speech server
* - 100+ Piper voices across 40+ languages * - 100+ Piper voices across 40+ languages
* - 6 standard OpenAI voice names mapped to Piper voices * - 6 standard OpenAI voice names mapped to Piper voices
* - Output formats: mp3, wav, opus, flac, aac, pcm * - Output formats: mp3, wav, opus, flac
* - CPU-only, no GPU required * - CPU-only, no GPU required
* - GPL license (via OpenedAI Speech) * - GPL license (via OpenedAI Speech)
* *

View File

@@ -18,7 +18,7 @@ import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
import { KokoroTtsProvider } from "./kokoro-tts.provider"; import { KokoroTtsProvider } from "./kokoro-tts.provider";
import { PiperTtsProvider } from "./piper-tts.provider"; import { PiperTtsProvider } from "./piper-tts.provider";
import type { ITTSProvider } from "../interfaces/tts-provider.interface"; import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; import type { SpeechTier } from "../interfaces/speech-types";
import type { SpeechConfig } from "../speech.config"; import type { SpeechConfig } from "../speech.config";
// ========================================== // ==========================================
@@ -44,7 +44,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
const provider = new KokoroTtsProvider( const provider = new KokoroTtsProvider(
config.tts.default.url, config.tts.default.url,
config.tts.default.voice, config.tts.default.voice,
config.tts.default.format as AudioFormat config.tts.default.format
); );
providers.set("default", provider); providers.set("default", provider);
logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`); logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`);

View File

@@ -12,6 +12,7 @@
*/ */
import { registerAs } from "@nestjs/config"; import { registerAs } from "@nestjs/config";
import type { AudioFormat } from "./interfaces/speech-types";
// ========================================== // ==========================================
// Default values // Default values
@@ -58,7 +59,7 @@ export interface TtsDefaultConfig {
enabled: boolean; enabled: boolean;
url: string; url: string;
voice: string; voice: string;
format: string; format: AudioFormat;
} }
export interface TtsPremiumConfig { export interface TtsPremiumConfig {
@@ -247,7 +248,7 @@ export function getSpeechConfig(): SpeechConfig {
enabled: isTtsEnabled(), enabled: isTtsEnabled(),
url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url, url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url,
voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice, voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice,
format: process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format, format: (process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format) as AudioFormat,
}, },
premium: { premium: {
enabled: isTtsPremiumEnabled(), enabled: isTtsPremiumEnabled(),

View File

@@ -100,6 +100,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
const timeoutId = setTimeout(() => { const timeoutId = setTimeout(() => {
if (!authenticatedClient.data.userId) { if (!authenticatedClient.data.userId) {
this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`); this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`);
authenticatedClient.emit("transcription-error", {
message: "Authentication timed out.",
});
authenticatedClient.disconnect(); authenticatedClient.disconnect();
} }
}, this.CONNECTION_TIMEOUT_MS); }, this.CONNECTION_TIMEOUT_MS);
@@ -109,6 +112,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
if (!token) { if (!token) {
this.logger.warn(`Client ${authenticatedClient.id} connected without token`); this.logger.warn(`Client ${authenticatedClient.id} connected without token`);
authenticatedClient.emit("transcription-error", {
message: "Authentication failed: no token provided.",
});
authenticatedClient.disconnect(); authenticatedClient.disconnect();
clearTimeout(timeoutId); clearTimeout(timeoutId);
return; return;
@@ -118,6 +124,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
if (!sessionData) { if (!sessionData) {
this.logger.warn(`Client ${authenticatedClient.id} has invalid token`); this.logger.warn(`Client ${authenticatedClient.id} has invalid token`);
authenticatedClient.emit("transcription-error", {
message: "Authentication failed: invalid or expired token.",
});
authenticatedClient.disconnect(); authenticatedClient.disconnect();
clearTimeout(timeoutId); clearTimeout(timeoutId);
return; return;
@@ -133,6 +142,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
if (!workspaceMembership) { if (!workspaceMembership) {
this.logger.warn(`User ${userId} has no workspace access`); this.logger.warn(`User ${userId} has no workspace access`);
authenticatedClient.emit("transcription-error", {
message: "Authentication failed: no workspace access.",
});
authenticatedClient.disconnect(); authenticatedClient.disconnect();
clearTimeout(timeoutId); clearTimeout(timeoutId);
return; return;
@@ -151,6 +163,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
`Authentication failed for speech client ${authenticatedClient.id}:`, `Authentication failed for speech client ${authenticatedClient.id}:`,
error instanceof Error ? error.message : "Unknown error" error instanceof Error ? error.message : "Unknown error"
); );
authenticatedClient.emit("transcription-error", {
message: "Authentication failed: an unexpected error occurred.",
});
authenticatedClient.disconnect(); authenticatedClient.disconnect();
} }
} }

View File

@@ -19,7 +19,7 @@ export interface TextToSpeechButtonProps {
text: string; text: string;
/** Optional voice ID to use */ /** Optional voice ID to use */
voice?: string; voice?: string;
/** Optional tier (e.g. "standard", "premium") */ /** Optional tier (e.g. "default", "premium", "fallback") */
tier?: string; tier?: string;
/** Optional className for the container */ /** Optional className for the container */
className?: string; className?: string;

View File

@@ -173,8 +173,17 @@ export function useTextToSpeech(): UseTextToSpeechReturn {
const play = useCallback(async (): Promise<void> => { const play = useCallback(async (): Promise<void> => {
const audio = audioRef.current; const audio = audioRef.current;
if (audio) { if (audio) {
await audio.play(); try {
setIsPlaying(true); await audio.play();
setIsPlaying(true);
} catch (err) {
const message =
err instanceof DOMException && err.name === "NotAllowedError"
? "Playback was blocked by the browser. Try interacting with the page first."
: "Unable to play audio. The format may not be supported.";
setError(message);
setIsPlaying(false);
}
} }
}, []); }, []);

View File

@@ -1,8 +1,8 @@
/** /**
* useVoiceInput hook * useVoiceInput hook
* *
* Custom hook for microphone capture and real-time transcription. * Custom hook for microphone capture and speech-to-text transcription.
* Supports WebSocket streaming for real-time partial transcriptions * Supports WebSocket streaming with batch transcription on stop,
* with REST upload fallback when WebSocket is unavailable. * with REST upload fallback when WebSocket is unavailable.
*/ */
@@ -20,6 +20,8 @@ export interface UseVoiceInputOptions {
useWebSocket?: boolean; useWebSocket?: boolean;
/** Audio sample rate in Hz (default: 16000) */ /** Audio sample rate in Hz (default: 16000) */
sampleRate?: number; sampleRate?: number;
/** Authentication token for WebSocket connection */
token?: string;
} }
/** Return type for the useVoiceInput hook */ /** Return type for the useVoiceInput hook */
@@ -75,14 +77,14 @@ function getAudioMimeType(): string {
} }
/** /**
* Hook for microphone capture and real-time speech-to-text transcription. * Hook for microphone capture and speech-to-text transcription.
* *
* Uses WebSocket streaming by default for real-time partial transcriptions. * Uses WebSocket streaming by default with batch transcription on stop.
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
* is disabled or unavailable. * is disabled or unavailable.
*/ */
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn { export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options; const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;
const [isRecording, setIsRecording] = useState(false); const [isRecording, setIsRecording] = useState(false);
const [transcript, setTranscript] = useState(""); const [transcript, setTranscript] = useState("");
@@ -143,9 +145,12 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
}; };
animationFrameRef.current = requestAnimationFrame(updateLevel); animationFrameRef.current = requestAnimationFrame(updateLevel);
} catch { } catch (err) {
// Audio analysis is non-critical; continue without it // Audio analysis is non-critical; continue without it
console.warn("Audio analysis not available"); console.warn(
"Audio level visualization unavailable:",
err instanceof Error ? err.message : String(err)
);
} }
}, []); }, []);
@@ -169,11 +174,14 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
* Connect to the speech WebSocket namespace * Connect to the speech WebSocket namespace
*/ */
const connectSocket = useCallback((): Socket => { const connectSocket = useCallback((): Socket => {
const socket = io(API_BASE_URL, { const socket = io(`${API_BASE_URL}/speech`, {
path: "/socket.io", path: "/socket.io",
transports: ["websocket", "polling"], transports: ["websocket", "polling"],
...(token ? { auth: { token } } : {}),
}); });
// Future use: the gateway does not currently emit transcription-partial,
// but the listener is registered for when real-time partial transcription is added.
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => { socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
setPartialTranscript(data.text); setPartialTranscript(data.text);
}); });
@@ -188,9 +196,19 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
setError(data.message); setError(data.message);
}); });
socket.on("connect_error", (err: Error) => {
setError(`WebSocket connection failed: ${err.message}`);
});
socket.on("disconnect", (reason: string) => {
if (reason !== "io client disconnect") {
setError(`WebSocket disconnected unexpectedly: ${reason}`);
}
});
socketRef.current = socket; socketRef.current = socket;
return socket; return socket;
}, []); }, [token]);
/** /**
* Disconnect the WebSocket * Disconnect the WebSocket
@@ -200,6 +218,8 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
socketRef.current.off("transcription-partial"); socketRef.current.off("transcription-partial");
socketRef.current.off("transcription-final"); socketRef.current.off("transcription-final");
socketRef.current.off("transcription-error"); socketRef.current.off("transcription-error");
socketRef.current.off("connect_error");
socketRef.current.off("disconnect");
socketRef.current.disconnect(); socketRef.current.disconnect();
socketRef.current = null; socketRef.current = null;
} }
@@ -211,7 +231,7 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => { const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
try { try {
const formData = new FormData(); const formData = new FormData();
formData.append("audio", audioBlob, "recording.webm"); formData.append("file", audioBlob, "recording.webm");
const response = await apiPostFormData<TranscribeResponse>( const response = await apiPostFormData<TranscribeResponse>(
"/api/speech/transcribe", "/api/speech/transcribe",
@@ -315,10 +335,16 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
}); });
// Handle errors // Handle errors
mediaRecorder.addEventListener("error", () => { mediaRecorder.addEventListener("error", (event: Event) => {
setError("Recording encountered an issue. Please try again."); let errorMessage = "Recording encountered an issue. Please try again.";
if ("error" in event && event.error instanceof DOMException) {
errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
}
setError(errorMessage);
setIsRecording(false); setIsRecording(false);
isRecordingRef.current = false; isRecordingRef.current = false;
stopMediaTracks();
cleanupAudioAnalysis();
}); });
// Start recording with timeslice for streaming chunks (250ms intervals) // Start recording with timeslice for streaming chunks (250ms intervals)

View File

@@ -494,7 +494,7 @@ Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values de
**Capabilities:** **Capabilities:**
- 54 built-in voices across 8 languages - 53 built-in voices across 8 languages
- Speed control: 0.25x to 4.0x - Speed control: 0.25x to 4.0x
- Output formats: mp3, wav, opus, flac - Output formats: mp3, wav, opus, flac
- Voice metadata derived from ID prefix (language, gender, accent) - Voice metadata derived from ID prefix (language, gender, accent)