From af9c5799af36abbc7f54d366c98d2530eaf4fb75 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:44:33 -0600 Subject: [PATCH] =?UTF-8?q?fix(#388):=20address=20PR=20review=20findings?= =?UTF-8?q?=20=E2=80=94=20fix=20WebSocket/REST=20bugs,=20improve=20error?= =?UTF-8?q?=20handling,=20fix=20types=20and=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fixes: - Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor - Add /speech namespace to WebSocket connection URL - Pass auth token in WebSocket handshake options - Wrap audio.play() in try-catch for NotAllowedError and DOMException handling - Replace bare catch block with named error parameter and descriptive message - Add connect_error and disconnect event handlers to WebSocket - Update JSDoc to accurately describe batch transcription (not real-time partial) Important fixes: - Emit transcription-error before disconnect in gateway auth failures - Capture MediaRecorder error details and clean up media tracks on error - Change TtsDefaultConfig.format type from string to AudioFormat - Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth - Fix voice count from 54 to 53 in provider, AGENTS.md, and docs - Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe) Co-Authored-By: Claude Opus 4.6 --- apps/api/src/speech/AGENTS.md | 2 +- apps/api/src/speech/dto/synthesize.dto.ts | 30 +++-------- apps/api/src/speech/interfaces/index.ts | 1 + .../api/src/speech/interfaces/speech-types.ts | 10 ++-- .../interfaces/stt-provider.interface.ts | 2 +- .../speech/providers/kokoro-tts.provider.ts | 6 +-- .../speech/providers/piper-tts.provider.ts | 2 +- .../speech/providers/tts-provider.factory.ts | 4 +- apps/api/src/speech/speech.config.ts | 5 +- apps/api/src/speech/speech.gateway.ts | 15 ++++++ .../components/speech/TextToSpeechButton.tsx | 2 +- apps/web/src/hooks/useTextToSpeech.ts | 13 ++++- apps/web/src/hooks/useVoiceInput.ts | 50 ++++++++++++++----- docs/SPEECH.md | 2 +- 14 files changed, 91 insertions(+), 53 deletions(-) diff --git a/apps/api/src/speech/AGENTS.md b/apps/api/src/speech/AGENTS.md index 04b6d97..c3553b6 100644 --- a/apps/api/src/speech/AGENTS.md +++ b/apps/api/src/speech/AGENTS.md @@ -34,7 +34,7 @@ speech/ └── providers/ ├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic) ├── base-tts.provider.spec.ts - ├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages) + ├── kokoro-tts.provider.ts # Default tier (CPU, 53 voices, 8 languages) ├── kokoro-tts.provider.spec.ts ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control) ├── chatterbox-tts.provider.spec.ts diff --git a/apps/api/src/speech/dto/synthesize.dto.ts b/apps/api/src/speech/dto/synthesize.dto.ts index 171dc0e..4b2c1e7 100644 --- a/apps/api/src/speech/dto/synthesize.dto.ts +++ b/apps/api/src/speech/dto/synthesize.dto.ts @@ -2,7 +2,7 @@ * SynthesizeDto * * DTO for text-to-speech synthesis requests. - * The text field is validated by TextValidationPipe for length/emptiness. + * Text and option fields are validated by class-validator decorators. * Additional options control voice, speed, format, and tier selection. * * Issue #398 @@ -10,29 +10,13 @@ import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator"; import { Type } from "class-transformer"; +import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types"; import type { AudioFormat, SpeechTier } from "../interfaces/speech-types"; -/** - * Valid audio output formats for TTS synthesis. - */ -const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [ - "mp3", - "wav", - "opus", - "flac", - "aac", - "pcm", -] as const; - -/** - * Valid TTS tiers for provider selection. - */ -const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const; - export class SynthesizeDto { /** * Text to convert to speech. - * Validated separately by TextValidationPipe for length and emptiness. + * Validated by class-validator decorators for type and maximum length. */ @IsString({ message: "text must be a string" }) @MaxLength(4096, { message: "text must not exceed 4096 characters" }) @@ -66,8 +50,8 @@ export class SynthesizeDto { */ @IsOptional() @IsString({ message: "format must be a string" }) - @IsIn(VALID_AUDIO_FORMATS, { - message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`, + @IsIn(AUDIO_FORMATS, { + message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`, }) format?: AudioFormat; @@ -78,8 +62,8 @@ export class SynthesizeDto { */ @IsOptional() @IsString({ message: "tier must be a string" }) - @IsIn(VALID_SPEECH_TIERS, { - message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`, + @IsIn(SPEECH_TIERS, { + message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`, }) tier?: SpeechTier; } diff --git a/apps/api/src/speech/interfaces/index.ts b/apps/api/src/speech/interfaces/index.ts index ded8bd2..5674169 100644 --- a/apps/api/src/speech/interfaces/index.ts +++ b/apps/api/src/speech/interfaces/index.ts @@ -6,6 +6,7 @@ export type { ISTTProvider } from "./stt-provider.interface"; export type { ITTSProvider } from "./tts-provider.interface"; +export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types"; export type { SpeechTier, AudioFormat, diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts index c3b93c1..a472eae 100644 --- a/apps/api/src/speech/interfaces/speech-types.ts +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -12,19 +12,21 @@ // ========================================== /** - * TTS provider tier. + * Canonical array of TTS provider tiers. * Determines which TTS engine is used for synthesis. * * - default: Primary TTS engine (e.g., Kokoro) * - premium: Higher quality TTS engine (e.g., Chatterbox) * - fallback: Backup TTS engine (e.g., Piper/OpenedAI) */ -export type SpeechTier = "default" | "premium" | "fallback"; +export const SPEECH_TIERS = ["default", "premium", "fallback"] as const; +export type SpeechTier = (typeof SPEECH_TIERS)[number]; /** - * Audio output format for TTS synthesis. + * Canonical array of audio output formats for TTS synthesis. */ -export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm"; +export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const; +export type AudioFormat = (typeof AUDIO_FORMATS)[number]; // ========================================== // STT Types diff --git a/apps/api/src/speech/interfaces/stt-provider.interface.ts b/apps/api/src/speech/interfaces/stt-provider.interface.ts index 871fdd1..8f36ce2 100644 --- a/apps/api/src/speech/interfaces/stt-provider.interface.ts +++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts @@ -16,7 +16,7 @@ import type { TranscribeOptions, TranscriptionResult } from "./speech-types"; * * @example * ```typescript - * class SpeachesProvider implements ISTTProvider { + * class SpeachesSttProvider implements ISTTProvider { * readonly name = "speaches"; * * async transcribe(audio: Buffer, options?: TranscribeOptions): Promise { diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.ts b/apps/api/src/speech/providers/kokoro-tts.provider.ts index ac1b7d3..a7a0800 100644 --- a/apps/api/src/speech/providers/kokoro-tts.provider.ts +++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts @@ -5,7 +5,7 @@ * CPU-based, always available, Apache 2.0 license. * * Features: - * - 54 built-in voices across 8 languages + * - 53 built-in voices across 8 languages * - Speed control: 0.25x to 4.0x * - Output formats: mp3, wav, opus, flac * - Voice metadata derived from ID prefix (language, gender, accent) @@ -222,7 +222,7 @@ export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata { /** * Kokoro-FastAPI TTS provider (default tier). * - * CPU-based text-to-speech engine with 54 built-in voices across 8 languages. + * CPU-based text-to-speech engine with 53 built-in voices across 8 languages. * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI. * * @example @@ -254,7 +254,7 @@ export class KokoroTtsProvider extends BaseTTSProvider { /** * List all available Kokoro voices with metadata. * - * Returns the full catalog of 54 built-in voices with language, gender, + * Returns the full catalog of 53 built-in voices with language, gender, * and accent information derived from voice ID prefixes. * * @returns Array of VoiceInfo objects for all Kokoro voices diff --git a/apps/api/src/speech/providers/piper-tts.provider.ts b/apps/api/src/speech/providers/piper-tts.provider.ts index 40e4638..c86ffc4 100644 --- a/apps/api/src/speech/providers/piper-tts.provider.ts +++ b/apps/api/src/speech/providers/piper-tts.provider.ts @@ -9,7 +9,7 @@ * - OpenAI-compatible API via OpenedAI Speech server * - 100+ Piper voices across 40+ languages * - 6 standard OpenAI voice names mapped to Piper voices - * - Output formats: mp3, wav, opus, flac, aac, pcm + * - Output formats: mp3, wav, opus, flac * - CPU-only, no GPU required * - GPL license (via OpenedAI Speech) * diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts index 5a1f69f..21d7b32 100644 --- a/apps/api/src/speech/providers/tts-provider.factory.ts +++ b/apps/api/src/speech/providers/tts-provider.factory.ts @@ -18,7 +18,7 @@ import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; import { KokoroTtsProvider } from "./kokoro-tts.provider"; import { PiperTtsProvider } from "./piper-tts.provider"; import type { ITTSProvider } from "../interfaces/tts-provider.interface"; -import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; +import type { SpeechTier } from "../interfaces/speech-types"; import type { SpeechConfig } from "../speech.config"; // ========================================== @@ -44,7 +44,7 @@ export function createTTSProviders(config: SpeechConfig): Map { if (!authenticatedClient.data.userId) { this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`); + authenticatedClient.emit("transcription-error", { + message: "Authentication timed out.", + }); authenticatedClient.disconnect(); } }, this.CONNECTION_TIMEOUT_MS); @@ -109,6 +112,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { if (!token) { this.logger.warn(`Client ${authenticatedClient.id} connected without token`); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: no token provided.", + }); authenticatedClient.disconnect(); clearTimeout(timeoutId); return; @@ -118,6 +124,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { if (!sessionData) { this.logger.warn(`Client ${authenticatedClient.id} has invalid token`); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: invalid or expired token.", + }); authenticatedClient.disconnect(); clearTimeout(timeoutId); return; @@ -133,6 +142,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { if (!workspaceMembership) { this.logger.warn(`User ${userId} has no workspace access`); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: no workspace access.", + }); authenticatedClient.disconnect(); clearTimeout(timeoutId); return; @@ -151,6 +163,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { `Authentication failed for speech client ${authenticatedClient.id}:`, error instanceof Error ? error.message : "Unknown error" ); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: an unexpected error occurred.", + }); authenticatedClient.disconnect(); } } diff --git a/apps/web/src/components/speech/TextToSpeechButton.tsx b/apps/web/src/components/speech/TextToSpeechButton.tsx index a8f97f7..e208296 100644 --- a/apps/web/src/components/speech/TextToSpeechButton.tsx +++ b/apps/web/src/components/speech/TextToSpeechButton.tsx @@ -19,7 +19,7 @@ export interface TextToSpeechButtonProps { text: string; /** Optional voice ID to use */ voice?: string; - /** Optional tier (e.g. "standard", "premium") */ + /** Optional tier (e.g. "default", "premium", "fallback") */ tier?: string; /** Optional className for the container */ className?: string; diff --git a/apps/web/src/hooks/useTextToSpeech.ts b/apps/web/src/hooks/useTextToSpeech.ts index cc04cc4..c1152fa 100644 --- a/apps/web/src/hooks/useTextToSpeech.ts +++ b/apps/web/src/hooks/useTextToSpeech.ts @@ -173,8 +173,17 @@ export function useTextToSpeech(): UseTextToSpeechReturn { const play = useCallback(async (): Promise => { const audio = audioRef.current; if (audio) { - await audio.play(); - setIsPlaying(true); + try { + await audio.play(); + setIsPlaying(true); + } catch (err) { + const message = + err instanceof DOMException && err.name === "NotAllowedError" + ? "Playback was blocked by the browser. Try interacting with the page first." + : "Unable to play audio. The format may not be supported."; + setError(message); + setIsPlaying(false); + } } }, []); diff --git a/apps/web/src/hooks/useVoiceInput.ts b/apps/web/src/hooks/useVoiceInput.ts index 24e792d..46506a5 100644 --- a/apps/web/src/hooks/useVoiceInput.ts +++ b/apps/web/src/hooks/useVoiceInput.ts @@ -1,8 +1,8 @@ /** * useVoiceInput hook * - * Custom hook for microphone capture and real-time transcription. - * Supports WebSocket streaming for real-time partial transcriptions + * Custom hook for microphone capture and speech-to-text transcription. + * Supports WebSocket streaming with batch transcription on stop, * with REST upload fallback when WebSocket is unavailable. */ @@ -20,6 +20,8 @@ export interface UseVoiceInputOptions { useWebSocket?: boolean; /** Audio sample rate in Hz (default: 16000) */ sampleRate?: number; + /** Authentication token for WebSocket connection */ + token?: string; } /** Return type for the useVoiceInput hook */ @@ -75,14 +77,14 @@ function getAudioMimeType(): string { } /** - * Hook for microphone capture and real-time speech-to-text transcription. + * Hook for microphone capture and speech-to-text transcription. * - * Uses WebSocket streaming by default for real-time partial transcriptions. + * Uses WebSocket streaming by default with batch transcription on stop. * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket * is disabled or unavailable. */ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn { - const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options; + const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options; const [isRecording, setIsRecording] = useState(false); const [transcript, setTranscript] = useState(""); @@ -143,9 +145,12 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput }; animationFrameRef.current = requestAnimationFrame(updateLevel); - } catch { + } catch (err) { // Audio analysis is non-critical; continue without it - console.warn("Audio analysis not available"); + console.warn( + "Audio level visualization unavailable:", + err instanceof Error ? err.message : String(err) + ); } }, []); @@ -169,11 +174,14 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput * Connect to the speech WebSocket namespace */ const connectSocket = useCallback((): Socket => { - const socket = io(API_BASE_URL, { + const socket = io(`${API_BASE_URL}/speech`, { path: "/socket.io", transports: ["websocket", "polling"], + ...(token ? { auth: { token } } : {}), }); + // Future use: the gateway does not currently emit transcription-partial, + // but the listener is registered for when real-time partial transcription is added. socket.on("transcription-partial", (data: TranscriptionPartialPayload) => { setPartialTranscript(data.text); }); @@ -188,9 +196,19 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput setError(data.message); }); + socket.on("connect_error", (err: Error) => { + setError(`WebSocket connection failed: ${err.message}`); + }); + + socket.on("disconnect", (reason: string) => { + if (reason !== "io client disconnect") { + setError(`WebSocket disconnected unexpectedly: ${reason}`); + } + }); + socketRef.current = socket; return socket; - }, []); + }, [token]); /** * Disconnect the WebSocket @@ -200,6 +218,8 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput socketRef.current.off("transcription-partial"); socketRef.current.off("transcription-final"); socketRef.current.off("transcription-error"); + socketRef.current.off("connect_error"); + socketRef.current.off("disconnect"); socketRef.current.disconnect(); socketRef.current = null; } @@ -211,7 +231,7 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise => { try { const formData = new FormData(); - formData.append("audio", audioBlob, "recording.webm"); + formData.append("file", audioBlob, "recording.webm"); const response = await apiPostFormData( "/api/speech/transcribe", @@ -315,10 +335,16 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput }); // Handle errors - mediaRecorder.addEventListener("error", () => { - setError("Recording encountered an issue. Please try again."); + mediaRecorder.addEventListener("error", (event: Event) => { + let errorMessage = "Recording encountered an issue. Please try again."; + if ("error" in event && event.error instanceof DOMException) { + errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`; + } + setError(errorMessage); setIsRecording(false); isRecordingRef.current = false; + stopMediaTracks(); + cleanupAudioAnalysis(); }); // Start recording with timeslice for streaming chunks (250ms intervals) diff --git a/docs/SPEECH.md b/docs/SPEECH.md index 3ea7dd4..2f2b078 100644 --- a/docs/SPEECH.md +++ b/docs/SPEECH.md @@ -494,7 +494,7 @@ Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values de **Capabilities:** -- 54 built-in voices across 8 languages +- 53 built-in voices across 8 languages - Speed control: 0.25x to 4.0x - Output formats: mp3, wav, opus, flac - Voice metadata derived from ID prefix (language, gender, accent)