From af9c5799af36abbc7f54d366c98d2530eaf4fb75 Mon Sep 17 00:00:00 2001
From: Jason Woltje <jason@diversecanvas.com>
Date: Sun, 15 Feb 2026 03:44:33 -0600
Subject: [PATCH] =?UTF-8?q?fix(#388):=20address=20PR=20review=20findings?=
 =?UTF-8?q?=20=E2=80=94=20fix=20WebSocket/REST=20bugs,=20improve=20error?=
 =?UTF-8?q?=20handling,=20fix=20types=20and=20comments?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch; report NotAllowedError and other playback failures via the hook's error state
- Replace bare catch block in useVoiceInput audio analysis setup with named error parameter and descriptive warning
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat (TTS_DEFAULT_FORMAT env value is cast, not validated)
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 apps/api/src/speech/AGENTS.md                 |  2 +-
 apps/api/src/speech/dto/synthesize.dto.ts     | 30 +++--------
 apps/api/src/speech/interfaces/index.ts       |  1 +
 .../api/src/speech/interfaces/speech-types.ts | 10 ++--
 .../interfaces/stt-provider.interface.ts      |  2 +-
 .../speech/providers/kokoro-tts.provider.ts   |  6 +--
 .../speech/providers/piper-tts.provider.ts    |  2 +-
 .../speech/providers/tts-provider.factory.ts  |  4 +-
 apps/api/src/speech/speech.config.ts          |  5 +-
 apps/api/src/speech/speech.gateway.ts         | 15 ++++++
 .../components/speech/TextToSpeechButton.tsx  |  2 +-
 apps/web/src/hooks/useTextToSpeech.ts         | 13 ++++-
 apps/web/src/hooks/useVoiceInput.ts           | 50 ++++++++++++++-----
 docs/SPEECH.md                                |  2 +-
 14 files changed, 91 insertions(+), 53 deletions(-)

diff --git a/apps/api/src/speech/AGENTS.md b/apps/api/src/speech/AGENTS.md
index 04b6d97..c3553b6 100644
--- a/apps/api/src/speech/AGENTS.md
+++ b/apps/api/src/speech/AGENTS.md
@@ -34,7 +34,7 @@ speech/
 └── providers/
     ├── base-tts.provider.ts       # Abstract base class (OpenAI SDK + common logic)
     ├── base-tts.provider.spec.ts
-    ├── kokoro-tts.provider.ts     # Default tier (CPU, 54 voices, 8 languages)
+    ├── kokoro-tts.provider.ts     # Default tier (CPU, 53 voices, 8 languages)
     ├── kokoro-tts.provider.spec.ts
     ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control)
     ├── chatterbox-tts.provider.spec.ts
diff --git a/apps/api/src/speech/dto/synthesize.dto.ts b/apps/api/src/speech/dto/synthesize.dto.ts
index 171dc0e..4b2c1e7 100644
--- a/apps/api/src/speech/dto/synthesize.dto.ts
+++ b/apps/api/src/speech/dto/synthesize.dto.ts
@@ -2,7 +2,7 @@
  * SynthesizeDto
  *
  * DTO for text-to-speech synthesis requests.
- * The text field is validated by TextValidationPipe for length/emptiness.
+ * Text and option fields are validated by class-validator decorators.
  * Additional options control voice, speed, format, and tier selection.
  *
  * Issue #398
@@ -10,29 +10,13 @@
 
 import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
 import { Type } from "class-transformer";
+import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
 import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";
 
-/**
- * Valid audio output formats for TTS synthesis.
- */
-const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [
-  "mp3",
-  "wav",
-  "opus",
-  "flac",
-  "aac",
-  "pcm",
-] as const;
-
-/**
- * Valid TTS tiers for provider selection.
- */
-const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const;
-
 export class SynthesizeDto {
   /**
    * Text to convert to speech.
-   * Validated separately by TextValidationPipe for length and emptiness.
+   * Validated by class-validator decorators for type and maximum length.
    */
   @IsString({ message: "text must be a string" })
   @MaxLength(4096, { message: "text must not exceed 4096 characters" })
@@ -66,8 +50,8 @@ export class SynthesizeDto {
    */
   @IsOptional()
   @IsString({ message: "format must be a string" })
-  @IsIn(VALID_AUDIO_FORMATS, {
-    message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`,
+  @IsIn(AUDIO_FORMATS, {
+    message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`,
   })
   format?: AudioFormat;
 
@@ -78,8 +62,8 @@ export class SynthesizeDto {
    */
   @IsOptional()
   @IsString({ message: "tier must be a string" })
-  @IsIn(VALID_SPEECH_TIERS, {
-    message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`,
+  @IsIn(SPEECH_TIERS, {
+    message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`,
   })
   tier?: SpeechTier;
 }
diff --git a/apps/api/src/speech/interfaces/index.ts b/apps/api/src/speech/interfaces/index.ts
index ded8bd2..5674169 100644
--- a/apps/api/src/speech/interfaces/index.ts
+++ b/apps/api/src/speech/interfaces/index.ts
@@ -6,6 +6,7 @@
 
 export type { ISTTProvider } from "./stt-provider.interface";
 export type { ITTSProvider } from "./tts-provider.interface";
+export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types";
 export type {
   SpeechTier,
   AudioFormat,
diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts
index c3b93c1..a472eae 100644
--- a/apps/api/src/speech/interfaces/speech-types.ts
+++ b/apps/api/src/speech/interfaces/speech-types.ts
@@ -12,19 +12,21 @@
 // ==========================================
 
 /**
- * TTS provider tier.
+ * Canonical array of TTS provider tiers.
  * Determines which TTS engine is used for synthesis.
  *
  * - default: Primary TTS engine (e.g., Kokoro)
  * - premium: Higher quality TTS engine (e.g., Chatterbox)
  * - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
  */
-export type SpeechTier = "default" | "premium" | "fallback";
+export const SPEECH_TIERS = ["default", "premium", "fallback"] as const;
+export type SpeechTier = (typeof SPEECH_TIERS)[number];
 
 /**
- * Audio output format for TTS synthesis.
+ * Canonical array of audio output formats for TTS synthesis.
  */
-export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm";
+export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const;
+export type AudioFormat = (typeof AUDIO_FORMATS)[number];
 
 // ==========================================
 // STT Types
diff --git a/apps/api/src/speech/interfaces/stt-provider.interface.ts b/apps/api/src/speech/interfaces/stt-provider.interface.ts
index 871fdd1..8f36ce2 100644
--- a/apps/api/src/speech/interfaces/stt-provider.interface.ts
+++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts
@@ -16,7 +16,7 @@ import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
  *
  * @example
  * ```typescript
- * class SpeachesProvider implements ISTTProvider {
+ * class SpeachesSttProvider implements ISTTProvider {
  *   readonly name = "speaches";
  *
  *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.ts b/apps/api/src/speech/providers/kokoro-tts.provider.ts
index ac1b7d3..a7a0800 100644
--- a/apps/api/src/speech/providers/kokoro-tts.provider.ts
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts
@@ -5,7 +5,7 @@
  * CPU-based, always available, Apache 2.0 license.
  *
  * Features:
- * - 54 built-in voices across 8 languages
+ * - 53 built-in voices across 8 languages
  * - Speed control: 0.25x to 4.0x
  * - Output formats: mp3, wav, opus, flac
  * - Voice metadata derived from ID prefix (language, gender, accent)
@@ -222,7 +222,7 @@ export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
 /**
  * Kokoro-FastAPI TTS provider (default tier).
  *
- * CPU-based text-to-speech engine with 54 built-in voices across 8 languages.
+ * CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
  * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
  *
  * @example
@@ -254,7 +254,7 @@ export class KokoroTtsProvider extends BaseTTSProvider {
   /**
    * List all available Kokoro voices with metadata.
    *
-   * Returns the full catalog of 54 built-in voices with language, gender,
+   * Returns the full catalog of 53 built-in voices with language, gender,
    * and accent information derived from voice ID prefixes.
    *
    * @returns Array of VoiceInfo objects for all Kokoro voices
diff --git a/apps/api/src/speech/providers/piper-tts.provider.ts b/apps/api/src/speech/providers/piper-tts.provider.ts
index 40e4638..c86ffc4 100644
--- a/apps/api/src/speech/providers/piper-tts.provider.ts
+++ b/apps/api/src/speech/providers/piper-tts.provider.ts
@@ -9,7 +9,7 @@
  * - OpenAI-compatible API via OpenedAI Speech server
  * - 100+ Piper voices across 40+ languages
  * - 6 standard OpenAI voice names mapped to Piper voices
- * - Output formats: mp3, wav, opus, flac, aac, pcm
+ * - Output formats: mp3, wav, opus, flac
  * - CPU-only, no GPU required
  * - GPL license (via OpenedAI Speech)
  *
diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts
index 5a1f69f..21d7b32 100644
--- a/apps/api/src/speech/providers/tts-provider.factory.ts
+++ b/apps/api/src/speech/providers/tts-provider.factory.ts
@@ -18,7 +18,7 @@ import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
 import { KokoroTtsProvider } from "./kokoro-tts.provider";
 import { PiperTtsProvider } from "./piper-tts.provider";
 import type { ITTSProvider } from "../interfaces/tts-provider.interface";
-import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
+import type { SpeechTier } from "../interfaces/speech-types";
 import type { SpeechConfig } from "../speech.config";
 
 // ==========================================
@@ -44,7 +44,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
     const provider = new KokoroTtsProvider(
       config.tts.default.url,
       config.tts.default.voice,
-      config.tts.default.format as AudioFormat
+      config.tts.default.format
     );
     providers.set("default", provider);
     logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`);
diff --git a/apps/api/src/speech/speech.config.ts b/apps/api/src/speech/speech.config.ts
index 48487de..4e57229 100644
--- a/apps/api/src/speech/speech.config.ts
+++ b/apps/api/src/speech/speech.config.ts
@@ -12,6 +12,7 @@
  */
 
 import { registerAs } from "@nestjs/config";
+import type { AudioFormat } from "./interfaces/speech-types";
 
 // ==========================================
 // Default values
@@ -58,7 +59,7 @@ export interface TtsDefaultConfig {
   enabled: boolean;
   url: string;
   voice: string;
-  format: string;
+  format: AudioFormat;
 }
 
 export interface TtsPremiumConfig {
@@ -247,7 +248,7 @@ export function getSpeechConfig(): SpeechConfig {
         enabled: isTtsEnabled(),
         url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url,
         voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice,
-        format: process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format,
+        format: (process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format) as AudioFormat,
       },
       premium: {
         enabled: isTtsPremiumEnabled(),
diff --git a/apps/api/src/speech/speech.gateway.ts b/apps/api/src/speech/speech.gateway.ts
index 907ec57..235ffcc 100644
--- a/apps/api/src/speech/speech.gateway.ts
+++ b/apps/api/src/speech/speech.gateway.ts
@@ -100,6 +100,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
     const timeoutId = setTimeout(() => {
       if (!authenticatedClient.data.userId) {
         this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication timed out.",
+        });
         authenticatedClient.disconnect();
       }
     }, this.CONNECTION_TIMEOUT_MS);
@@ -109,6 +112,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
 
       if (!token) {
         this.logger.warn(`Client ${authenticatedClient.id} connected without token`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication failed: no token provided.",
+        });
         authenticatedClient.disconnect();
         clearTimeout(timeoutId);
         return;
@@ -118,6 +124,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
 
       if (!sessionData) {
         this.logger.warn(`Client ${authenticatedClient.id} has invalid token`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication failed: invalid or expired token.",
+        });
         authenticatedClient.disconnect();
         clearTimeout(timeoutId);
         return;
@@ -133,6 +142,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
 
       if (!workspaceMembership) {
         this.logger.warn(`User ${userId} has no workspace access`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication failed: no workspace access.",
+        });
         authenticatedClient.disconnect();
         clearTimeout(timeoutId);
         return;
@@ -151,6 +163,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
         `Authentication failed for speech client ${authenticatedClient.id}:`,
         error instanceof Error ? error.message : "Unknown error"
       );
+      authenticatedClient.emit("transcription-error", {
+        message: "Authentication failed: an unexpected error occurred.",
+      });
       authenticatedClient.disconnect();
     }
   }
diff --git a/apps/web/src/components/speech/TextToSpeechButton.tsx b/apps/web/src/components/speech/TextToSpeechButton.tsx
index a8f97f7..e208296 100644
--- a/apps/web/src/components/speech/TextToSpeechButton.tsx
+++ b/apps/web/src/components/speech/TextToSpeechButton.tsx
@@ -19,7 +19,7 @@ export interface TextToSpeechButtonProps {
   text: string;
   /** Optional voice ID to use */
   voice?: string;
-  /** Optional tier (e.g. "standard", "premium") */
+  /** Optional tier (e.g. "default", "premium", "fallback") */
   tier?: string;
   /** Optional className for the container */
   className?: string;
diff --git a/apps/web/src/hooks/useTextToSpeech.ts b/apps/web/src/hooks/useTextToSpeech.ts
index cc04cc4..c1152fa 100644
--- a/apps/web/src/hooks/useTextToSpeech.ts
+++ b/apps/web/src/hooks/useTextToSpeech.ts
@@ -173,8 +173,17 @@ export function useTextToSpeech(): UseTextToSpeechReturn {
   const play = useCallback(async (): Promise<void> => {
     const audio = audioRef.current;
     if (audio) {
-      await audio.play();
-      setIsPlaying(true);
+      try {
+        await audio.play();
+        setIsPlaying(true);
+      } catch (err) {
+        const message =
+          err instanceof DOMException && err.name === "NotAllowedError"
+            ? "Playback was blocked by the browser. Try interacting with the page first."
+            : "Unable to play audio. The format may not be supported.";
+        setError(message);
+        setIsPlaying(false);
+      }
     }
   }, []);
 
diff --git a/apps/web/src/hooks/useVoiceInput.ts b/apps/web/src/hooks/useVoiceInput.ts
index 24e792d..46506a5 100644
--- a/apps/web/src/hooks/useVoiceInput.ts
+++ b/apps/web/src/hooks/useVoiceInput.ts
@@ -1,8 +1,8 @@
 /**
  * useVoiceInput hook
  *
- * Custom hook for microphone capture and real-time transcription.
- * Supports WebSocket streaming for real-time partial transcriptions
+ * Custom hook for microphone capture and speech-to-text transcription.
+ * Supports WebSocket streaming with batch transcription on stop,
  * with REST upload fallback when WebSocket is unavailable.
  */
 
@@ -20,6 +20,8 @@ export interface UseVoiceInputOptions {
   useWebSocket?: boolean;
   /** Audio sample rate in Hz (default: 16000) */
   sampleRate?: number;
+  /** Authentication token for WebSocket connection */
+  token?: string;
 }
 
 /** Return type for the useVoiceInput hook */
@@ -75,14 +77,14 @@ function getAudioMimeType(): string {
 }
 
 /**
- * Hook for microphone capture and real-time speech-to-text transcription.
+ * Hook for microphone capture and speech-to-text transcription.
  *
- * Uses WebSocket streaming by default for real-time partial transcriptions.
+ * Uses WebSocket streaming by default with batch transcription on stop.
  * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
  * is disabled or unavailable.
  */
 export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
-  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
+  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;
 
   const [isRecording, setIsRecording] = useState(false);
   const [transcript, setTranscript] = useState("");
@@ -143,9 +145,12 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
       };
 
       animationFrameRef.current = requestAnimationFrame(updateLevel);
-    } catch {
+    } catch (err) {
       // Audio analysis is non-critical; continue without it
-      console.warn("Audio analysis not available");
+      console.warn(
+        "Audio level visualization unavailable:",
+        err instanceof Error ? err.message : String(err)
+      );
     }
   }, []);
 
@@ -169,11 +174,14 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
    * Connect to the speech WebSocket namespace
    */
   const connectSocket = useCallback((): Socket => {
-    const socket = io(API_BASE_URL, {
+    const socket = io(`${API_BASE_URL}/speech`, {
       path: "/socket.io",
       transports: ["websocket", "polling"],
+      ...(token ? { auth: { token } } : {}),
     });
 
+    // Future use: the gateway does not currently emit transcription-partial,
+    // but the listener is registered for when real-time partial transcription is added.
     socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
       setPartialTranscript(data.text);
     });
@@ -188,9 +196,19 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
       setError(data.message);
     });
 
+    socket.on("connect_error", (err: Error) => {
+      setError(`WebSocket connection failed: ${err.message}`);
+    });
+
+    socket.on("disconnect", (reason: string) => {
+      if (reason !== "io client disconnect") {
+        setError(`WebSocket disconnected unexpectedly: ${reason}`);
+      }
+    });
+
     socketRef.current = socket;
     return socket;
-  }, []);
+  }, [token]);
 
   /**
    * Disconnect the WebSocket
@@ -200,6 +218,8 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
       socketRef.current.off("transcription-partial");
       socketRef.current.off("transcription-final");
       socketRef.current.off("transcription-error");
+      socketRef.current.off("connect_error");
+      socketRef.current.off("disconnect");
       socketRef.current.disconnect();
       socketRef.current = null;
     }
@@ -211,7 +231,7 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
   const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
     try {
       const formData = new FormData();
-      formData.append("audio", audioBlob, "recording.webm");
+      formData.append("file", audioBlob, "recording.webm");
 
       const response = await apiPostFormData<TranscribeResponse>(
         "/api/speech/transcribe",
@@ -315,10 +335,16 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
       });
 
       // Handle errors
-      mediaRecorder.addEventListener("error", () => {
-        setError("Recording encountered an issue. Please try again.");
+      mediaRecorder.addEventListener("error", (event: Event) => {
+        let errorMessage = "Recording encountered an issue. Please try again.";
+        if ("error" in event && event.error instanceof DOMException) {
+          errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
+        }
+        setError(errorMessage);
         setIsRecording(false);
         isRecordingRef.current = false;
+        stopMediaTracks();
+        cleanupAudioAnalysis();
       });
 
       // Start recording with timeslice for streaming chunks (250ms intervals)
diff --git a/docs/SPEECH.md b/docs/SPEECH.md
index 3ea7dd4..2f2b078 100644
--- a/docs/SPEECH.md
+++ b/docs/SPEECH.md
@@ -494,7 +494,7 @@ Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values de
 
 **Capabilities:**
 
-- 54 built-in voices across 8 languages
+- 53 built-in voices across 8 languages
 - Speed control: 0.25x to 4.0x
 - Output formats: mp3, wav, opus, flac
 - Voice metadata derived from ID prefix (language, gender, accent)