fix(#388): address PR review findings — fix WebSocket/REST bugs, improve error handling, fix types and comments
Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch for NotAllowedError and DOMException handling
- Replace bare catch block with named error parameter and descriptive message
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,7 +19,7 @@ export interface TextToSpeechButtonProps {
|
||||
text: string;
|
||||
/** Optional voice ID to use */
|
||||
voice?: string;
|
||||
/** Optional tier (e.g. "standard", "premium") */
|
||||
/** Optional tier (e.g. "default", "premium", "fallback") */
|
||||
tier?: string;
|
||||
/** Optional className for the container */
|
||||
className?: string;
|
||||
|
||||
@@ -173,8 +173,17 @@ export function useTextToSpeech(): UseTextToSpeechReturn {
|
||||
const play = useCallback(async (): Promise<void> => {
|
||||
const audio = audioRef.current;
|
||||
if (audio) {
|
||||
await audio.play();
|
||||
setIsPlaying(true);
|
||||
try {
|
||||
await audio.play();
|
||||
setIsPlaying(true);
|
||||
} catch (err) {
|
||||
const message =
|
||||
err instanceof DOMException && err.name === "NotAllowedError"
|
||||
? "Playback was blocked by the browser. Try interacting with the page first."
|
||||
: "Unable to play audio. The format may not be supported.";
|
||||
setError(message);
|
||||
setIsPlaying(false);
|
||||
}
|
||||
}
|
||||
}, []);
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
/**
|
||||
* useVoiceInput hook
|
||||
*
|
||||
* Custom hook for microphone capture and real-time transcription.
|
||||
* Supports WebSocket streaming for real-time partial transcriptions
|
||||
* Custom hook for microphone capture and speech-to-text transcription.
|
||||
* Supports WebSocket streaming with batch transcription on stop,
|
||||
* with REST upload fallback when WebSocket is unavailable.
|
||||
*/
|
||||
|
||||
@@ -20,6 +20,8 @@ export interface UseVoiceInputOptions {
|
||||
useWebSocket?: boolean;
|
||||
/** Audio sample rate in Hz (default: 16000) */
|
||||
sampleRate?: number;
|
||||
/** Authentication token for WebSocket connection */
|
||||
token?: string;
|
||||
}
|
||||
|
||||
/** Return type for the useVoiceInput hook */
|
||||
@@ -75,14 +77,14 @@ function getAudioMimeType(): string {
|
||||
}
|
||||
|
||||
/**
|
||||
* Hook for microphone capture and real-time speech-to-text transcription.
|
||||
* Hook for microphone capture and speech-to-text transcription.
|
||||
*
|
||||
* Uses WebSocket streaming by default for real-time partial transcriptions.
|
||||
* Uses WebSocket streaming by default with batch transcription on stop.
|
||||
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
|
||||
* is disabled or unavailable.
|
||||
*/
|
||||
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
|
||||
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
|
||||
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;
|
||||
|
||||
const [isRecording, setIsRecording] = useState(false);
|
||||
const [transcript, setTranscript] = useState("");
|
||||
@@ -143,9 +145,12 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
||||
};
|
||||
|
||||
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
||||
} catch {
|
||||
} catch (err) {
|
||||
// Audio analysis is non-critical; continue without it
|
||||
console.warn("Audio analysis not available");
|
||||
console.warn(
|
||||
"Audio level visualization unavailable:",
|
||||
err instanceof Error ? err.message : String(err)
|
||||
);
|
||||
}
|
||||
}, []);
|
||||
|
||||
@@ -169,11 +174,14 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
||||
* Connect to the speech WebSocket namespace
|
||||
*/
|
||||
const connectSocket = useCallback((): Socket => {
|
||||
const socket = io(API_BASE_URL, {
|
||||
const socket = io(`${API_BASE_URL}/speech`, {
|
||||
path: "/socket.io",
|
||||
transports: ["websocket", "polling"],
|
||||
...(token ? { auth: { token } } : {}),
|
||||
});
|
||||
|
||||
// Future use: the gateway does not currently emit transcription-partial,
|
||||
// but the listener is registered for when real-time partial transcription is added.
|
||||
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
|
||||
setPartialTranscript(data.text);
|
||||
});
|
||||
@@ -188,9 +196,19 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
||||
setError(data.message);
|
||||
});
|
||||
|
||||
socket.on("connect_error", (err: Error) => {
|
||||
setError(`WebSocket connection failed: ${err.message}`);
|
||||
});
|
||||
|
||||
socket.on("disconnect", (reason: string) => {
|
||||
if (reason !== "io client disconnect") {
|
||||
setError(`WebSocket disconnected unexpectedly: ${reason}`);
|
||||
}
|
||||
});
|
||||
|
||||
socketRef.current = socket;
|
||||
return socket;
|
||||
}, []);
|
||||
}, [token]);
|
||||
|
||||
/**
|
||||
* Disconnect the WebSocket
|
||||
@@ -200,6 +218,8 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
||||
socketRef.current.off("transcription-partial");
|
||||
socketRef.current.off("transcription-final");
|
||||
socketRef.current.off("transcription-error");
|
||||
socketRef.current.off("connect_error");
|
||||
socketRef.current.off("disconnect");
|
||||
socketRef.current.disconnect();
|
||||
socketRef.current = null;
|
||||
}
|
||||
@@ -211,7 +231,7 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
||||
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
|
||||
try {
|
||||
const formData = new FormData();
|
||||
formData.append("audio", audioBlob, "recording.webm");
|
||||
formData.append("file", audioBlob, "recording.webm");
|
||||
|
||||
const response = await apiPostFormData<TranscribeResponse>(
|
||||
"/api/speech/transcribe",
|
||||
@@ -315,10 +335,16 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
||||
});
|
||||
|
||||
// Handle errors
|
||||
mediaRecorder.addEventListener("error", () => {
|
||||
setError("Recording encountered an issue. Please try again.");
|
||||
mediaRecorder.addEventListener("error", (event: Event) => {
|
||||
let errorMessage = "Recording encountered an issue. Please try again.";
|
||||
if ("error" in event && event.error instanceof DOMException) {
|
||||
errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
|
||||
}
|
||||
setError(errorMessage);
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
stopMediaTracks();
|
||||
cleanupAudioAnalysis();
|
||||
});
|
||||
|
||||
// Start recording with timeslice for streaming chunks (250ms intervals)
|
||||
|
||||
Reference in New Issue
Block a user