/**
 * useVoiceInput hook
 *
 * Custom hook for microphone capture and speech-to-text transcription.
 * Supports WebSocket streaming with batch transcription on stop,
 * with REST upload fallback when WebSocket is unavailable.
 */

import { useState, useCallback, useRef, useEffect } from "react";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";
import { API_BASE_URL } from "@/lib/config";
import { apiPostFormData } from "@/lib/api/client";

/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired when final transcription is received */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true) */
  useWebSocket?: boolean;
  /** Audio sample rate in Hz (default: 16000) */
  sampleRate?: number;
  /** Authentication token for WebSocket connection */
  token?: string;
}

/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text (updates in real-time) */
  partialTranscript: string;
  /** Error message if something went wrong */
  error: string | null;
  /** Current audio input level (0-1) */
  audioLevel: number;
}

interface TranscriptionPartialPayload {
  text: string;
}

interface TranscriptionFinalPayload {
  text: string;
}

interface TranscriptionErrorPayload {
  message: string;
}

interface TranscribeResponse {
  data: {
    text: string;
  };
}

/** Interval (ms) at which MediaRecorder hands out audio chunks while recording. */
const CHUNK_INTERVAL_MS = 250;

/**
 * Upper bound (ms) on how long a stopped session keeps its socket open while
 * waiting for the server's final transcription.
 */
const FINAL_RESULT_TIMEOUT_MS = 30000;

/** Every socket event this hook subscribes to; all are removed on teardown. */
const SOCKET_EVENTS = [
  "transcription-partial",
  "transcription-final",
  "transcription-error",
  "connect_error",
  "disconnect",
] as const;

/**
 * Determine the best MIME type for audio recording.
 *
 * @returns The first candidate the browser's MediaRecorder reports as supported,
 * or "audio/webm" when MediaRecorder is missing or supports none of them.
 */
function getAudioMimeType(): string {
  if (typeof MediaRecorder === "undefined") {
    return "audio/webm";
  }

  const types = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
  for (const type of types) {
    if (MediaRecorder.isTypeSupported(type)) {
      return type;
    }
  }

  return "audio/webm";
}

/**
 * Map a recording MIME type to the file extension used for the REST upload,
 * so the uploaded file name matches the container that was actually recorded.
 */
function getAudioFileExtension(mimeType: string): string {
  if (mimeType.startsWith("audio/ogg")) {
    return "ogg";
  }
  if (mimeType.startsWith("audio/mp4")) {
    return "mp4";
  }
  return "webm";
}

/**
 * Hook for microphone capture and speech-to-text transcription.
 *
 * Uses WebSocket streaming by default with batch transcription on stop.
 * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
 * is disabled or unavailable.
 *
 * Session lifecycle:
 * - Every recorded chunk is kept locally and, when a socket exists, also emitted
 *   on it. Emits made before the socket connects rely on Socket.IO's default
 *   buffering, so the server receives the stream from its first chunk.
 * - "stop-transcription" is emitted from the recorder's "stop" event, i.e. after
 *   the recorder has delivered its last chunk.
 * - After stopping, the socket stays open until the server answers with
 *   "transcription-final" or "transcription-error", the socket disconnects, or
 *   FINAL_RESULT_TIMEOUT_MS elapses.
 * - Unmounting the component aborts the session: nothing is uploaded and no
 *   transcript callback fires afterwards.
 *
 * @param options - Optional configuration; see {@link UseVoiceInputOptions}.
 * @returns Recording state, transcripts, error text and the start/stop controls.
 */
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;

  const [isRecording, setIsRecording] = useState(false);
  const [transcript, setTranscript] = useState("");
  const [partialTranscript, setPartialTranscript] = useState("");
  const [error, setError] = useState<string | null>(null);
  const [audioLevel, setAudioLevel] = useState(0);

  // Refs to hold mutable state without re-renders
  const socketRef = useRef<Socket | null>(null);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const analyserRef = useRef<AnalyserNode | null>(null);
  const animationFrameRef = useRef<number | null>(null);
  const onTranscriptRef = useRef(onTranscript);
  const recordedChunksRef = useRef<Blob[]>([]);
  const isRecordingRef = useRef(false);
  // Ticket of the startRecording call currently awaiting microphone access (null when none).
  const pendingStartRef = useRef<symbol | null>(null);
  // False once the component has unmounted; late recorder events then abort quietly.
  const isMountedRef = useRef(true);
  // Sockets of stopped sessions still waiting for their final result, with their timeout handle.
  const drainingSocketsRef = useRef<Map<Socket, ReturnType<typeof setTimeout>>>(new Map());

  // Keep callback ref up to date
  useEffect(() => {
    onTranscriptRef.current = onTranscript;
  }, [onTranscript]);

  /**
   * Set up audio analysis for visualizing input level
   */
  const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
    try {
      const audioContext = new AudioContext();
      const analyser = audioContext.createAnalyser();
      const source = audioContext.createMediaStreamSource(stream);

      analyser.fftSize = 256;
      source.connect(analyser);

      audioContextRef.current = audioContext;
      analyserRef.current = analyser;

      // Start level monitoring
      const dataArray = new Uint8Array(analyser.frequencyBinCount);

      const updateLevel = (): void => {
        // The loop ends itself once recording has stopped.
        if (!isRecordingRef.current) {
          return;
        }

        analyser.getByteFrequencyData(dataArray);

        // Average of the byte magnitudes, normalised to the 0-1 range
        let sum = 0;
        for (const value of dataArray) {
          sum += value;
        }
        const average = sum / dataArray.length / 255;

        setAudioLevel(average);
        animationFrameRef.current = requestAnimationFrame(updateLevel);
      };

      animationFrameRef.current = requestAnimationFrame(updateLevel);
    } catch (err) {
      // Audio analysis is non-critical; continue without it
      console.warn(
        "Audio level visualization unavailable:",
        err instanceof Error ? err.message : String(err)
      );
    }
  }, []);

  /**
   * Clean up audio analysis resources
   */
  const cleanupAudioAnalysis = useCallback((): void => {
    if (animationFrameRef.current !== null) {
      cancelAnimationFrame(animationFrameRef.current);
      animationFrameRef.current = null;
    }

    const audioContext = audioContextRef.current;
    audioContextRef.current = null;
    if (audioContext) {
      // close() returns a promise; handle rejection so it cannot surface as an
      // unhandled rejection. Closing is best-effort cleanup.
      audioContext.close().catch((err: unknown) => {
        console.warn(
          "Failed to close audio context:",
          err instanceof Error ? err.message : String(err)
        );
      });
    }

    analyserRef.current = null;
    setAudioLevel(0);
  }, []);

  /**
   * Remove this hook's listeners from one specific socket and disconnect it.
   *
   * Operates on the given instance rather than on socketRef, so a delayed
   * teardown of an old session can never hit a newer session's socket.
   */
  const teardownSocket = useCallback((socket: Socket): void => {
    const drainTimer = drainingSocketsRef.current.get(socket);
    if (drainTimer !== undefined) {
      clearTimeout(drainTimer);
    }
    drainingSocketsRef.current.delete(socket);

    for (const eventName of SOCKET_EVENTS) {
      socket.off(eventName);
    }
    socket.disconnect();

    if (socketRef.current === socket) {
      socketRef.current = null;
    }
  }, []);

  /**
   * Connect to the speech WebSocket namespace
   */
  const connectSocket = useCallback((): Socket => {
    // Never orphan a socket left over from an earlier session.
    if (socketRef.current) {
      teardownSocket(socketRef.current);
    }

    const socket = io(`${API_BASE_URL}/speech`, {
      path: "/socket.io",
      transports: ["websocket", "polling"],
      ...(token ? { auth: { token } } : {}),
    });

    // Future use: the gateway does not currently emit transcription-partial,
    // but the listener is registered for when real-time partial transcription is added.
    socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
      setPartialTranscript(data.text);
    });

    socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
      setTranscript(data.text);
      setPartialTranscript("");
      onTranscriptRef.current?.(data.text);
    });

    socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
      setError(data.message);
    });

    socket.on("connect_error", (err: Error) => {
      setError(`WebSocket connection failed: ${err.message}`);
    });

    socket.on("disconnect", (reason: string) => {
      if (reason !== "io client disconnect") {
        setError(`WebSocket disconnected unexpectedly: ${reason}`);
      }
    });

    socketRef.current = socket;
    return socket;
  }, [token, teardownSocket]);

  /**
   * Disconnect the WebSocket of the active session, if any
   */
  const disconnectSocket = useCallback((): void => {
    if (socketRef.current) {
      teardownSocket(socketRef.current);
    }
  }, [teardownSocket]);

  /**
   * Tell the server that streaming is complete and keep the socket open until
   * the result has arrived.
   *
   * The socket is detached from socketRef so a new session can start at once,
   * then released on the first of: final result, transcription error, socket
   * disconnect, or timeout. Calling this twice for one socket is a no-op.
   */
  const finishStreaming = useCallback(
    (socket: Socket): void => {
      if (drainingSocketsRef.current.has(socket)) {
        return;
      }

      if (socketRef.current === socket) {
        socketRef.current = null;
      }

      if (!socket.connected) {
        teardownSocket(socket);
        return;
      }

      const release = (): void => {
        teardownSocket(socket);
      };

      // Registered after the handlers from connectSocket, so those run first
      // and the transcript is delivered before the socket is released.
      socket.once("transcription-final", release);
      socket.once("transcription-error", release);
      socket.once("disconnect", release);

      const drainTimer = setTimeout(() => {
        setError("Transcription timed out. Please try again.");
        release();
      }, FINAL_RESULT_TIMEOUT_MS);
      drainingSocketsRef.current.set(socket, drainTimer);

      socket.emit("stop-transcription");
    },
    [teardownSocket]
  );

  /**
   * Send recorded audio via REST API as fallback
   */
  const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
    try {
      const formData = new FormData();
      // Name the file after the container that was recorded (webm, ogg or mp4).
      formData.append("file", audioBlob, `recording.${getAudioFileExtension(audioBlob.type)}`);

      const response = await apiPostFormData<TranscribeResponse>(
        "/api/speech/transcribe",
        formData
      );

      if (response.data.text) {
        setTranscript(response.data.text);
        setPartialTranscript("");
        onTranscriptRef.current?.(response.data.text);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : "Transcription request failed";
      setError(message);
    }
  }, []);

  /**
   * Stop all media tracks on the stream
   */
  const stopMediaTracks = useCallback((): void => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => {
        track.stop();
      });
      streamRef.current = null;
    }
  }, []);

  /**
   * Start microphone capture and transcription
   */
  const startRecording = useCallback(async (): Promise<void> => {
    // Prevent double-start, including while an earlier call still awaits microphone access
    if (isRecordingRef.current || pendingStartRef.current !== null) {
      return;
    }

    const ticket = Symbol("voice-input-start");
    pendingStartRef.current = ticket;

    setError(null);
    setPartialTranscript("");
    recordedChunksRef.current = [];

    try {
      // Request microphone access
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          sampleRate,
        },
      });

      // stopRecording or an unmount happened while the permission prompt was
      // open: release the microphone instead of starting a session nobody wants.
      if (pendingStartRef.current !== ticket) {
        stream.getTracks().forEach((track) => {
          track.stop();
        });
        return;
      }

      streamRef.current = stream;

      // Set up audio level visualization
      setupAudioAnalysis(stream);

      // Determine MIME type
      const mimeType = getAudioMimeType();

      // Create MediaRecorder
      const mediaRecorder = new MediaRecorder(stream, { mimeType });
      mediaRecorderRef.current = mediaRecorder;

      // Connect WebSocket if enabled and announce the stream
      const socket = useWs ? connectSocket() : null;
      socket?.emit("start-transcription", {
        format: mimeType,
        sampleRate,
      });

      // Session-local buffer: a later session resetting the ref cannot touch it.
      const chunks: Blob[] = [];
      recordedChunksRef.current = chunks;

      // Handle audio data chunks
      mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
        if (event.data.size === 0) {
          return;
        }
        // Always keep a local copy so the REST fallback can upload the complete
        // recording, including the container header from the first chunk.
        chunks.push(event.data);
        // Emits made before the socket connects are buffered by Socket.IO and
        // sent in order once it does.
        socket?.emit("audio-chunk", event.data);
      });

      // Handle recording stop (fires after the recorder's final dataavailable)
      mediaRecorder.addEventListener("stop", () => {
        if (!isMountedRef.current) {
          // Unmounted mid-recording: abort without uploading or updating state.
          if (socket) {
            teardownSocket(socket);
          }
          return;
        }

        if (mediaRecorderRef.current === mediaRecorder) {
          // The recorder stopped without stopRecording being called (for example
          // after a recorder error or when the track ended); reset recording state.
          mediaRecorderRef.current = null;
          setIsRecording(false);
          isRecordingRef.current = false;
          stopMediaTracks();
          cleanupAudioAnalysis();
        }

        if (socket?.connected) {
          // Everything was streamed; signal the end and wait for the result.
          finishStreaming(socket);
          return;
        }

        // REST fallback. Drop the socket first so its buffered events can never
        // be flushed later and produce a second transcription.
        if (socket) {
          teardownSocket(socket);
        }
        if (chunks.length > 0) {
          void sendAudioViaRest(new Blob(chunks, { type: mimeType }));
        }
      });

      // Handle errors
      mediaRecorder.addEventListener("error", (event: Event) => {
        let errorMessage = "Recording encountered an issue. Please try again.";
        if ("error" in event && event.error instanceof DOMException) {
          errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
        }
        setError(errorMessage);
        setIsRecording(false);
        isRecordingRef.current = false;
        stopMediaTracks();
        cleanupAudioAnalysis();
        // The socket is released by the "stop" handler, which the recorder is
        // expected to fire after an error.
      });

      // Start recording with timeslice for streaming chunks
      mediaRecorder.start(CHUNK_INTERVAL_MS);
      setIsRecording(true);
      isRecordingRef.current = true;
      pendingStartRef.current = null;
    } catch (err) {
      // A cancelled or superseded attempt must not report errors or clean up
      // resources that may belong to a newer session.
      if (pendingStartRef.current !== ticket) {
        return;
      }
      pendingStartRef.current = null;

      // Handle specific error types
      if (err instanceof DOMException) {
        if (err.name === "NotAllowedError") {
          setError(
            "Microphone access was not granted. Please allow microphone access to use voice input."
          );
        } else if (err.name === "NotFoundError") {
          setError("No microphone found. Please connect a microphone and try again.");
        } else {
          setError("Unable to access the microphone. Please check your device settings.");
        }
      } else {
        setError("Unable to start voice input. Please try again.");
      }

      // Clean up on failure, including a socket opened before the failure
      mediaRecorderRef.current = null;
      stopMediaTracks();
      cleanupAudioAnalysis();
      disconnectSocket();
    }
  }, [
    useWs,
    sampleRate,
    setupAudioAnalysis,
    connectSocket,
    disconnectSocket,
    teardownSocket,
    finishStreaming,
    sendAudioViaRest,
    stopMediaTracks,
    cleanupAudioAnalysis,
  ]);

  /**
   * Stop microphone capture and transcription
   */
  const stopRecording = useCallback((): void => {
    setIsRecording(false);
    isRecordingRef.current = false;
    // Cancel a start that is still waiting for microphone access.
    pendingStartRef.current = null;

    const recorder = mediaRecorderRef.current;
    mediaRecorderRef.current = null;

    if (recorder && recorder.state !== "inactive") {
      // The recorder's "stop" handler sends stop-transcription (or the REST
      // upload) once the final chunk has been delivered.
      recorder.stop();
    } else if (socketRef.current) {
      // No active recorder left to flush; finish the socket session directly.
      finishStreaming(socketRef.current);
    }

    // Stop media tracks
    stopMediaTracks();

    // Clean up audio analysis
    cleanupAudioAnalysis();
  }, [stopMediaTracks, cleanupAudioAnalysis, finishStreaming]);

  // Cleanup on unmount
  useEffect(() => {
    isMountedRef.current = true;
    const drainingSockets = drainingSocketsRef.current;

    return (): void => {
      isMountedRef.current = false;
      isRecordingRef.current = false;
      pendingStartRef.current = null;

      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
        mediaRecorderRef.current.stop();
      }
      mediaRecorderRef.current = null;

      stopMediaTracks();
      cleanupAudioAnalysis();
      disconnectSocket();

      // Release sockets of stopped sessions that are still awaiting a result.
      // Iterate over a copy because teardown removes entries from the map.
      for (const socket of Array.from(drainingSockets.keys())) {
        teardownSocket(socket);
      }
    };
  }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket, teardownSocket]);

  return {
    isRecording,
    startRecording,
    stopRecording,
    transcript,
    partialTranscript,
    error,
    audioLevel,
  };
}