Critical fixes: - Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor - Add /speech namespace to WebSocket connection URL - Pass auth token in WebSocket handshake options - Wrap audio.play() in try-catch for NotAllowedError and DOMException handling - Replace bare catch block with named error parameter and descriptive message - Add connect_error and disconnect event handlers to WebSocket - Update JSDoc to accurately describe batch transcription (not real-time partial) Important fixes: - Emit transcription-error before disconnect in gateway auth failures - Capture MediaRecorder error details and clean up media tracks on error - Change TtsDefaultConfig.format type from string to AudioFormat - Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth - Fix voice count from 54 to 53 in provider, AGENTS.md, and docs - Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
436 lines
13 KiB
TypeScript
436 lines
13 KiB
TypeScript
/**
|
|
* useVoiceInput hook
|
|
*
|
|
* Custom hook for microphone capture and speech-to-text transcription.
|
|
* Supports WebSocket streaming with batch transcription on stop,
|
|
* with REST upload fallback when WebSocket is unavailable.
|
|
*/
|
|
|
|
import { useState, useCallback, useRef, useEffect } from "react";
|
|
import type { Socket } from "socket.io-client";
|
|
import { io } from "socket.io-client";
|
|
import { API_BASE_URL } from "@/lib/config";
|
|
import { apiPostFormData } from "@/lib/api/client";
|
|
|
|
/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired with the final transcription text (from the WebSocket final event or the REST response) */
  onTranscript?: (text: string) => void;
  /** Whether to stream audio over WebSocket; REST upload is used as fallback when disconnected (default: true) */
  useWebSocket?: boolean;
  /** Requested audio sample rate in Hz, passed to getUserMedia and the transcription gateway (default: 16000) */
  sampleRate?: number;
  /** Authentication token sent in the WebSocket handshake `auth` payload; omitted when not provided */
  token?: string;
}
|
|
|
|
/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription; resolves once recording has started (or failed with `error` set) */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and submit the recorded audio for transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text — reserved for future real-time streaming; the current gateway emits only final results */
  partialTranscript: string;
  /** Error message if something went wrong (microphone access, WebSocket, or transcription failure) */
  error: string | null;
  /** Current audio input level, normalized to 0-1, for visualizing microphone activity */
  audioLevel: number;
}
|
|
|
|
/** Payload of the "transcription-partial" WebSocket event (reserved for future real-time streaming) */
interface TranscriptionPartialPayload {
  text: string;
}
|
|
|
|
/** Payload of the "transcription-final" WebSocket event */
interface TranscriptionFinalPayload {
  text: string;
}
|
|
|
|
/** Payload of the "transcription-error" WebSocket event */
interface TranscriptionErrorPayload {
  message: string;
}
|
|
|
|
/** Response envelope returned by POST /api/speech/transcribe */
interface TranscribeResponse {
  data: {
    text: string;
  };
}
|
|
|
|
/**
|
|
* Determine the best MIME type for audio recording
|
|
*/
|
|
function getAudioMimeType(): string {
|
|
if (typeof MediaRecorder === "undefined") {
|
|
return "audio/webm";
|
|
}
|
|
const types = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
|
|
for (const type of types) {
|
|
if (MediaRecorder.isTypeSupported(type)) {
|
|
return type;
|
|
}
|
|
}
|
|
return "audio/webm";
|
|
}
|
|
|
|
/**
|
|
* Hook for microphone capture and speech-to-text transcription.
|
|
*
|
|
* Uses WebSocket streaming by default with batch transcription on stop.
|
|
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
|
|
* is disabled or unavailable.
|
|
*/
|
|
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
|
|
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;
|
|
|
|
const [isRecording, setIsRecording] = useState(false);
|
|
const [transcript, setTranscript] = useState("");
|
|
const [partialTranscript, setPartialTranscript] = useState("");
|
|
const [error, setError] = useState<string | null>(null);
|
|
const [audioLevel, setAudioLevel] = useState(0);
|
|
|
|
// Refs to hold mutable state without re-renders
|
|
const socketRef = useRef<Socket | null>(null);
|
|
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
|
const streamRef = useRef<MediaStream | null>(null);
|
|
const audioContextRef = useRef<AudioContext | null>(null);
|
|
const analyserRef = useRef<AnalyserNode | null>(null);
|
|
const animationFrameRef = useRef<number | null>(null);
|
|
const onTranscriptRef = useRef(onTranscript);
|
|
const recordedChunksRef = useRef<Blob[]>([]);
|
|
const isRecordingRef = useRef(false);
|
|
|
|
// Keep callback ref up to date
|
|
useEffect(() => {
|
|
onTranscriptRef.current = onTranscript;
|
|
}, [onTranscript]);
|
|
|
|
/**
|
|
* Set up audio analysis for visualizing input level
|
|
*/
|
|
const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
|
|
try {
|
|
const audioContext = new AudioContext();
|
|
const analyser = audioContext.createAnalyser();
|
|
const source = audioContext.createMediaStreamSource(stream);
|
|
|
|
analyser.fftSize = 256;
|
|
source.connect(analyser);
|
|
|
|
audioContextRef.current = audioContext;
|
|
analyserRef.current = analyser;
|
|
|
|
// Start level monitoring
|
|
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
|
|
|
const updateLevel = (): void => {
|
|
if (!isRecordingRef.current) {
|
|
return;
|
|
}
|
|
|
|
analyser.getByteFrequencyData(dataArray);
|
|
|
|
// Calculate average level
|
|
let sum = 0;
|
|
for (const value of dataArray) {
|
|
sum += value;
|
|
}
|
|
const average = sum / dataArray.length / 255;
|
|
setAudioLevel(average);
|
|
|
|
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
|
};
|
|
|
|
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
|
} catch (err) {
|
|
// Audio analysis is non-critical; continue without it
|
|
console.warn(
|
|
"Audio level visualization unavailable:",
|
|
err instanceof Error ? err.message : String(err)
|
|
);
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Clean up audio analysis resources
|
|
*/
|
|
const cleanupAudioAnalysis = useCallback((): void => {
|
|
if (animationFrameRef.current !== null) {
|
|
cancelAnimationFrame(animationFrameRef.current);
|
|
animationFrameRef.current = null;
|
|
}
|
|
if (audioContextRef.current) {
|
|
void audioContextRef.current.close();
|
|
audioContextRef.current = null;
|
|
}
|
|
analyserRef.current = null;
|
|
setAudioLevel(0);
|
|
}, []);
|
|
|
|
/**
|
|
* Connect to the speech WebSocket namespace
|
|
*/
|
|
const connectSocket = useCallback((): Socket => {
|
|
const socket = io(`${API_BASE_URL}/speech`, {
|
|
path: "/socket.io",
|
|
transports: ["websocket", "polling"],
|
|
...(token ? { auth: { token } } : {}),
|
|
});
|
|
|
|
// Future use: the gateway does not currently emit transcription-partial,
|
|
// but the listener is registered for when real-time partial transcription is added.
|
|
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
|
|
setPartialTranscript(data.text);
|
|
});
|
|
|
|
socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
|
|
setTranscript(data.text);
|
|
setPartialTranscript("");
|
|
onTranscriptRef.current?.(data.text);
|
|
});
|
|
|
|
socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
|
|
setError(data.message);
|
|
});
|
|
|
|
socket.on("connect_error", (err: Error) => {
|
|
setError(`WebSocket connection failed: ${err.message}`);
|
|
});
|
|
|
|
socket.on("disconnect", (reason: string) => {
|
|
if (reason !== "io client disconnect") {
|
|
setError(`WebSocket disconnected unexpectedly: ${reason}`);
|
|
}
|
|
});
|
|
|
|
socketRef.current = socket;
|
|
return socket;
|
|
}, [token]);
|
|
|
|
/**
|
|
* Disconnect the WebSocket
|
|
*/
|
|
const disconnectSocket = useCallback((): void => {
|
|
if (socketRef.current) {
|
|
socketRef.current.off("transcription-partial");
|
|
socketRef.current.off("transcription-final");
|
|
socketRef.current.off("transcription-error");
|
|
socketRef.current.off("connect_error");
|
|
socketRef.current.off("disconnect");
|
|
socketRef.current.disconnect();
|
|
socketRef.current = null;
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Send recorded audio via REST API as fallback
|
|
*/
|
|
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
|
|
try {
|
|
const formData = new FormData();
|
|
formData.append("file", audioBlob, "recording.webm");
|
|
|
|
const response = await apiPostFormData<TranscribeResponse>(
|
|
"/api/speech/transcribe",
|
|
formData
|
|
);
|
|
|
|
if (response.data.text) {
|
|
setTranscript(response.data.text);
|
|
setPartialTranscript("");
|
|
onTranscriptRef.current?.(response.data.text);
|
|
}
|
|
} catch (err) {
|
|
const message = err instanceof Error ? err.message : "Transcription request failed";
|
|
setError(message);
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Stop all media tracks on the stream
|
|
*/
|
|
const stopMediaTracks = useCallback((): void => {
|
|
if (streamRef.current) {
|
|
streamRef.current.getTracks().forEach((track) => {
|
|
track.stop();
|
|
});
|
|
streamRef.current = null;
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Start microphone capture and transcription
|
|
*/
|
|
const startRecording = useCallback(async (): Promise<void> => {
|
|
// Prevent double-start
|
|
if (isRecordingRef.current) {
|
|
return;
|
|
}
|
|
|
|
setError(null);
|
|
setPartialTranscript("");
|
|
recordedChunksRef.current = [];
|
|
|
|
try {
|
|
// Request microphone access
|
|
const stream = await navigator.mediaDevices.getUserMedia({
|
|
audio: {
|
|
echoCancellation: true,
|
|
noiseSuppression: true,
|
|
sampleRate,
|
|
},
|
|
});
|
|
|
|
streamRef.current = stream;
|
|
|
|
// Set up audio level visualization
|
|
setupAudioAnalysis(stream);
|
|
|
|
// Determine MIME type
|
|
const mimeType = getAudioMimeType();
|
|
|
|
// Create MediaRecorder
|
|
const mediaRecorder = new MediaRecorder(stream, { mimeType });
|
|
mediaRecorderRef.current = mediaRecorder;
|
|
|
|
// Connect WebSocket if enabled
|
|
let socket: Socket | null = null;
|
|
if (useWs) {
|
|
socket = connectSocket();
|
|
|
|
// Emit start-transcription event
|
|
socket.emit("start-transcription", {
|
|
format: mimeType,
|
|
sampleRate,
|
|
});
|
|
}
|
|
|
|
// Handle audio data chunks
|
|
mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
|
|
if (event.data.size > 0) {
|
|
if (socket?.connected) {
|
|
// Stream chunks via WebSocket
|
|
socket.emit("audio-chunk", event.data);
|
|
} else {
|
|
// Collect chunks for REST upload
|
|
recordedChunksRef.current.push(event.data);
|
|
}
|
|
}
|
|
});
|
|
|
|
// Handle recording stop
|
|
mediaRecorder.addEventListener("stop", () => {
|
|
// If using REST fallback, send collected audio
|
|
if (!useWs || !socket?.connected) {
|
|
if (recordedChunksRef.current.length > 0) {
|
|
const audioBlob = new Blob(recordedChunksRef.current, {
|
|
type: mimeType,
|
|
});
|
|
void sendAudioViaRest(audioBlob);
|
|
}
|
|
}
|
|
});
|
|
|
|
// Handle errors
|
|
mediaRecorder.addEventListener("error", (event: Event) => {
|
|
let errorMessage = "Recording encountered an issue. Please try again.";
|
|
if ("error" in event && event.error instanceof DOMException) {
|
|
errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
|
|
}
|
|
setError(errorMessage);
|
|
setIsRecording(false);
|
|
isRecordingRef.current = false;
|
|
stopMediaTracks();
|
|
cleanupAudioAnalysis();
|
|
});
|
|
|
|
// Start recording with timeslice for streaming chunks (250ms intervals)
|
|
mediaRecorder.start(250);
|
|
setIsRecording(true);
|
|
isRecordingRef.current = true;
|
|
} catch (err) {
|
|
// Handle specific error types
|
|
if (err instanceof DOMException) {
|
|
if (err.name === "NotAllowedError") {
|
|
setError(
|
|
"Microphone access was not granted. Please allow microphone access to use voice input."
|
|
);
|
|
} else if (err.name === "NotFoundError") {
|
|
setError("No microphone found. Please connect a microphone and try again.");
|
|
} else {
|
|
setError("Unable to access the microphone. Please check your device settings.");
|
|
}
|
|
} else {
|
|
setError("Unable to start voice input. Please try again.");
|
|
}
|
|
|
|
// Clean up on failure
|
|
stopMediaTracks();
|
|
cleanupAudioAnalysis();
|
|
}
|
|
}, [
|
|
useWs,
|
|
sampleRate,
|
|
setupAudioAnalysis,
|
|
connectSocket,
|
|
sendAudioViaRest,
|
|
stopMediaTracks,
|
|
cleanupAudioAnalysis,
|
|
]);
|
|
|
|
/**
|
|
* Stop microphone capture and transcription
|
|
*/
|
|
const stopRecording = useCallback((): void => {
|
|
setIsRecording(false);
|
|
isRecordingRef.current = false;
|
|
|
|
// Stop MediaRecorder
|
|
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
|
|
mediaRecorderRef.current.stop();
|
|
mediaRecorderRef.current = null;
|
|
}
|
|
|
|
// Stop media tracks
|
|
stopMediaTracks();
|
|
|
|
// Clean up audio analysis
|
|
cleanupAudioAnalysis();
|
|
|
|
// Emit stop event and disconnect WebSocket
|
|
if (socketRef.current) {
|
|
socketRef.current.emit("stop-transcription");
|
|
// Give the server a moment to process the final chunk before disconnecting
|
|
setTimeout(() => {
|
|
disconnectSocket();
|
|
}, 500);
|
|
}
|
|
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
|
|
|
|
// Cleanup on unmount
|
|
useEffect(() => {
|
|
return (): void => {
|
|
isRecordingRef.current = false;
|
|
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
|
|
mediaRecorderRef.current.stop();
|
|
}
|
|
stopMediaTracks();
|
|
cleanupAudioAnalysis();
|
|
disconnectSocket();
|
|
};
|
|
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
|
|
|
|
return {
|
|
isRecording,
|
|
startRecording,
|
|
stopRecording,
|
|
transcript,
|
|
partialTranscript,
|
|
error,
|
|
audioLevel,
|
|
};
|
|
}
|