Critical fixes: - Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor - Add /speech namespace to WebSocket connection URL - Pass auth token in WebSocket handshake options - Wrap audio.play() in try-catch for NotAllowedError and DOMException handling - Replace bare catch block with named error parameter and descriptive message - Add connect_error and disconnect event handlers to WebSocket - Update JSDoc to accurately describe batch transcription (not real-time partial) Important fixes: - Emit transcription-error before disconnect in gateway auth failures - Capture MediaRecorder error details and clean up media tracks on error - Change TtsDefaultConfig.format type from string to AudioFormat - Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth - Fix voice count from 54 to 53 in provider, AGENTS.md, and docs - Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
436 lines
13 KiB
TypeScript
436 lines
13 KiB
TypeScript
/**
|
|
* useVoiceInput hook
|
|
*
|
|
* Custom hook for microphone capture and speech-to-text transcription.
|
|
* Supports WebSocket streaming with batch transcription on stop,
|
|
* with REST upload fallback when WebSocket is unavailable.
|
|
*/
|
|
|
|
import { useState, useCallback, useRef, useEffect } from "react";
|
|
import type { Socket } from "socket.io-client";
|
|
import { io } from "socket.io-client";
|
|
import { API_BASE_URL } from "@/lib/config";
|
|
import { apiPostFormData } from "@/lib/api/client";
|
|
|
|
/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired with the final transcription text (from the WebSocket final event or the REST response) */
  onTranscript?: (text: string) => void;
  /** Whether to stream audio over WebSocket; REST upload is used as fallback when disconnected (default: true) */
  useWebSocket?: boolean;
  /** Requested audio sample rate in Hz, passed to getUserMedia and the transcription gateway (default: 16000) */
  sampleRate?: number;
  /** Authentication token sent in the WebSocket handshake `auth` payload; omitted when not provided */
  token?: string;
}
|
|
|
|
/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription; resolves once recording has started (or failed with `error` set) */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and submit the recorded audio for transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text — reserved for future real-time streaming; the current gateway emits only final results */
  partialTranscript: string;
  /** Error message if something went wrong (microphone access, WebSocket, or transcription failure) */
  error: string | null;
  /** Current audio input level, normalized to 0-1, for visualizing microphone activity */
  audioLevel: number;
}
|
|
|
|
/** Payload of the "transcription-partial" WebSocket event (reserved for future real-time streaming) */
interface TranscriptionPartialPayload {
  text: string;
}
|
|
|
|
/** Payload of the "transcription-final" WebSocket event */
interface TranscriptionFinalPayload {
  text: string;
}
|
|
|
|
/** Payload of the "transcription-error" WebSocket event */
interface TranscriptionErrorPayload {
  message: string;
}
|
|
|
|
/** Response envelope returned by POST /api/speech/transcribe */
interface TranscribeResponse {
  data: {
    text: string;
  };
}
|
|
|
|
/**
|
|
* Determine the best MIME type for audio recording
|
|
*/
|
|
function getAudioMimeType(): string {
|
|
if (typeof MediaRecorder === "undefined") {
|
|
return "audio/webm";
|
|
}
|
|
const types = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
|
|
for (const type of types) {
|
|
if (MediaRecorder.isTypeSupported(type)) {
|
|
return type;
|
|
}
|
|
}
|
|
return "audio/webm";
|
|
}
|
|
|
|
/**
|
|
* Hook for microphone capture and speech-to-text transcription.
|
|
*
|
|
* Uses WebSocket streaming by default with batch transcription on stop.
|
|
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
|
|
* is disabled or unavailable.
|
|
*/
|
|
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
|
|
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;
|
|
|
|
const [isRecording, setIsRecording] = useState(false);
|
|
const [transcript, setTranscript] = useState("");
|
|
const [partialTranscript, setPartialTranscript] = useState("");
|
|
const [error, setError] = useState<string | null>(null);
|
|
const [audioLevel, setAudioLevel] = useState(0);
|
|
|
|
// Refs to hold mutable state without re-renders
|
|
const socketRef = useRef<Socket | null>(null);
|
|
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
|
const streamRef = useRef<MediaStream | null>(null);
|
|
const audioContextRef = useRef<AudioContext | null>(null);
|
|
const analyserRef = useRef<AnalyserNode | null>(null);
|
|
const animationFrameRef = useRef<number | null>(null);
|
|
const onTranscriptRef = useRef(onTranscript);
|
|
const recordedChunksRef = useRef<Blob[]>([]);
|
|
const isRecordingRef = useRef(false);
|
|
|
|
// Keep callback ref up to date
|
|
useEffect(() => {
|
|
onTranscriptRef.current = onTranscript;
|
|
}, [onTranscript]);
|
|
|
|
/**
|
|
* Set up audio analysis for visualizing input level
|
|
*/
|
|
const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
|
|
try {
|
|
const audioContext = new AudioContext();
|
|
const analyser = audioContext.createAnalyser();
|
|
const source = audioContext.createMediaStreamSource(stream);
|
|
|
|
analyser.fftSize = 256;
|
|
source.connect(analyser);
|
|
|
|
audioContextRef.current = audioContext;
|
|
analyserRef.current = analyser;
|
|
|
|
// Start level monitoring
|
|
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
|
|
|
const updateLevel = (): void => {
|
|
if (!isRecordingRef.current) {
|
|
return;
|
|
}
|
|
|
|
analyser.getByteFrequencyData(dataArray);
|
|
|
|
// Calculate average level
|
|
let sum = 0;
|
|
for (const value of dataArray) {
|
|
sum += value;
|
|
}
|
|
const average = sum / dataArray.length / 255;
|
|
setAudioLevel(average);
|
|
|
|
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
|
};
|
|
|
|
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
|
} catch (err) {
|
|
// Audio analysis is non-critical; continue without it
|
|
console.warn(
|
|
"Audio level visualization unavailable:",
|
|
err instanceof Error ? err.message : String(err)
|
|
);
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Clean up audio analysis resources
|
|
*/
|
|
const cleanupAudioAnalysis = useCallback((): void => {
|
|
if (animationFrameRef.current !== null) {
|
|
cancelAnimationFrame(animationFrameRef.current);
|
|
animationFrameRef.current = null;
|
|
}
|
|
if (audioContextRef.current) {
|
|
void audioContextRef.current.close();
|
|
audioContextRef.current = null;
|
|
}
|
|
analyserRef.current = null;
|
|
setAudioLevel(0);
|
|
}, []);
|
|
|
|
/**
|
|
* Connect to the speech WebSocket namespace
|
|
*/
|
|
const connectSocket = useCallback((): Socket => {
|
|
const socket = io(`${API_BASE_URL}/speech`, {
|
|
path: "/socket.io",
|
|
transports: ["websocket", "polling"],
|
|
...(token ? { auth: { token } } : {}),
|
|
});
|
|
|
|
// Future use: the gateway does not currently emit transcription-partial,
|
|
// but the listener is registered for when real-time partial transcription is added.
|
|
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
|
|
setPartialTranscript(data.text);
|
|
});
|
|
|
|
socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
|
|
setTranscript(data.text);
|
|
setPartialTranscript("");
|
|
onTranscriptRef.current?.(data.text);
|
|
});
|
|
|
|
socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
|
|
setError(data.message);
|
|
});
|
|
|
|
socket.on("connect_error", (err: Error) => {
|
|
setError(`WebSocket connection failed: ${err.message}`);
|
|
});
|
|
|
|
socket.on("disconnect", (reason: string) => {
|
|
if (reason !== "io client disconnect") {
|
|
setError(`WebSocket disconnected unexpectedly: ${reason}`);
|
|
}
|
|
});
|
|
|
|
socketRef.current = socket;
|
|
return socket;
|
|
}, [token]);
|
|
|
|
/**
|
|
* Disconnect the WebSocket
|
|
*/
|
|
const disconnectSocket = useCallback((): void => {
|
|
if (socketRef.current) {
|
|
socketRef.current.off("transcription-partial");
|
|
socketRef.current.off("transcription-final");
|
|
socketRef.current.off("transcription-error");
|
|
socketRef.current.off("connect_error");
|
|
socketRef.current.off("disconnect");
|
|
socketRef.current.disconnect();
|
|
socketRef.current = null;
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Send recorded audio via REST API as fallback
|
|
*/
|
|
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
|
|
try {
|
|
const formData = new FormData();
|
|
formData.append("file", audioBlob, "recording.webm");
|
|
|
|
const response = await apiPostFormData<TranscribeResponse>(
|
|
"/api/speech/transcribe",
|
|
formData
|
|
);
|
|
|
|
if (response.data.text) {
|
|
setTranscript(response.data.text);
|
|
setPartialTranscript("");
|
|
onTranscriptRef.current?.(response.data.text);
|
|
}
|
|
} catch (err) {
|
|
const message = err instanceof Error ? err.message : "Transcription request failed";
|
|
setError(message);
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Stop all media tracks on the stream
|
|
*/
|
|
const stopMediaTracks = useCallback((): void => {
|
|
if (streamRef.current) {
|
|
streamRef.current.getTracks().forEach((track) => {
|
|
track.stop();
|
|
});
|
|
streamRef.current = null;
|
|
}
|
|
}, []);
|
|
|
|
/**
|
|
* Start microphone capture and transcription
|
|
*/
|
|
const startRecording = useCallback(async (): Promise<void> => {
|
|
// Prevent double-start
|
|
if (isRecordingRef.current) {
|
|
return;
|
|
}
|
|
|
|
setError(null);
|
|
setPartialTranscript("");
|
|
recordedChunksRef.current = [];
|
|
|
|
try {
|
|
// Request microphone access
|
|
const stream = await navigator.mediaDevices.getUserMedia({
|
|
audio: {
|
|
echoCancellation: true,
|
|
noiseSuppression: true,
|
|
sampleRate,
|
|
},
|
|
});
|
|
|
|
streamRef.current = stream;
|
|
|
|
// Set up audio level visualization
|
|
setupAudioAnalysis(stream);
|
|
|
|
// Determine MIME type
|
|
const mimeType = getAudioMimeType();
|
|
|
|
// Create MediaRecorder
|
|
const mediaRecorder = new MediaRecorder(stream, { mimeType });
|
|
mediaRecorderRef.current = mediaRecorder;
|
|
|
|
// Connect WebSocket if enabled
|
|
let socket: Socket | null = null;
|
|
if (useWs) {
|
|
socket = connectSocket();
|
|
|
|
// Emit start-transcription event
|
|
socket.emit("start-transcription", {
|
|
format: mimeType,
|
|
sampleRate,
|
|
});
|
|
}
|
|
|
|
// Handle audio data chunks
|
|
mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
|
|
if (event.data.size > 0) {
|
|
if (socket?.connected) {
|
|
// Stream chunks via WebSocket
|
|
socket.emit("audio-chunk", event.data);
|
|
} else {
|
|
// Collect chunks for REST upload
|
|
recordedChunksRef.current.push(event.data);
|
|
}
|
|
}
|
|
});
|
|
|
|
// Handle recording stop
|
|
mediaRecorder.addEventListener("stop", () => {
|
|
// If using REST fallback, send collected audio
|
|
if (!useWs || !socket?.connected) {
|
|
if (recordedChunksRef.current.length > 0) {
|
|
const audioBlob = new Blob(recordedChunksRef.current, {
|
|
type: mimeType,
|
|
});
|
|
void sendAudioViaRest(audioBlob);
|
|
}
|
|
}
|
|
});
|
|
|
|
// Handle errors
|
|
mediaRecorder.addEventListener("error", (event: Event) => {
|
|
let errorMessage = "Recording encountered an issue. Please try again.";
|
|
if ("error" in event && event.error instanceof DOMException) {
|
|
errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
|
|
}
|
|
setError(errorMessage);
|
|
setIsRecording(false);
|
|
isRecordingRef.current = false;
|
|
stopMediaTracks();
|
|
cleanupAudioAnalysis();
|
|
});
|
|
|
|
// Start recording with timeslice for streaming chunks (250ms intervals)
|
|
mediaRecorder.start(250);
|
|
setIsRecording(true);
|
|
isRecordingRef.current = true;
|
|
} catch (err) {
|
|
// Handle specific error types
|
|
if (err instanceof DOMException) {
|
|
if (err.name === "NotAllowedError") {
|
|
setError(
|
|
"Microphone access was not granted. Please allow microphone access to use voice input."
|
|
);
|
|
} else if (err.name === "NotFoundError") {
|
|
setError("No microphone found. Please connect a microphone and try again.");
|
|
} else {
|
|
setError("Unable to access the microphone. Please check your device settings.");
|
|
}
|
|
} else {
|
|
setError("Unable to start voice input. Please try again.");
|
|
}
|
|
|
|
// Clean up on failure
|
|
stopMediaTracks();
|
|
cleanupAudioAnalysis();
|
|
}
|
|
}, [
|
|
useWs,
|
|
sampleRate,
|
|
setupAudioAnalysis,
|
|
connectSocket,
|
|
sendAudioViaRest,
|
|
stopMediaTracks,
|
|
cleanupAudioAnalysis,
|
|
]);
|
|
|
|
/**
|
|
* Stop microphone capture and transcription
|
|
*/
|
|
const stopRecording = useCallback((): void => {
|
|
setIsRecording(false);
|
|
isRecordingRef.current = false;
|
|
|
|
// Stop MediaRecorder
|
|
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
|
|
mediaRecorderRef.current.stop();
|
|
mediaRecorderRef.current = null;
|
|
}
|
|
|
|
// Stop media tracks
|
|
stopMediaTracks();
|
|
|
|
// Clean up audio analysis
|
|
cleanupAudioAnalysis();
|
|
|
|
// Emit stop event and disconnect WebSocket
|
|
if (socketRef.current) {
|
|
socketRef.current.emit("stop-transcription");
|
|
// Give the server a moment to process the final chunk before disconnecting
|
|
setTimeout(() => {
|
|
disconnectSocket();
|
|
}, 500);
|
|
}
|
|
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
|
|
|
|
// Cleanup on unmount
|
|
useEffect(() => {
|
|
return (): void => {
|
|
isRecordingRef.current = false;
|
|
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
|
|
mediaRecorderRef.current.stop();
|
|
}
|
|
stopMediaTracks();
|
|
cleanupAudioAnalysis();
|
|
disconnectSocket();
|
|
};
|
|
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
|
|
|
|
return {
|
|
isRecording,
|
|
startRecording,
|
|
stopRecording,
|
|
transcript,
|
|
partialTranscript,
|
|
error,
|
|
audioLevel,
|
|
};
|
|
}
|