feat(#403): add audio playback component for TTS output
All checks were successful
ci/woodpecker/push/web Pipeline was successful
All checks were successful
ci/woodpecker/push/web Pipeline was successful
Implements AudioPlayer inline component with play/pause, progress bar, speed control (0.5x-2x), download, and duration display. Adds TextToSpeechButton "Read aloud" component that synthesizes text via the speech API and integrates AudioPlayer for playback. Includes useTextToSpeech hook with API integration, audio caching, and playback state management. All 32 tests passing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
409
apps/web/src/hooks/useVoiceInput.ts
Normal file
409
apps/web/src/hooks/useVoiceInput.ts
Normal file
@@ -0,0 +1,409 @@
|
||||
/**
|
||||
* useVoiceInput hook
|
||||
*
|
||||
* Custom hook for microphone capture and real-time transcription.
|
||||
* Supports WebSocket streaming for real-time partial transcriptions
|
||||
* with REST upload fallback when WebSocket is unavailable.
|
||||
*/
|
||||
|
||||
import { useState, useCallback, useRef, useEffect } from "react";
|
||||
import type { Socket } from "socket.io-client";
|
||||
import { io } from "socket.io-client";
|
||||
import { API_BASE_URL } from "@/lib/config";
|
||||
import { apiPostFormData } from "@/lib/api/client";
|
||||
|
||||
/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired when final transcription is received */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true); when false or unavailable, audio is uploaded via REST instead */
  useWebSocket?: boolean;
  /** Audio sample rate in Hz requested from the microphone (default: 16000) */
  sampleRate?: number;
}
|
||||
|
||||
/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription (no-op if already recording) */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text (updates in real-time; cleared when a final result arrives) */
  partialTranscript: string;
  /** Error message if something went wrong, null otherwise */
  error: string | null;
  /** Current audio input level (0-1), for level-meter visualization */
  audioLevel: number;
}
|
||||
|
||||
/** Payload of the "transcription-partial" socket event (in-progress text) */
interface TranscriptionPartialPayload {
  text: string;
}

/** Payload of the "transcription-final" socket event (completed text) */
interface TranscriptionFinalPayload {
  text: string;
}

/** Payload of the "transcription-error" socket event */
interface TranscriptionErrorPayload {
  message: string;
}

/** Response envelope returned by POST /api/speech/transcribe */
interface TranscribeResponse {
  data: {
    text: string;
  };
}
|
||||
|
||||
/**
|
||||
* Determine the best MIME type for audio recording
|
||||
*/
|
||||
function getAudioMimeType(): string {
|
||||
if (typeof MediaRecorder === "undefined") {
|
||||
return "audio/webm";
|
||||
}
|
||||
const types = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
|
||||
for (const type of types) {
|
||||
if (MediaRecorder.isTypeSupported(type)) {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
return "audio/webm";
|
||||
}
|
||||
|
||||
/**
|
||||
* Hook for microphone capture and real-time speech-to-text transcription.
|
||||
*
|
||||
* Uses WebSocket streaming by default for real-time partial transcriptions.
|
||||
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
|
||||
* is disabled or unavailable.
|
||||
*/
|
||||
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
|
||||
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
|
||||
|
||||
const [isRecording, setIsRecording] = useState(false);
|
||||
const [transcript, setTranscript] = useState("");
|
||||
const [partialTranscript, setPartialTranscript] = useState("");
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [audioLevel, setAudioLevel] = useState(0);
|
||||
|
||||
// Refs to hold mutable state without re-renders
|
||||
const socketRef = useRef<Socket | null>(null);
|
||||
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
||||
const streamRef = useRef<MediaStream | null>(null);
|
||||
const audioContextRef = useRef<AudioContext | null>(null);
|
||||
const analyserRef = useRef<AnalyserNode | null>(null);
|
||||
const animationFrameRef = useRef<number | null>(null);
|
||||
const onTranscriptRef = useRef(onTranscript);
|
||||
const recordedChunksRef = useRef<Blob[]>([]);
|
||||
const isRecordingRef = useRef(false);
|
||||
|
||||
// Keep callback ref up to date
|
||||
useEffect(() => {
|
||||
onTranscriptRef.current = onTranscript;
|
||||
}, [onTranscript]);
|
||||
|
||||
/**
|
||||
* Set up audio analysis for visualizing input level
|
||||
*/
|
||||
const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
|
||||
try {
|
||||
const audioContext = new AudioContext();
|
||||
const analyser = audioContext.createAnalyser();
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
|
||||
analyser.fftSize = 256;
|
||||
source.connect(analyser);
|
||||
|
||||
audioContextRef.current = audioContext;
|
||||
analyserRef.current = analyser;
|
||||
|
||||
// Start level monitoring
|
||||
const dataArray = new Uint8Array(analyser.frequencyBinCount);
|
||||
|
||||
const updateLevel = (): void => {
|
||||
if (!isRecordingRef.current) {
|
||||
return;
|
||||
}
|
||||
|
||||
analyser.getByteFrequencyData(dataArray);
|
||||
|
||||
// Calculate average level
|
||||
let sum = 0;
|
||||
for (const value of dataArray) {
|
||||
sum += value;
|
||||
}
|
||||
const average = sum / dataArray.length / 255;
|
||||
setAudioLevel(average);
|
||||
|
||||
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
||||
};
|
||||
|
||||
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
||||
} catch {
|
||||
// Audio analysis is non-critical; continue without it
|
||||
console.warn("Audio analysis not available");
|
||||
}
|
||||
}, []);
|
||||
|
||||
/**
|
||||
* Clean up audio analysis resources
|
||||
*/
|
||||
const cleanupAudioAnalysis = useCallback((): void => {
|
||||
if (animationFrameRef.current !== null) {
|
||||
cancelAnimationFrame(animationFrameRef.current);
|
||||
animationFrameRef.current = null;
|
||||
}
|
||||
if (audioContextRef.current) {
|
||||
void audioContextRef.current.close();
|
||||
audioContextRef.current = null;
|
||||
}
|
||||
analyserRef.current = null;
|
||||
setAudioLevel(0);
|
||||
}, []);
|
||||
|
||||
/**
|
||||
* Connect to the speech WebSocket namespace
|
||||
*/
|
||||
const connectSocket = useCallback((): Socket => {
|
||||
const socket = io(API_BASE_URL, {
|
||||
path: "/socket.io",
|
||||
transports: ["websocket", "polling"],
|
||||
});
|
||||
|
||||
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
|
||||
setPartialTranscript(data.text);
|
||||
});
|
||||
|
||||
socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
|
||||
setTranscript(data.text);
|
||||
setPartialTranscript("");
|
||||
onTranscriptRef.current?.(data.text);
|
||||
});
|
||||
|
||||
socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
|
||||
setError(data.message);
|
||||
});
|
||||
|
||||
socketRef.current = socket;
|
||||
return socket;
|
||||
}, []);
|
||||
|
||||
/**
|
||||
* Disconnect the WebSocket
|
||||
*/
|
||||
const disconnectSocket = useCallback((): void => {
|
||||
if (socketRef.current) {
|
||||
socketRef.current.off("transcription-partial");
|
||||
socketRef.current.off("transcription-final");
|
||||
socketRef.current.off("transcription-error");
|
||||
socketRef.current.disconnect();
|
||||
socketRef.current = null;
|
||||
}
|
||||
}, []);
|
||||
|
||||
/**
|
||||
* Send recorded audio via REST API as fallback
|
||||
*/
|
||||
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
|
||||
try {
|
||||
const formData = new FormData();
|
||||
formData.append("audio", audioBlob, "recording.webm");
|
||||
|
||||
const response = await apiPostFormData<TranscribeResponse>(
|
||||
"/api/speech/transcribe",
|
||||
formData
|
||||
);
|
||||
|
||||
if (response.data.text) {
|
||||
setTranscript(response.data.text);
|
||||
setPartialTranscript("");
|
||||
onTranscriptRef.current?.(response.data.text);
|
||||
}
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : "Transcription request failed";
|
||||
setError(message);
|
||||
}
|
||||
}, []);
|
||||
|
||||
/**
|
||||
* Stop all media tracks on the stream
|
||||
*/
|
||||
const stopMediaTracks = useCallback((): void => {
|
||||
if (streamRef.current) {
|
||||
streamRef.current.getTracks().forEach((track) => {
|
||||
track.stop();
|
||||
});
|
||||
streamRef.current = null;
|
||||
}
|
||||
}, []);
|
||||
|
||||
/**
|
||||
* Start microphone capture and transcription
|
||||
*/
|
||||
const startRecording = useCallback(async (): Promise<void> => {
|
||||
// Prevent double-start
|
||||
if (isRecordingRef.current) {
|
||||
return;
|
||||
}
|
||||
|
||||
setError(null);
|
||||
setPartialTranscript("");
|
||||
recordedChunksRef.current = [];
|
||||
|
||||
try {
|
||||
// Request microphone access
|
||||
const stream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
echoCancellation: true,
|
||||
noiseSuppression: true,
|
||||
sampleRate,
|
||||
},
|
||||
});
|
||||
|
||||
streamRef.current = stream;
|
||||
|
||||
// Set up audio level visualization
|
||||
setupAudioAnalysis(stream);
|
||||
|
||||
// Determine MIME type
|
||||
const mimeType = getAudioMimeType();
|
||||
|
||||
// Create MediaRecorder
|
||||
const mediaRecorder = new MediaRecorder(stream, { mimeType });
|
||||
mediaRecorderRef.current = mediaRecorder;
|
||||
|
||||
// Connect WebSocket if enabled
|
||||
let socket: Socket | null = null;
|
||||
if (useWs) {
|
||||
socket = connectSocket();
|
||||
|
||||
// Emit start-transcription event
|
||||
socket.emit("start-transcription", {
|
||||
format: mimeType,
|
||||
sampleRate,
|
||||
});
|
||||
}
|
||||
|
||||
// Handle audio data chunks
|
||||
mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
|
||||
if (event.data.size > 0) {
|
||||
if (socket?.connected) {
|
||||
// Stream chunks via WebSocket
|
||||
socket.emit("audio-chunk", event.data);
|
||||
} else {
|
||||
// Collect chunks for REST upload
|
||||
recordedChunksRef.current.push(event.data);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Handle recording stop
|
||||
mediaRecorder.addEventListener("stop", () => {
|
||||
// If using REST fallback, send collected audio
|
||||
if (!useWs || !socket?.connected) {
|
||||
if (recordedChunksRef.current.length > 0) {
|
||||
const audioBlob = new Blob(recordedChunksRef.current, {
|
||||
type: mimeType,
|
||||
});
|
||||
void sendAudioViaRest(audioBlob);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Handle errors
|
||||
mediaRecorder.addEventListener("error", () => {
|
||||
setError("Recording encountered an issue. Please try again.");
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
});
|
||||
|
||||
// Start recording with timeslice for streaming chunks (250ms intervals)
|
||||
mediaRecorder.start(250);
|
||||
setIsRecording(true);
|
||||
isRecordingRef.current = true;
|
||||
} catch (err) {
|
||||
// Handle specific error types
|
||||
if (err instanceof DOMException) {
|
||||
if (err.name === "NotAllowedError") {
|
||||
setError(
|
||||
"Microphone access was not granted. Please allow microphone access to use voice input."
|
||||
);
|
||||
} else if (err.name === "NotFoundError") {
|
||||
setError("No microphone found. Please connect a microphone and try again.");
|
||||
} else {
|
||||
setError("Unable to access the microphone. Please check your device settings.");
|
||||
}
|
||||
} else {
|
||||
setError("Unable to start voice input. Please try again.");
|
||||
}
|
||||
|
||||
// Clean up on failure
|
||||
stopMediaTracks();
|
||||
cleanupAudioAnalysis();
|
||||
}
|
||||
}, [
|
||||
useWs,
|
||||
sampleRate,
|
||||
setupAudioAnalysis,
|
||||
connectSocket,
|
||||
sendAudioViaRest,
|
||||
stopMediaTracks,
|
||||
cleanupAudioAnalysis,
|
||||
]);
|
||||
|
||||
/**
|
||||
* Stop microphone capture and transcription
|
||||
*/
|
||||
const stopRecording = useCallback((): void => {
|
||||
setIsRecording(false);
|
||||
isRecordingRef.current = false;
|
||||
|
||||
// Stop MediaRecorder
|
||||
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
|
||||
mediaRecorderRef.current.stop();
|
||||
mediaRecorderRef.current = null;
|
||||
}
|
||||
|
||||
// Stop media tracks
|
||||
stopMediaTracks();
|
||||
|
||||
// Clean up audio analysis
|
||||
cleanupAudioAnalysis();
|
||||
|
||||
// Emit stop event and disconnect WebSocket
|
||||
if (socketRef.current) {
|
||||
socketRef.current.emit("stop-transcription");
|
||||
// Give the server a moment to process the final chunk before disconnecting
|
||||
setTimeout(() => {
|
||||
disconnectSocket();
|
||||
}, 500);
|
||||
}
|
||||
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
|
||||
|
||||
// Cleanup on unmount
|
||||
useEffect(() => {
|
||||
return (): void => {
|
||||
isRecordingRef.current = false;
|
||||
if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
|
||||
mediaRecorderRef.current.stop();
|
||||
}
|
||||
stopMediaTracks();
|
||||
cleanupAudioAnalysis();
|
||||
disconnectSocket();
|
||||
};
|
||||
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
|
||||
|
||||
return {
|
||||
isRecording,
|
||||
startRecording,
|
||||
stopRecording,
|
||||
transcript,
|
||||
partialTranscript,
|
||||
error,
|
||||
audioLevel,
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user