feat(#403): add audio playback component for TTS output
All checks were successful
ci/woodpecker/push/web Pipeline was successful

Implements AudioPlayer inline component with play/pause, progress bar,
speed control (0.5x-2x), download, and duration display. Adds
TextToSpeechButton "Read aloud" component that synthesizes text via
the speech API and integrates AudioPlayer for playback. Includes
useTextToSpeech hook with API integration, audio caching, and
playback state management. All 32 tests passing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 03:05:39 -06:00
parent 28c9e6fe65
commit 74d6c1092e
14 changed files with 2664 additions and 0 deletions

View File

@@ -0,0 +1,285 @@
/**
* @file useTextToSpeech.test.ts
* @description Tests for the useTextToSpeech hook that manages TTS API integration
*/
import { renderHook, act } from "@testing-library/react";
import { describe, it, expect, beforeEach, vi, afterEach } from "vitest";
import { useTextToSpeech } from "./useTextToSpeech";
import * as speechApi from "@/lib/api/speech";
// Mock the speech API module so no network requests are made.
// NOTE: vi.mock calls are hoisted by Vitest above the imports, so this
// factory runs before the module under test is loaded.
vi.mock("@/lib/api/speech", () => ({
  synthesizeSpeech: vi.fn(),
  getVoices: vi.fn(),
}));
// Mock URL.createObjectURL and URL.revokeObjectURL — the hook uses them to
// turn synthesized audio blobs into playable URLs, and the test environment
// may not provide them.
const mockCreateObjectURL = vi.fn().mockReturnValue("blob:mock-audio-url");
const mockRevokeObjectURL = vi.fn();
// Re-install the URL mocks before every test (vi.restoreAllMocks in the
// suite's afterEach would otherwise leave stale references in place).
beforeEach(() => {
  global.URL.createObjectURL = mockCreateObjectURL;
  global.URL.revokeObjectURL = mockRevokeObjectURL;
});
// Minimal stand-in for HTMLAudioElement, exposing just the surface the
// useTextToSpeech hook touches. Handlers registered through
// addEventListener are stored on on* fields so tests can fire them manually.
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 120;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;
  /** Resolves immediately and flips the paused flag, like a successful play(). */
  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }
  /** Marks playback as paused. */
  pause(): void {
    this.paused = true;
  }
  /** Records the most recently registered handler for each supported event. */
  addEventListener(event: string, handler: () => void): void {
    switch (event) {
      case "ended":
        this.onended = handler;
        break;
      case "timeupdate":
        this.ontimeupdate = handler;
        break;
      case "loadedmetadata":
        this.onloadedmetadata = handler;
        break;
      case "error":
        this.onerror = handler;
        break;
    }
  }
  /** Listener removal is irrelevant for these tests. */
  removeEventListener(): void {
    // intentionally empty
  }
}
// Replace the global Audio constructor with the mock for every test.
vi.stubGlobal("Audio", MockAudio);
// Typed handle to the mocked API function for configuring per-test behavior.
const mockSynthesizeSpeech = speechApi.synthesizeSpeech as ReturnType<typeof vi.fn>;
describe("useTextToSpeech", () => {
  beforeEach(() => {
    // Reset call history and re-arm the object-URL return value for each test
    vi.clearAllMocks();
    mockCreateObjectURL.mockReturnValue("blob:mock-audio-url");
  });
  afterEach(() => {
    vi.restoreAllMocks();
  });
  describe("initial state", () => {
    it("should return correct initial interface", () => {
      const { result } = renderHook(() => useTextToSpeech());
      expect(result.current.synthesize).toBeTypeOf("function");
      expect(result.current.play).toBeTypeOf("function");
      expect(result.current.pause).toBeTypeOf("function");
      expect(result.current.stop).toBeTypeOf("function");
      expect(result.current.audioUrl).toBeNull();
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
      expect(result.current.isPlaying).toBe(false);
      expect(result.current.duration).toBe(0);
      expect(result.current.currentTime).toBe(0);
    });
  });
  describe("synthesize", () => {
    it("should call API and return audio blob URL", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello world");
      });
      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello world",
      });
      expect(result.current.audioUrl).toBe("blob:mock-audio-url");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
    });
    it("should pass voice and tier options to API", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello", {
          voice: "alloy",
          tier: "premium",
          speed: 1.5,
        });
      });
      // Options must be forwarded to the API verbatim
      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello",
        voice: "alloy",
        tier: "premium",
        speed: 1.5,
      });
    });
    it("should set loading state while synthesizing", async () => {
      // Keep the API promise pending so the intermediate loading state
      // can be observed before resolution
      let resolvePromise: ((value: Blob) => void) | undefined;
      const pendingPromise = new Promise<Blob>((resolve) => {
        resolvePromise = resolve;
      });
      mockSynthesizeSpeech.mockReturnValueOnce(pendingPromise);
      const { result } = renderHook(() => useTextToSpeech());
      act(() => {
        void result.current.synthesize("Hello");
      });
      expect(result.current.isLoading).toBe(true);
      await act(async () => {
        resolvePromise?.(new Blob(["audio"], { type: "audio/mpeg" }));
        await pendingPromise;
      });
      expect(result.current.isLoading).toBe(false);
    });
    it("should handle API errors gracefully", async () => {
      mockSynthesizeSpeech.mockRejectedValueOnce(new Error("Synthesis failed"));
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      // The hook surfaces the error message instead of throwing
      expect(result.current.error).toBe("Synthesis failed");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.audioUrl).toBeNull();
    });
    it("should cache audio for repeated synthesis of same text", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValue(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      // First call
      await act(async () => {
        await result.current.synthesize("Hello world");
      });
      // Second call with same text
      await act(async () => {
        await result.current.synthesize("Hello world");
      });
      // API should only be called once due to caching
      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(1);
    });
    it("should not cache when options differ", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValue(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello", { voice: "alloy" });
      });
      await act(async () => {
        await result.current.synthesize("Hello", { voice: "nova" });
      });
      // Different options must produce distinct cache entries
      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(2);
    });
  });
  describe("playback controls", () => {
    it("should play audio after synthesis", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      await act(async () => {
        await result.current.play();
      });
      expect(result.current.isPlaying).toBe(true);
    });
    it("should pause audio playback", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      await act(async () => {
        await result.current.play();
      });
      act(() => {
        result.current.pause();
      });
      expect(result.current.isPlaying).toBe(false);
    });
    it("should stop and reset playback", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      await act(async () => {
        await result.current.play();
      });
      act(() => {
        result.current.stop();
      });
      // stop() pauses AND rewinds, unlike pause()
      expect(result.current.isPlaying).toBe(false);
      expect(result.current.currentTime).toBe(0);
    });
  });
  describe("cleanup", () => {
    it("should revoke object URLs on unmount", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result, unmount } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      unmount();
      // Blob URLs hold memory until revoked; unmount must release them
      expect(mockRevokeObjectURL).toHaveBeenCalled();
    });
  });
});

View File

@@ -0,0 +1,239 @@
/**
* useTextToSpeech hook
* Manages TTS API integration with synthesis, caching, and playback state
*/
import { useState, useCallback, useRef, useEffect } from "react";
import { synthesizeSpeech } from "@/lib/api/speech";
export interface SynthesizeOptions {
  /** Voice identifier forwarded to the speech API (e.g. "alloy") */
  voice?: string;
  /** Playback speed multiplier requested from the API */
  speed?: number;
  /** Audio container/codec format requested from the API */
  format?: string;
  /** Quality/pricing tier for synthesis */
  tier?: string;
}
export interface UseTextToSpeechReturn {
  /** Synthesize text to speech audio */
  synthesize: (text: string, options?: SynthesizeOptions) => Promise<void>;
  /** The URL of the synthesized audio blob */
  audioUrl: string | null;
  /** Whether synthesis is in progress */
  isLoading: boolean;
  /** Error message if synthesis failed */
  error: string | null;
  /** Start or resume audio playback */
  play: () => Promise<void>;
  /** Pause audio playback */
  pause: () => void;
  /** Stop audio and reset to beginning */
  stop: () => void;
  /** Whether audio is currently playing */
  isPlaying: boolean;
  /** Total duration of the audio in seconds */
  duration: number;
  /** Current playback position in seconds */
  currentTime: number;
}
/**
 * Cache key generator for a text + options combination.
 *
 * Option entries are sorted by key and undefined-valued entries dropped, so
 * two logically identical requests produce the same key regardless of the
 * property order the caller used. (A plain JSON.stringify of the spread
 * object is property-order-sensitive and caused avoidable cache misses.)
 */
function getCacheKey(text: string, options?: SynthesizeOptions): string {
  const entries = Object.entries(options ?? {})
    .filter(([, value]) => value !== undefined)
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
  return JSON.stringify([text, entries]);
}
/**
 * Hook for text-to-speech API integration with caching and playback controls.
 *
 * Synthesized audio blobs are cached per (text, options) combination for the
 * lifetime of the hook instance; every object URL created is revoked on
 * unmount.
 *
 * FIX: the audio event handlers are now wrapped in useCallback with empty
 * deps so their identity is stable across renders. Previously they were
 * plain function declarations re-created on every render while the
 * []-dep callbacks and the unmount effect captured a particular render's
 * instances — a fragile pairing for addEventListener/removeEventListener.
 * Also resets duration/currentTime when a new clip is set up, so stale
 * values from a previous clip no longer linger until metadata loads.
 */
export function useTextToSpeech(): UseTextToSpeechReturn {
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [isPlaying, setIsPlaying] = useState(false);
  const [duration, setDuration] = useState(0);
  const [currentTime, setCurrentTime] = useState(0);
  // Audio element ref for playback control
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Cache: maps cache key -> blob URL
  const cacheRef = useRef<Map<string, string>>(new Map());
  // Track all blob URLs for cleanup on unmount
  const blobUrlsRef = useRef<Set<string>>(new Set());
  /** Handle audio "ended": mark stopped and rewind the reported position. */
  const handleEnded = useCallback((): void => {
    setIsPlaying(false);
    setCurrentTime(0);
  }, []);
  /** Handle audio "timeupdate": mirror the element's position into state. */
  const handleTimeUpdate = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      setCurrentTime(audio.currentTime);
    }
  }, []);
  /** Handle "loadedmetadata": publish the clip duration once known. */
  const handleLoadedMetadata = useCallback((): void => {
    const audio = audioRef.current;
    // duration can be NaN/Infinity before metadata is ready or for streams
    if (audio && isFinite(audio.duration)) {
      setDuration(audio.duration);
    }
  }, []);
  /**
   * Detach listeners from the current audio element, stop it, and clear
   * the playing flag.
   */
  const cleanupAudio = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.removeEventListener("ended", handleEnded);
      audio.removeEventListener("timeupdate", handleTimeUpdate);
      audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
      audioRef.current = null;
    }
    setIsPlaying(false);
  }, [handleEnded, handleTimeUpdate, handleLoadedMetadata]);
  /**
   * Set up a new Audio element for a given URL, replacing any previous one
   * and resetting per-clip playback state.
   */
  const setupAudio = useCallback(
    (url: string): void => {
      cleanupAudio();
      // Reset per-clip state so values from the previous clip don't linger
      // until the new element's metadata/timeupdate events arrive.
      setDuration(0);
      setCurrentTime(0);
      const audio = new Audio(url);
      audio.addEventListener("ended", handleEnded);
      audio.addEventListener("timeupdate", handleTimeUpdate);
      audio.addEventListener("loadedmetadata", handleLoadedMetadata);
      audioRef.current = audio;
    },
    [cleanupAudio, handleEnded, handleTimeUpdate, handleLoadedMetadata]
  );
  /**
   * Synthesize text to speech. Serves from the per-instance cache when the
   * same text + options was synthesized before; otherwise calls the API,
   * stores the resulting blob URL, and prepares it for playback.
   * Failures are surfaced via the error state, never thrown.
   */
  const synthesize = useCallback(
    async (text: string, options?: SynthesizeOptions): Promise<void> => {
      setError(null);
      // Check cache first
      const cacheKey = getCacheKey(text, options);
      const cachedUrl = cacheRef.current.get(cacheKey);
      if (cachedUrl) {
        setAudioUrl(cachedUrl);
        setupAudio(cachedUrl);
        return;
      }
      setIsLoading(true);
      try {
        // Only forward options the caller actually provided
        const blob = await synthesizeSpeech({
          text,
          ...(options?.voice !== undefined && { voice: options.voice }),
          ...(options?.speed !== undefined && { speed: options.speed }),
          ...(options?.format !== undefined && { format: options.format }),
          ...(options?.tier !== undefined && { tier: options.tier }),
        });
        const url = URL.createObjectURL(blob);
        // Store in cache and track for cleanup
        cacheRef.current.set(cacheKey, url);
        blobUrlsRef.current.add(url);
        setAudioUrl(url);
        setupAudio(url);
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : "Speech synthesis failed";
        setError(errorMsg);
        setAudioUrl(null);
      } finally {
        setIsLoading(false);
      }
    },
    [setupAudio]
  );
  /** Start or resume audio playback; no-op when nothing is loaded. */
  const play = useCallback(async (): Promise<void> => {
    const audio = audioRef.current;
    if (audio) {
      // If play() rejects, the flag is never set — the rejection propagates
      await audio.play();
      setIsPlaying(true);
    }
  }, []);
  /** Pause audio playback, keeping the current position. */
  const pause = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      setIsPlaying(false);
    }
  }, []);
  /** Stop audio and reset the position to the beginning. */
  const stop = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.currentTime = 0;
      setIsPlaying(false);
      setCurrentTime(0);
    }
  }, []);
  // Cleanup on unmount: detach the audio element and revoke all blob URLs
  // (blob URLs hold memory until explicitly revoked).
  useEffect((): (() => void) => {
    return (): void => {
      cleanupAudio();
      for (const url of blobUrlsRef.current) {
        URL.revokeObjectURL(url);
      }
      blobUrlsRef.current.clear();
      cacheRef.current.clear();
    };
  }, [cleanupAudio]);
  return {
    synthesize,
    audioUrl,
    isLoading,
    error,
    play,
    pause,
    stop,
    isPlaying,
    duration,
    currentTime,
  };
}

View File

@@ -0,0 +1,362 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { renderHook, act, waitFor } from "@testing-library/react";
import { useVoiceInput } from "./useVoiceInput";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";
// Mock socket.io-client so no real network connections are made
vi.mock("socket.io-client");
// Mock MediaRecorder: addEventListener captures registered handlers onto
// the on* fields so tests can fire recorder events manually
const mockMediaRecorder = {
  start: vi.fn(),
  stop: vi.fn(),
  pause: vi.fn(),
  resume: vi.fn(),
  state: "inactive" as RecordingState,
  ondataavailable: null as ((event: BlobEvent) => void) | null,
  onstop: null as (() => void) | null,
  onerror: null as ((event: Event) => void) | null,
  addEventListener: vi.fn((event: string, handler: EventListenerOrEventListenerObject) => {
    if (event === "dataavailable") {
      mockMediaRecorder.ondataavailable = handler as (event: BlobEvent) => void;
    } else if (event === "stop") {
      mockMediaRecorder.onstop = handler as () => void;
    } else if (event === "error") {
      mockMediaRecorder.onerror = handler as (event: Event) => void;
    }
  }),
  removeEventListener: vi.fn(),
  stream: {
    getTracks: vi.fn(() => [{ stop: vi.fn() }]),
  },
};
// Mock AnalyserNode with getByteFrequencyData so the hook's audio level
// meter has deterministic data to read
const mockAnalyserNode = {
  fftSize: 256,
  frequencyBinCount: 128,
  getByteFrequencyData: vi.fn((array: Uint8Array) => {
    // Simulate some audio data
    for (let i = 0; i < array.length; i++) {
      array[i] = 128;
    }
  }),
  connect: vi.fn(),
  disconnect: vi.fn(),
};
// Mock MediaStreamAudioSourceNode returned by createMediaStreamSource
const mockMediaStreamSource = {
  connect: vi.fn(),
  disconnect: vi.fn(),
};
// Mock AudioContext wiring the analyser and source mocks together
const mockAudioContext = {
  createAnalyser: vi.fn(() => mockAnalyserNode),
  createMediaStreamSource: vi.fn(() => mockMediaStreamSource),
  close: vi.fn(),
  state: "running",
};
// Mock getUserMedia
const mockGetUserMedia = vi.fn();
// Set up global mocks: navigator.mediaDevices is read-only in some
// environments, so defineProperty is used instead of plain assignment
Object.defineProperty(global.navigator, "mediaDevices", {
  value: {
    getUserMedia: mockGetUserMedia,
  },
  writable: true,
  configurable: true,
});
// Mock AudioContext
vi.stubGlobal(
  "AudioContext",
  vi.fn(() => mockAudioContext)
);
// Mock MediaRecorder constructor
vi.stubGlobal(
  "MediaRecorder",
  vi.fn(() => mockMediaRecorder)
);
// Add isTypeSupported static method (queried by the hook's MIME-type probe)
(
  global.MediaRecorder as unknown as { isTypeSupported: (type: string) => boolean }
).isTypeSupported = vi.fn(() => true);
describe("useVoiceInput", (): void => {
  let mockSocket: Partial<Socket>;
  // Handlers the hook registers on the socket, keyed by event name, so
  // tests can simulate server-sent transcription events
  let socketEventHandlers: Record<string, (data: unknown) => void>;
  beforeEach((): void => {
    socketEventHandlers = {};
    mockSocket = {
      on: vi.fn((event: string, handler: (...args: unknown[]) => void) => {
        socketEventHandlers[event] = handler;
        return mockSocket;
      }) as unknown as Socket["on"],
      off: vi.fn(() => mockSocket) as unknown as Socket["off"],
      emit: vi.fn() as unknown as Socket["emit"],
      connect: vi.fn(),
      disconnect: vi.fn(),
      connected: true,
    };
    (io as unknown as ReturnType<typeof vi.fn>).mockReturnValue(mockSocket);
    // Reset MediaRecorder mock state
    mockMediaRecorder.state = "inactive";
    mockMediaRecorder.ondataavailable = null;
    mockMediaRecorder.onstop = null;
    mockMediaRecorder.onerror = null;
    // Default: getUserMedia succeeds
    const mockStream = {
      getTracks: vi.fn(() => [{ stop: vi.fn() }]),
    } as unknown as MediaStream;
    mockGetUserMedia.mockResolvedValue(mockStream);
  });
  afterEach((): void => {
    vi.clearAllMocks();
  });
  it("should return the correct interface", (): void => {
    const { result } = renderHook(() => useVoiceInput());
    expect(result.current).toHaveProperty("isRecording");
    expect(result.current).toHaveProperty("startRecording");
    expect(result.current).toHaveProperty("stopRecording");
    expect(result.current).toHaveProperty("transcript");
    expect(result.current).toHaveProperty("partialTranscript");
    expect(result.current).toHaveProperty("error");
    expect(result.current).toHaveProperty("audioLevel");
  });
  it("should start with default state", (): void => {
    const { result } = renderHook(() => useVoiceInput());
    expect(result.current.isRecording).toBe(false);
    expect(result.current.transcript).toBe("");
    expect(result.current.partialTranscript).toBe("");
    expect(result.current.error).toBeNull();
    expect(result.current.audioLevel).toBe(0);
  });
  it("should start recording when startRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(result.current.isRecording).toBe(true);
    // The hook must request a mic-optimized stream (16 kHz, cleanup filters)
    expect(mockGetUserMedia).toHaveBeenCalledWith({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 16000,
      },
    });
  });
  it("should stop recording when stopRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(result.current.isRecording).toBe(true);
    act(() => {
      result.current.stopRecording();
    });
    expect(result.current.isRecording).toBe(false);
  });
  it("should set error when microphone access is denied", async (): Promise<void> => {
    mockGetUserMedia.mockRejectedValueOnce(
      new DOMException("Permission denied", "NotAllowedError")
    );
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(result.current.isRecording).toBe(false);
    expect(result.current.error).toBeTruthy();
    // Error message should mention the microphone so the user knows what to fix
    expect(result.current.error).toContain("microphone");
  });
  it("should connect to speech WebSocket namespace", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(io).toHaveBeenCalledWith(
      expect.any(String),
      expect.objectContaining({
        path: "/socket.io",
      })
    );
  });
  it("should emit start-transcription when recording begins", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(mockSocket.emit).toHaveBeenCalledWith(
      "start-transcription",
      expect.objectContaining({
        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
        format: expect.any(String),
      })
    );
  });
  it("should emit stop-transcription when recording stops", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      result.current.stopRecording();
    });
    expect(mockSocket.emit).toHaveBeenCalledWith("stop-transcription");
  });
  it("should handle partial transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    // Simulate the server pushing a partial transcription
    act(() => {
      socketEventHandlers["transcription-partial"]?.({
        text: "hello world",
      });
    });
    await waitFor(() => {
      expect(result.current.partialTranscript).toBe("hello world");
    });
  });
  it("should handle final transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "hello world final",
      });
    });
    await waitFor(() => {
      expect(result.current.transcript).toBe("hello world final");
    });
  });
  it("should handle transcription error events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      socketEventHandlers["transcription-error"]?.({
        message: "Transcription failed",
      });
    });
    await waitFor(() => {
      expect(result.current.error).toBe("Transcription failed");
    });
  });
  it("should call onTranscript callback when final transcription received", async (): Promise<void> => {
    const onTranscript = vi.fn();
    const { result } = renderHook(() => useVoiceInput({ onTranscript }));
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "final text",
      });
    });
    await waitFor(() => {
      expect(onTranscript).toHaveBeenCalledWith("final text");
    });
  });
  it("should clean up on unmount", async (): Promise<void> => {
    const { result, unmount } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    unmount();
    // Unmounting mid-recording must tear down the socket
    expect(mockSocket.disconnect).toHaveBeenCalled();
  });
  it("should not start recording if already recording", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    // Reset the call count
    mockGetUserMedia.mockClear();
    await act(async () => {
      await result.current.startRecording();
    });
    // Should not have called getUserMedia again
    expect(mockGetUserMedia).not.toHaveBeenCalled();
  });
  describe("REST fallback", (): void => {
    it("should fall back to REST when WebSocket is unavailable", async (): Promise<void> => {
      // Simulate socket not connecting
      (mockSocket as { connected: boolean }).connected = false;
      const { result } = renderHook(() => useVoiceInput({ useWebSocket: false }));
      // Should still be able to start recording (REST mode)
      await act(async () => {
        await result.current.startRecording();
      });
      expect(result.current.isRecording).toBe(true);
    });
  });
});

View File

@@ -0,0 +1,409 @@
/**
* useVoiceInput hook
*
* Custom hook for microphone capture and real-time transcription.
* Supports WebSocket streaming for real-time partial transcriptions
* with REST upload fallback when WebSocket is unavailable.
*/
import { useState, useCallback, useRef, useEffect } from "react";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";
import { API_BASE_URL } from "@/lib/config";
import { apiPostFormData } from "@/lib/api/client";
/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired when final transcription is received */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true) */
  useWebSocket?: boolean;
  /** Audio sample rate in Hz (default: 16000) */
  sampleRate?: number;
}
/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text (updates in real-time) */
  partialTranscript: string;
  /** Error message if something went wrong */
  error: string | null;
  /** Current audio input level (0-1) */
  audioLevel: number;
}
/** Payload of the "transcription-partial" WebSocket event. */
interface TranscriptionPartialPayload {
  text: string;
}
/** Payload of the "transcription-final" WebSocket event. */
interface TranscriptionFinalPayload {
  text: string;
}
/** Payload of the "transcription-error" WebSocket event. */
interface TranscriptionErrorPayload {
  message: string;
}
/** Shape of the POST /api/speech/transcribe REST response. */
interface TranscribeResponse {
  data: {
    text: string;
  };
}
/**
 * Pick the most suitable MIME type for audio recording, preferring
 * Opus-in-WebM. Falls back to "audio/webm" when MediaRecorder is
 * unavailable (e.g. server-side rendering) or none of the candidate
 * types is supported.
 */
function getAudioMimeType(): string {
  const fallback = "audio/webm";
  if (typeof MediaRecorder === "undefined") {
    return fallback;
  }
  const candidates = [
    "audio/webm;codecs=opus",
    "audio/webm",
    "audio/ogg;codecs=opus",
    "audio/mp4",
  ];
  return candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) ?? fallback;
}
/**
 * Hook for microphone capture and real-time speech-to-text transcription.
 *
 * Uses WebSocket streaming by default for real-time partial transcriptions.
 * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
 * is disabled or unavailable.
 *
 * FIX: the failure cleanup in startRecording now also disconnects a socket
 * that may already have been opened (e.g. when MediaRecorder.start throws
 * after connectSocket succeeded) and clears the recorder ref — previously
 * the socket was leaked on that path.
 *
 * @param options - Optional final-transcript callback, transport selection
 *                  and audio sample rate.
 * @returns Recording controls plus transcript/level/error state.
 */
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
  const [isRecording, setIsRecording] = useState(false);
  const [transcript, setTranscript] = useState("");
  const [partialTranscript, setPartialTranscript] = useState("");
  const [error, setError] = useState<string | null>(null);
  const [audioLevel, setAudioLevel] = useState(0);
  // Refs to hold mutable state without re-renders
  const socketRef = useRef<Socket | null>(null);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const analyserRef = useRef<AnalyserNode | null>(null);
  const animationFrameRef = useRef<number | null>(null);
  const onTranscriptRef = useRef(onTranscript);
  const recordedChunksRef = useRef<Blob[]>([]);
  // Mirrors isRecording for use inside rAF/event callbacks without stale closures
  const isRecordingRef = useRef(false);
  // Keep callback ref up to date so socket handlers always invoke the latest one
  useEffect(() => {
    onTranscriptRef.current = onTranscript;
  }, [onTranscript]);
  /**
   * Set up audio analysis for visualizing input level.
   * Non-critical: on failure recording continues without a level meter.
   */
  const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
    try {
      const audioContext = new AudioContext();
      const analyser = audioContext.createAnalyser();
      const source = audioContext.createMediaStreamSource(stream);
      analyser.fftSize = 256;
      source.connect(analyser);
      audioContextRef.current = audioContext;
      analyserRef.current = analyser;
      // Start level monitoring on each animation frame
      const dataArray = new Uint8Array(analyser.frequencyBinCount);
      const updateLevel = (): void => {
        if (!isRecordingRef.current) {
          return;
        }
        analyser.getByteFrequencyData(dataArray);
        // Average the frequency bins and normalize to the 0-1 range
        let sum = 0;
        for (const value of dataArray) {
          sum += value;
        }
        const average = sum / dataArray.length / 255;
        setAudioLevel(average);
        animationFrameRef.current = requestAnimationFrame(updateLevel);
      };
      animationFrameRef.current = requestAnimationFrame(updateLevel);
    } catch {
      // Audio analysis is non-critical; continue without it
      console.warn("Audio analysis not available");
    }
  }, []);
  /** Clean up audio analysis resources and reset the level meter. */
  const cleanupAudioAnalysis = useCallback((): void => {
    if (animationFrameRef.current !== null) {
      cancelAnimationFrame(animationFrameRef.current);
      animationFrameRef.current = null;
    }
    if (audioContextRef.current) {
      // close() returns a promise; there is nothing to do when it settles
      void audioContextRef.current.close();
      audioContextRef.current = null;
    }
    analyserRef.current = null;
    setAudioLevel(0);
  }, []);
  /**
   * Connect to the speech WebSocket endpoint and register handlers for the
   * transcription event stream.
   */
  const connectSocket = useCallback((): Socket => {
    const socket = io(API_BASE_URL, {
      path: "/socket.io",
      transports: ["websocket", "polling"],
    });
    socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
      setPartialTranscript(data.text);
    });
    socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
      setTranscript(data.text);
      setPartialTranscript("");
      onTranscriptRef.current?.(data.text);
    });
    socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
      setError(data.message);
    });
    socketRef.current = socket;
    return socket;
  }, []);
  /** Remove transcription handlers and disconnect the WebSocket, if any. */
  const disconnectSocket = useCallback((): void => {
    if (socketRef.current) {
      socketRef.current.off("transcription-partial");
      socketRef.current.off("transcription-final");
      socketRef.current.off("transcription-error");
      socketRef.current.disconnect();
      socketRef.current = null;
    }
  }, []);
  /**
   * Send recorded audio via REST API as fallback.
   * Failures are surfaced through the hook's error state.
   */
  const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
    try {
      const formData = new FormData();
      formData.append("audio", audioBlob, "recording.webm");
      const response = await apiPostFormData<TranscribeResponse>(
        "/api/speech/transcribe",
        formData
      );
      if (response.data.text) {
        setTranscript(response.data.text);
        setPartialTranscript("");
        onTranscriptRef.current?.(response.data.text);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : "Transcription request failed";
      setError(message);
    }
  }, []);
  /** Stop all media tracks on the captured stream and release it. */
  const stopMediaTracks = useCallback((): void => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => {
        track.stop();
      });
      streamRef.current = null;
    }
  }, []);
  /**
   * Start microphone capture and transcription.
   * No-op when already recording. On failure, sets a user-facing error
   * message and releases any partially-acquired resources.
   */
  const startRecording = useCallback(async (): Promise<void> => {
    // Prevent double-start
    if (isRecordingRef.current) {
      return;
    }
    setError(null);
    setPartialTranscript("");
    recordedChunksRef.current = [];
    try {
      // Request microphone access
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          sampleRate,
        },
      });
      streamRef.current = stream;
      // Set up audio level visualization
      setupAudioAnalysis(stream);
      // Determine MIME type
      const mimeType = getAudioMimeType();
      // Create MediaRecorder
      const mediaRecorder = new MediaRecorder(stream, { mimeType });
      mediaRecorderRef.current = mediaRecorder;
      // Connect WebSocket if enabled
      let socket: Socket | null = null;
      if (useWs) {
        socket = connectSocket();
        // Emit start-transcription event so the server prepares a session
        socket.emit("start-transcription", {
          format: mimeType,
          sampleRate,
        });
      }
      // Handle audio data chunks
      mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
        if (event.data.size > 0) {
          if (socket?.connected) {
            // Stream chunks via WebSocket
            socket.emit("audio-chunk", event.data);
          } else {
            // Collect chunks for REST upload
            recordedChunksRef.current.push(event.data);
          }
        }
      });
      // Handle recording stop
      mediaRecorder.addEventListener("stop", () => {
        // If using REST fallback, send collected audio
        if (!useWs || !socket?.connected) {
          if (recordedChunksRef.current.length > 0) {
            const audioBlob = new Blob(recordedChunksRef.current, {
              type: mimeType,
            });
            void sendAudioViaRest(audioBlob);
          }
        }
      });
      // Handle errors
      mediaRecorder.addEventListener("error", () => {
        setError("Recording encountered an issue. Please try again.");
        setIsRecording(false);
        isRecordingRef.current = false;
      });
      // Start recording with timeslice for streaming chunks (250ms intervals)
      mediaRecorder.start(250);
      setIsRecording(true);
      isRecordingRef.current = true;
    } catch (err) {
      // Map specific getUserMedia failures to user-facing messages
      if (err instanceof DOMException) {
        if (err.name === "NotAllowedError") {
          setError(
            "Microphone access was not granted. Please allow microphone access to use voice input."
          );
        } else if (err.name === "NotFoundError") {
          setError("No microphone found. Please connect a microphone and try again.");
        } else {
          setError("Unable to access the microphone. Please check your device settings.");
        }
      } else {
        setError("Unable to start voice input. Please try again.");
      }
      // Clean up on failure: drop the dead recorder, release the mic and
      // analyser, and tear down any socket opened before the failure
      // (e.g. when mediaRecorder.start threw after connectSocket succeeded).
      mediaRecorderRef.current = null;
      stopMediaTracks();
      cleanupAudioAnalysis();
      disconnectSocket();
    }
  }, [
    useWs,
    sampleRate,
    setupAudioAnalysis,
    connectSocket,
    sendAudioViaRest,
    stopMediaTracks,
    cleanupAudioAnalysis,
    disconnectSocket,
  ]);
  /**
   * Stop microphone capture and transcription, notifying the server before
   * disconnecting the socket.
   */
  const stopRecording = useCallback((): void => {
    setIsRecording(false);
    isRecordingRef.current = false;
    // Stop MediaRecorder (its "stop" listener handles any REST fallback upload)
    if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }
    // Stop media tracks
    stopMediaTracks();
    // Clean up audio analysis
    cleanupAudioAnalysis();
    // Emit stop event and disconnect WebSocket
    if (socketRef.current) {
      socketRef.current.emit("stop-transcription");
      // Give the server a moment to process the final chunk before disconnecting
      setTimeout(() => {
        disconnectSocket();
      }, 500);
    }
  }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
  // Cleanup on unmount: stop any in-flight recording and release everything
  useEffect(() => {
    return (): void => {
      isRecordingRef.current = false;
      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
        mediaRecorderRef.current.stop();
      }
      stopMediaTracks();
      cleanupAudioAnalysis();
      disconnectSocket();
    };
  }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
  return {
    isRecording,
    startRecording,
    stopRecording,
    transcript,
    partialTranscript,
    error,
    audioLevel,
  };
}