feat: M13-SpeechServices — TTS & STT integration #409

Merged
jason.woltje merged 20 commits from feature/m13-speech-services into develop 2026-02-15 18:37:54 +00:00
14 changed files with 2664 additions and 0 deletions
Showing only changes of commit 74d6c1092e - Show all commits

View File

@@ -0,0 +1,178 @@
/**
* @file AudioPlayer.test.tsx
* @description Tests for the AudioPlayer component that provides inline TTS audio playback
*/
import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { AudioPlayer } from "./AudioPlayer";
// Lightweight stand-in for HTMLAudioElement so the tests can drive playback
// without a real media stack in jsdom.
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 60;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;
  /** Pretend playback started; resolves immediately like a granted play(). */
  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }
  /** Pretend playback stopped. */
  pause(): void {
    this.paused = true;
  }
  /** Capture the listener on the matching on* slot so tests can fire it manually. */
  addEventListener(event: string, listener: () => void): void {
    switch (event) {
      case "ended":
        this.onended = listener;
        break;
      case "timeupdate":
        this.ontimeupdate = listener;
        break;
      case "loadedmetadata":
        this.onloadedmetadata = listener;
        break;
      case "error":
        this.onerror = listener;
        break;
      default:
        break;
    }
  }
  /** Listener removal is irrelevant for these tests. */
  removeEventListener(): void {
    // no-op for tests
  }
}
// Install MockAudio as the global Audio constructor so `new Audio(src)`
// inside AudioPlayer resolves to the mock above.
vi.stubGlobal("Audio", MockAudio);
describe("AudioPlayer", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });
  // Static rendering of each individual control.
  describe("rendering", () => {
    it("should render play button", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      const playButton = screen.getByRole("button", { name: "Play audio" });
      expect(playButton).toBeInTheDocument();
    });
    it("should render download button", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      const downloadButton = screen.getByRole("button", { name: /download/i });
      expect(downloadButton).toBeInTheDocument();
    });
    it("should render time display showing 0:00", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      expect(screen.getByText("0:00")).toBeInTheDocument();
    });
    it("should render speed control", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      const speedButton = screen.getByRole("button", { name: "Playback speed" });
      expect(speedButton).toBeInTheDocument();
    });
    it("should render progress bar", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      const progressBar = screen.getByRole("progressbar");
      expect(progressBar).toBeInTheDocument();
    });
    // A null src is the component's "render nothing" contract.
    it("should not render when src is null", () => {
      const { container } = render(<AudioPlayer src={null} />);
      expect(container.firstChild).toBeNull();
    });
  });
  describe("play/pause", () => {
    it("should toggle to pause button when playing", async () => {
      const user = userEvent.setup();
      render(<AudioPlayer src="blob:test-audio" />);
      const playButton = screen.getByRole("button", { name: "Play audio" });
      await user.click(playButton);
      expect(screen.getByRole("button", { name: "Pause audio" })).toBeInTheDocument();
    });
  });
  describe("speed control", () => {
    // The speed button cycles 1x -> 1.5x -> 2x -> 0.5x -> back to 1x.
    it("should cycle through speed options on click", async () => {
      const user = userEvent.setup();
      render(<AudioPlayer src="blob:test-audio" />);
      const speedButton = screen.getByRole("button", { name: "Playback speed" });
      // Default should be 1x
      expect(speedButton).toHaveTextContent("1x");
      // Click to go to 1.5x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("1.5x");
      // Click to go to 2x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("2x");
      // Click to go to 0.5x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("0.5x");
      // Click to go back to 1x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("1x");
    });
  });
  describe("accessibility", () => {
    it("should have proper aria labels on controls", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      expect(screen.getByRole("button", { name: "Play audio" })).toBeInTheDocument();
      expect(screen.getByRole("button", { name: /download/i })).toBeInTheDocument();
      expect(screen.getByRole("button", { name: "Playback speed" })).toBeInTheDocument();
      expect(screen.getByRole("progressbar")).toHaveAttribute("aria-label");
    });
    it("should have region role on the player container", () => {
      render(<AudioPlayer src="blob:test-audio" />);
      expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument();
    });
  });
  describe("design", () => {
    // PDA-friendly palette: no red Tailwind utility classes anywhere in the tree.
    it("should not use aggressive red colors", () => {
      const { container } = render(<AudioPlayer src="blob:test-audio" />);
      const allElements = container.querySelectorAll("*");
      allElements.forEach((el) => {
        const className = el.className;
        // className can be an SVGAnimatedString on SVG nodes, so only
        // plain-string values are matched here.
        if (typeof className === "string") {
          expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
        }
      });
    });
  });
  describe("callbacks", () => {
    it("should call onPlayStateChange when play state changes", async () => {
      const onPlayStateChange = vi.fn();
      const user = userEvent.setup();
      render(<AudioPlayer src="blob:test-audio" onPlayStateChange={onPlayStateChange} />);
      const playButton = screen.getByRole("button", { name: "Play audio" });
      await user.click(playButton);
      expect(onPlayStateChange).toHaveBeenCalledWith(true);
    });
  });
});

View File

@@ -0,0 +1,250 @@
/**
* AudioPlayer Component
* Inline audio player for TTS content with play/pause, progress,
* speed control, download, and duration display.
*
* Follows PDA-friendly design: no aggressive colors, calm interface.
*/
import { useState, useRef, useEffect, useCallback } from "react";
import type { ReactElement } from "react";
/** Playback speed options, in the order the speed button cycles through them. */
const SPEED_OPTIONS = [1, 1.5, 2, 0.5] as const;
/** Props for the AudioPlayer component. */
export interface AudioPlayerProps {
  /** URL of the audio to play (blob URL or HTTP URL). If null, nothing renders. */
  src: string | null;
  /** Whether to auto-play when src changes (default: false) */
  autoPlay?: boolean;
  /** Callback when play state changes; receives true on play, false on pause/end */
  onPlayStateChange?: (isPlaying: boolean) => void;
  /** Optional className for the container */
  className?: string;
}
/**
 * Format a duration in seconds as `M:SS` (e.g. 75.5 -> "1:15").
 * Non-finite or negative input renders as "0:00".
 */
function formatTime(seconds: number): string {
  if (!isFinite(seconds) || seconds < 0) return "0:00";
  const whole = Math.floor(seconds);
  const minutes = Math.floor(whole / 60);
  const remainder = whole % 60;
  return `${String(minutes)}:${String(remainder).padStart(2, "0")}`;
}
/**
 * AudioPlayer displays an inline audio player with controls for
 * play/pause, progress tracking, speed adjustment, and download.
 *
 * Renders nothing when `src` is null.
 */
export function AudioPlayer({
  src,
  autoPlay = false,
  onPlayStateChange,
  className = "",
}: AudioPlayerProps): ReactElement | null {
  const [isPlaying, setIsPlaying] = useState(false);
  const [currentTime, setCurrentTime] = useState(0);
  const [duration, setDuration] = useState(0);
  const [speedIndex, setSpeedIndex] = useState(0);
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Latest play-state callback. Kept in a ref so a new function identity from
  // the parent (e.g. an inline arrow prop) does not re-run the setup effect
  // below — previously that tore down and re-created the audio element on
  // every parent render, losing playback position.
  const onPlayStateChangeRef = useRef(onPlayStateChange);
  useEffect((): void => {
    onPlayStateChangeRef.current = onPlayStateChange;
  }, [onPlayStateChange]);
  // Mirror of speedIndex so a newly created audio element can pick up the
  // currently selected speed without the setup effect depending on speedIndex.
  const speedIndexRef = useRef(0);
  /**
   * Create a fresh audio element whenever src changes; tear it down on cleanup.
   */
  useEffect((): (() => void) | undefined => {
    if (!src) return undefined;
    // Clear state left over from a previous source so the UI does not show a
    // stale time/duration or a "playing" state for audio that no longer exists.
    setIsPlaying(false);
    setCurrentTime(0);
    setDuration(0);
    const audio = new Audio(src);
    // Carry the user's chosen speed over to the new element (previously a new
    // source always played at 1x while the button still showed the old speed).
    audio.playbackRate = SPEED_OPTIONS[speedIndexRef.current] ?? 1;
    audioRef.current = audio;
    const onLoadedMetadata = (): void => {
      // duration can be NaN/Infinity for streams before metadata settles.
      if (isFinite(audio.duration)) {
        setDuration(audio.duration);
      }
    };
    const onTimeUpdate = (): void => {
      setCurrentTime(audio.currentTime);
    };
    const onEnded = (): void => {
      setIsPlaying(false);
      setCurrentTime(0);
      onPlayStateChangeRef.current?.(false);
    };
    audio.addEventListener("loadedmetadata", onLoadedMetadata);
    audio.addEventListener("timeupdate", onTimeUpdate);
    audio.addEventListener("ended", onEnded);
    if (autoPlay) {
      void audio
        .play()
        .then(() => {
          setIsPlaying(true);
          onPlayStateChangeRef.current?.(true);
        })
        .catch(() => {
          // Autoplay may be blocked by browser policy; stay paused instead of
          // surfacing an unhandled promise rejection.
        });
    }
    return (): void => {
      audio.pause();
      audio.removeEventListener("loadedmetadata", onLoadedMetadata);
      audio.removeEventListener("timeupdate", onTimeUpdate);
      audio.removeEventListener("ended", onEnded);
      audioRef.current = null;
    };
  }, [src, autoPlay]);
  /**
   * Toggle play/pause. play() can reject (e.g. the browser requires a user
   * gesture); in that case the paused UI state is kept.
   */
  const togglePlayPause = useCallback(async (): Promise<void> => {
    const audio = audioRef.current;
    if (!audio) return;
    if (isPlaying) {
      audio.pause();
      setIsPlaying(false);
      onPlayStateChangeRef.current?.(false);
    } else {
      try {
        await audio.play();
      } catch {
        // Playback was refused; do not flip the UI into the playing state.
        return;
      }
      setIsPlaying(true);
      onPlayStateChangeRef.current?.(true);
    }
  }, [isPlaying]);
  /**
   * Cycle to the next entry in SPEED_OPTIONS and apply it immediately.
   */
  const cycleSpeed = useCallback((): void => {
    const nextIndex = (speedIndexRef.current + 1) % SPEED_OPTIONS.length;
    speedIndexRef.current = nextIndex;
    setSpeedIndex(nextIndex);
    const audio = audioRef.current;
    if (audio) {
      audio.playbackRate = SPEED_OPTIONS[nextIndex] ?? 1;
    }
  }, []);
  /**
   * Seek within the track based on where the progress bar was clicked.
   */
  const handleProgressClick = useCallback(
    (event: React.MouseEvent<HTMLDivElement>): void => {
      const audio = audioRef.current;
      if (!audio || !duration) return;
      const rect = event.currentTarget.getBoundingClientRect();
      const clickX = event.clientX - rect.left;
      const fraction = clickX / rect.width;
      audio.currentTime = fraction * duration;
      setCurrentTime(audio.currentTime);
    },
    [duration]
  );
  /**
   * Trigger a browser download of the current source via a temporary anchor.
   */
  const handleDownload = useCallback((): void => {
    if (!src) return;
    const link = document.createElement("a");
    link.href = src;
    link.download = "speech-audio.mp3";
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
  }, [src]);
  // Don't render if no source
  if (!src) return null;
  const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
  const currentSpeed = SPEED_OPTIONS[speedIndex] ?? 1;
  return (
    <div
      role="region"
      aria-label="Audio player"
      className={`flex items-center gap-2 rounded-lg border border-gray-200 bg-gray-50 px-3 py-2 ${className}`}
    >
      {/* Play/Pause Button */}
      <button
        type="button"
        onClick={() => void togglePlayPause()}
        aria-label={isPlaying ? "Pause audio" : "Play audio"}
        className="flex h-8 w-8 shrink-0 items-center justify-center rounded-full bg-blue-500 text-white transition-colors hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-blue-300"
      >
        {isPlaying ? (
          <svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
            <rect x="6" y="4" width="4" height="16" rx="1" />
            <rect x="14" y="4" width="4" height="16" rx="1" />
          </svg>
        ) : (
          <svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
            <polygon points="6,4 20,12 6,20" />
          </svg>
        )}
      </button>
      {/* Time Display */}
      <span className="min-w-[3.5rem] text-xs text-gray-500 tabular-nums">
        {formatTime(currentTime)}
        {duration > 0 && <span className="text-gray-400"> / {formatTime(duration)}</span>}
      </span>
      {/* Progress Bar (click to seek) */}
      <div
        role="progressbar"
        aria-label="Audio progress"
        aria-valuenow={Math.round(progress)}
        aria-valuemin={0}
        aria-valuemax={100}
        className="relative h-1.5 flex-1 cursor-pointer rounded-full bg-gray-200"
        onClick={handleProgressClick}
      >
        <div
          className="absolute left-0 top-0 h-full rounded-full bg-blue-400 transition-all"
          style={{ width: `${String(Math.min(progress, 100))}%` }}
        />
      </div>
      {/* Speed Control */}
      <button
        type="button"
        onClick={cycleSpeed}
        aria-label="Playback speed"
        className="min-w-[2.5rem] rounded px-1.5 py-0.5 text-xs font-medium text-gray-600 transition-colors hover:bg-gray-200 focus:outline-none focus:ring-2 focus:ring-blue-300"
      >
        {String(currentSpeed)}x
      </button>
      {/* Download Button */}
      <button
        type="button"
        onClick={handleDownload}
        aria-label="Download audio"
        className="flex h-7 w-7 shrink-0 items-center justify-center rounded text-gray-500 transition-colors hover:bg-gray-200 hover:text-gray-700 focus:outline-none focus:ring-2 focus:ring-blue-300"
      >
        <svg
          width="14"
          height="14"
          viewBox="0 0 24 24"
          fill="none"
          stroke="currentColor"
          strokeWidth="2"
          strokeLinecap="round"
          strokeLinejoin="round"
          aria-hidden="true"
        >
          <path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4" />
          <polyline points="7 10 12 15 17 10" />
          <line x1="12" y1="15" x2="12" y2="3" />
        </svg>
      </button>
    </div>
  );
}
export default AudioPlayer;

View File

@@ -0,0 +1,70 @@
import { describe, it, expect } from "vitest";
import { render, screen } from "@testing-library/react";
import { AudioVisualizer } from "./AudioVisualizer";
describe("AudioVisualizer", (): void => {
  it("should render the visualizer container", (): void => {
    render(<AudioVisualizer audioLevel={0} isActive={false} />);
    expect(screen.getByTestId("audio-visualizer")).toBeInTheDocument();
  });
  it("should render visualization bars", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} />);
    expect(screen.getAllByTestId("visualizer-bar").length).toBeGreaterThan(0);
  });
  it("should show inactive state when not active", (): void => {
    render(<AudioVisualizer audioLevel={0} isActive={false} />);
    expect(screen.getByTestId("audio-visualizer")).toBeInTheDocument();
    // Even when idle, every bar keeps an inline height (the minimum).
    for (const bar of screen.getAllByTestId("visualizer-bar")) {
      expect(bar.getAttribute("style")).toContain("height");
    }
  });
  it("should reflect audio level in bar heights when active", (): void => {
    render(<AudioVisualizer audioLevel={0.8} isActive={true} />);
    const bars = screen.getAllByTestId("visualizer-bar");
    // At a high level, at least one bar must exceed the 4px idle height.
    const grewPastIdle = bars.some((bar) => {
      const inlineStyle = bar.getAttribute("style") ?? "";
      const match = /height:\s*(\d+)/.exec(inlineStyle);
      return match?.[1] ? parseInt(match[1], 10) > 4 : false;
    });
    expect(grewPastIdle).toBe(true);
  });
  it("should use calm colors (no aggressive reds)", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} />);
    const root = screen.getByTestId("audio-visualizer");
    for (const node of Array.from(root.querySelectorAll("*"))) {
      expect((node as HTMLElement).className).not.toMatch(/bg-red-|text-red-/);
    }
  });
  it("should accept custom className", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} className="custom-class" />);
    expect(screen.getByTestId("audio-visualizer").className).toContain("custom-class");
  });
  it("should render with configurable bar count", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} barCount={8} />);
    expect(screen.getAllByTestId("visualizer-bar")).toHaveLength(8);
  });
});

View File

@@ -0,0 +1,87 @@
/**
* AudioVisualizer component
*
* Displays a simple audio level visualization using bars.
* Uses the Web Audio API's AnalyserNode data (passed as audioLevel)
* to show microphone input levels during recording.
*
* Design: Calm, non-aggressive colors following PDA-friendly guidelines.
*/
import { useMemo } from "react";
/** Props for the AudioVisualizer component. */
export interface AudioVisualizerProps {
  /** Current audio level, normalized to the range 0-1 */
  audioLevel: number;
  /** Whether the visualizer is actively listening; when false, bars collapse to minimum height */
  isActive: boolean;
  /** Number of bars to display (default: 5) */
  barCount?: number;
  /** Additional CSS classes */
  className?: string;
}
/**
 * Generate bar heights (in px) based on an audio level.
 * Creates a natural-looking wave pattern where center bars are taller.
 *
 * @param level Audio level in [0, 1]; scales bars between 4px and 24px.
 * @param count Number of bars (>= 1).
 * @returns One rounded pixel height per bar.
 */
function generateBarHeights(level: number, count: number): number[] {
  // Min height 4px, max height 24px when fully active.
  const minHeight = 4;
  const maxHeight = 24;
  const heights: number[] = [];
  const center = (count - 1) / 2;
  for (let i = 0; i < count; i++) {
    // Distance from center, normalized to 0-1. Guard the single-bar case:
    // center is 0 there, and dividing by it produced NaN heights before.
    const distFromCenter = center === 0 ? 0 : Math.abs(i - center) / center;
    // Center bars are taller, edge bars shorter
    const multiplier = 1 - distFromCenter * 0.5;
    const height = minHeight + level * (maxHeight - minHeight) * multiplier;
    heights.push(Math.round(height));
  }
  return heights;
}
/**
 * Audio level visualizer with animated bars.
 * Shows microphone input levels during voice recording.
 */
export function AudioVisualizer({
  audioLevel,
  isActive,
  barCount = 5,
  className = "",
}: AudioVisualizerProps): React.JSX.Element {
  // Idle bars collapse to the 4px minimum; active bars scale with the level.
  const barHeights = useMemo(
    () =>
      isActive
        ? generateBarHeights(audioLevel, barCount)
        : Array.from({ length: barCount }, () => 4),
    [audioLevel, isActive, barCount]
  );
  const label = isActive
    ? `Audio level: ${String(Math.round(audioLevel * 100))}%`
    : "Audio visualizer inactive";
  // Calm palette only (PDA-friendly): sky when active, muted slate when idle.
  const barTone = isActive ? "bg-sky-400" : "bg-slate-300 dark:bg-slate-600";
  return (
    <div
      data-testid="audio-visualizer"
      className={`flex items-center gap-0.5 ${className}`}
      role="img"
      aria-label={label}
    >
      {barHeights.map((height, index) => (
        <div
          key={index}
          data-testid="visualizer-bar"
          className={`w-1 rounded-full transition-all duration-150 ease-out ${barTone}`}
          style={{ height: `${height.toString()}px` }}
        />
      ))}
    </div>
  );
}

View File

@@ -0,0 +1,218 @@
/**
* @file TextToSpeechButton.test.tsx
* @description Tests for the TextToSpeechButton "Read aloud" component
*/
import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { TextToSpeechButton } from "./TextToSpeechButton";
// Mock the useTextToSpeech hook
const mockSynthesize = vi.fn();
const mockPlay = vi.fn();
const mockPause = vi.fn();
const mockStop = vi.fn();
// vitest hoists vi.mock above the imports; the factory may only reference
// the lazily evaluated vi.fn() handles declared above.
vi.mock("@/hooks/useTextToSpeech", () => ({
  useTextToSpeech: vi.fn(() => ({
    synthesize: mockSynthesize,
    play: mockPlay,
    pause: mockPause,
    stop: mockStop,
    audioUrl: null,
    isLoading: false,
    error: null,
    isPlaying: false,
    duration: 0,
    currentTime: 0,
  })),
}));
// Import after mocking
import { useTextToSpeech } from "@/hooks/useTextToSpeech";
// Use vi.mocked() rather than the previous unsafe
// `as ReturnType<typeof vi.fn>` cast: it keeps the hook's signature on the
// mock and matches how VoiceInput.test.tsx wraps its mocked hook.
const mockUseTextToSpeech = vi.mocked(useTextToSpeech);
// Minimal HTMLAudioElement substitute so the AudioPlayer rendered inside
// TextToSpeechButton can mount without a real media implementation.
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 60;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;
  /** Flip to the playing state and resolve, mimicking a granted play(). */
  async play(): Promise<void> {
    this.paused = false;
  }
  /** Flip back to the paused state. */
  pause(): void {
    this.paused = true;
  }
  /** Event wiring is irrelevant for these tests. */
  addEventListener(): void {
    // no-op
  }
  /** Event wiring is irrelevant for these tests. */
  removeEventListener(): void {
    // no-op
  }
}
// Install MockAudio globally for the AudioPlayer embedded in TextToSpeechButton.
vi.stubGlobal("Audio", MockAudio);
describe("TextToSpeechButton", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    // Restore the idle hook state before every test; individual tests
    // override this to simulate loading/success/error states.
    mockUseTextToSpeech.mockReturnValue({
      synthesize: mockSynthesize,
      play: mockPlay,
      pause: mockPause,
      stop: mockStop,
      audioUrl: null,
      isLoading: false,
      error: null,
      isPlaying: false,
      duration: 0,
      currentTime: 0,
    });
  });
  describe("rendering", () => {
    it("should render a read aloud button", () => {
      render(<TextToSpeechButton text="Hello world" />);
      const button = screen.getByRole("button", { name: /read aloud/i });
      expect(button).toBeInTheDocument();
    });
    it("should not render AudioPlayer initially when no audio is synthesized", () => {
      render(<TextToSpeechButton text="Hello world" />);
      expect(screen.queryByRole("region", { name: /audio player/i })).not.toBeInTheDocument();
    });
  });
  describe("click behavior", () => {
    it("should call synthesize with text on click", async () => {
      const user = userEvent.setup();
      mockSynthesize.mockResolvedValueOnce(undefined);
      render(<TextToSpeechButton text="Hello world" />);
      const button = screen.getByRole("button", { name: /read aloud/i });
      await user.click(button);
      // No voice/tier props: the options argument must be exactly undefined.
      expect(mockSynthesize).toHaveBeenCalledWith("Hello world", undefined);
    });
    it("should pass voice and tier options when provided", async () => {
      const user = userEvent.setup();
      mockSynthesize.mockResolvedValueOnce(undefined);
      render(<TextToSpeechButton text="Hello" voice="alloy" tier="premium" />);
      const button = screen.getByRole("button", { name: /read aloud/i });
      await user.click(button);
      expect(mockSynthesize).toHaveBeenCalledWith("Hello", {
        voice: "alloy",
        tier: "premium",
      });
    });
  });
  describe("loading state", () => {
    it("should show loading indicator while synthesizing", () => {
      mockUseTextToSpeech.mockReturnValue({
        synthesize: mockSynthesize,
        play: mockPlay,
        pause: mockPause,
        stop: mockStop,
        audioUrl: null,
        isLoading: true,
        error: null,
        isPlaying: false,
        duration: 0,
        currentTime: 0,
      });
      render(<TextToSpeechButton text="Hello world" />);
      // While loading, the button's aria-label switches to "Synthesizing speech".
      const button = screen.getByRole("button", { name: /synthesizing/i });
      expect(button).toBeInTheDocument();
      expect(button).toBeDisabled();
    });
  });
  describe("audio player integration", () => {
    it("should show AudioPlayer when audio is available", () => {
      mockUseTextToSpeech.mockReturnValue({
        synthesize: mockSynthesize,
        play: mockPlay,
        pause: mockPause,
        stop: mockStop,
        audioUrl: "blob:mock-url",
        isLoading: false,
        error: null,
        isPlaying: false,
        duration: 30,
        currentTime: 0,
      });
      render(<TextToSpeechButton text="Hello world" />);
      expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument();
    });
  });
  describe("error state", () => {
    it("should display error message when synthesis fails", () => {
      mockUseTextToSpeech.mockReturnValue({
        synthesize: mockSynthesize,
        play: mockPlay,
        pause: mockPause,
        stop: mockStop,
        audioUrl: null,
        isLoading: false,
        error: "Synthesis failed",
        isPlaying: false,
        duration: 0,
        currentTime: 0,
      });
      render(<TextToSpeechButton text="Hello world" />);
      expect(screen.getByText(/synthesis failed/i)).toBeInTheDocument();
    });
  });
  describe("accessibility", () => {
    it("should have proper aria label on button", () => {
      render(<TextToSpeechButton text="Hello world" />);
      const button = screen.getByRole("button", { name: /read aloud/i });
      expect(button).toBeInTheDocument();
    });
  });
  describe("design", () => {
    // PDA-friendly palette: no red Tailwind utility classes anywhere.
    it("should not use aggressive colors", () => {
      const { container } = render(<TextToSpeechButton text="Hello world" />);
      const allElements = container.querySelectorAll("*");
      allElements.forEach((el) => {
        const className = el.className;
        // className can be an SVGAnimatedString on SVG nodes, so only
        // plain-string values are matched here.
        if (typeof className === "string") {
          expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
        }
      });
    });
  });
});

View File

@@ -0,0 +1,126 @@
/**
* TextToSpeechButton Component
* "Read aloud" button that synthesizes text and plays it via AudioPlayer.
*
* Accepts text as a prop, with optional voice and tier selection.
* Shows loading state during synthesis and integrates AudioPlayer for playback.
*
* Follows PDA-friendly design: no aggressive colors, calm interface.
*/
import { useCallback } from "react";
import type { ReactElement } from "react";
import { useTextToSpeech } from "@/hooks/useTextToSpeech";
import type { SynthesizeOptions } from "@/hooks/useTextToSpeech";
import { AudioPlayer } from "./AudioPlayer";
/** Props for the TextToSpeechButton component. */
export interface TextToSpeechButtonProps {
  /** The text to synthesize to speech */
  text: string;
  /** Optional voice ID to use; omitted from the synthesize options when undefined */
  voice?: string;
  /** Optional tier (e.g. "standard", "premium"); omitted from the synthesize options when undefined */
  tier?: string;
  /** Optional className for the container */
  className?: string;
}
/**
* TextToSpeechButton provides a "Read aloud" button that synthesizes
* the given text and displays an AudioPlayer for playback control.
*/
export function TextToSpeechButton({
text,
voice,
tier,
className = "",
}: TextToSpeechButtonProps): ReactElement {
const { synthesize, audioUrl, isLoading, error } = useTextToSpeech();
/**
* Handle read aloud button click
*/
const handleClick = useCallback(async (): Promise<void> => {
let options: SynthesizeOptions | undefined;
if (voice !== undefined || tier !== undefined) {
options = {};
if (voice !== undefined) options.voice = voice;
if (tier !== undefined) options.tier = tier;
}
await synthesize(text, options);
}, [text, voice, tier, synthesize]);
return (
<div className={`flex flex-col gap-2 ${className}`}>
{/* Read Aloud Button */}
<button
type="button"
onClick={() => void handleClick()}
disabled={isLoading}
aria-label={isLoading ? "Synthesizing speech" : "Read aloud"}
className="inline-flex items-center gap-2 rounded-lg border border-gray-200 bg-white px-3 py-1.5 text-sm font-medium text-gray-700 transition-colors hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-blue-300 disabled:cursor-not-allowed disabled:opacity-50"
>
{isLoading ? (
<>
{/* Spinner */}
<svg
className="h-4 w-4 animate-spin text-gray-500"
viewBox="0 0 24 24"
fill="none"
aria-hidden="true"
>
<circle
cx="12"
cy="12"
r="10"
stroke="currentColor"
strokeWidth="3"
className="opacity-25"
/>
<path
fill="currentColor"
d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
className="opacity-75"
/>
</svg>
<span>Synthesizing...</span>
</>
) : (
<>
{/* Speaker Icon */}
<svg
width="16"
height="16"
viewBox="0 0 24 24"
fill="none"
stroke="currentColor"
strokeWidth="2"
strokeLinecap="round"
strokeLinejoin="round"
aria-hidden="true"
>
<polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
<path d="M15.54 8.46a5 5 0 010 7.07" />
<path d="M19.07 4.93a10 10 0 010 14.14" />
</svg>
<span>Read aloud</span>
</>
)}
</button>
{/* Error Display */}
{error && (
<p className="text-sm text-amber-600" role="alert">
{error}
</p>
)}
{/* Audio Player (shown after synthesis) */}
{audioUrl && <AudioPlayer src={audioUrl} />}
</div>
);
}
export default TextToSpeechButton;

View File

@@ -0,0 +1,228 @@
import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { VoiceInput } from "./VoiceInput";
// Mock the useVoiceInput hook
const mockStartRecording = vi.fn();
const mockStopRecording = vi.fn();
// vitest hoists vi.mock above the imports; the factory may only reference
// the lazily evaluated vi.fn() handles declared above.
vi.mock("@/hooks/useVoiceInput", () => ({
  useVoiceInput: vi.fn(() => ({
    isRecording: false,
    startRecording: mockStartRecording,
    stopRecording: mockStopRecording,
    transcript: "",
    partialTranscript: "",
    error: null,
    audioLevel: 0,
  })),
}));
// We need to import after mocking
import { useVoiceInput } from "@/hooks/useVoiceInput";
describe("VoiceInput", (): void => {
  beforeEach((): void => {
    vi.clearAllMocks();
    // Reset mock implementation to default (idle, not recording); individual
    // tests override this to simulate recording/transcript/error states.
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0,
    });
  });
  it("should render a microphone button", (): void => {
    render(<VoiceInput />);
    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    expect(button).toBeInTheDocument();
  });
  it("should have accessible aria label", (): void => {
    render(<VoiceInput />);
    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    expect(button).toHaveAttribute("aria-label", "Start voice input");
  });
  it("should call startRecording when mic button is clicked", async (): Promise<void> => {
    const user = userEvent.setup();
    render(<VoiceInput />);
    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    await user.click(button);
    expect(mockStartRecording).toHaveBeenCalledTimes(1);
  });
  it("should show recording state when isRecording is true", (): void => {
    // While recording, the button's aria-label flips to "Stop voice input".
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0.5,
    });
    render(<VoiceInput />);
    const button = screen.getByRole("button", {
      name: /stop voice input/i,
    });
    expect(button).toBeInTheDocument();
  });
  it("should call stopRecording when mic button is clicked while recording", async (): Promise<void> => {
    const user = userEvent.setup();
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0.5,
    });
    render(<VoiceInput />);
    const button = screen.getByRole("button", {
      name: /stop voice input/i,
    });
    await user.click(button);
    expect(mockStopRecording).toHaveBeenCalledTimes(1);
  });
  it("should display partial transcription text", (): void => {
    // While recording, the interim (partial) transcript is shown.
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "hello worl",
      error: null,
      audioLevel: 0.3,
    });
    render(<VoiceInput />);
    expect(screen.getByText("hello worl")).toBeInTheDocument();
  });
  it("should display final transcript text", (): void => {
    // After recording stops, the final transcript is shown.
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "hello world",
      partialTranscript: "",
      error: null,
      audioLevel: 0,
    });
    render(<VoiceInput />);
    expect(screen.getByText("hello world")).toBeInTheDocument();
  });
  it("should display error message", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: "Microphone access not available",
      audioLevel: 0,
    });
    render(<VoiceInput />);
    expect(screen.getByText("Microphone access not available")).toBeInTheDocument();
  });
  it("should call onTranscript callback prop", (): void => {
    const onTranscript = vi.fn();
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "final text",
      partialTranscript: "",
      error: null,
      audioLevel: 0,
    });
    render(<VoiceInput onTranscript={onTranscript} />);
    // The onTranscript prop is passed to the hook - we verify the prop is accepted
    expect(useVoiceInput).toHaveBeenCalledWith(
      expect.objectContaining({
        onTranscript,
      })
    );
  });
  it("should use calm, non-aggressive design for recording indicator", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0.5,
    });
    render(<VoiceInput />);
    // Check there are no aggressive red colors in the recording state
    const button = screen.getByRole("button", { name: /stop voice input/i });
    const className = button.className;
    expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
  });
  it("should use calm design for error display", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: "Something went wrong",
      audioLevel: 0,
    });
    render(<VoiceInput />);
    // Errors should use a muted palette (amber), never red classes.
    const errorEl = screen.getByText("Something went wrong");
    const className = errorEl.className;
    expect(className).not.toMatch(/text-red-600|bg-red-/);
  });
  it("should be disabled when disabled prop is true", (): void => {
    render(<VoiceInput disabled />);
    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    expect(button).toBeDisabled();
  });
});

View File

@@ -0,0 +1,146 @@
/**
* VoiceInput component
*
* Provides a microphone button with visual feedback for voice input.
* Click to start/stop recording with real-time transcription display.
*
* Design principles:
* - PDA-friendly: calm, non-aggressive colors
* - Gentle pulsing animation for recording state (blue/green)
* - Mobile-friendly touch interaction
* - Accessible with proper aria labels
*/
import { useVoiceInput } from "@/hooks/useVoiceInput";
import type { UseVoiceInputOptions } from "@/hooks/useVoiceInput";
import { AudioVisualizer } from "./AudioVisualizer";
import { Mic, MicOff } from "lucide-react";
/** Props for the VoiceInput component. */
export interface VoiceInputProps {
  /** Callback fired when final transcription is received; forwarded to useVoiceInput */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true); forwarded to useVoiceInput */
  useWebSocket?: boolean;
  /** Whether the input is disabled (default: false) */
  disabled?: boolean;
  /** Additional CSS classes for the container */
  className?: string;
}
/**
 * Voice input component with microphone capture and real-time transcription.
 * Shows a mic button that toggles recording, with visual feedback
 * and transcription text display.
 *
 * State (recording flag, transcripts, error, audio level) comes entirely from
 * the useVoiceInput hook; this component is presentation + click wiring only.
 */
export function VoiceInput({
  onTranscript,
  useWebSocket: useWs,
  disabled = false,
  className = "",
}: VoiceInputProps): React.JSX.Element {
  // Build the options object conditionally so undefined props are omitted
  // entirely rather than passed as explicit `undefined` keys.
  const hookOptions: UseVoiceInputOptions = {};
  if (onTranscript !== undefined) {
    hookOptions.onTranscript = onTranscript;
  }
  if (useWs !== undefined) {
    hookOptions.useWebSocket = useWs;
  }
  const {
    isRecording,
    startRecording,
    stopRecording,
    transcript,
    partialTranscript,
    error,
    audioLevel,
  } = useVoiceInput(hookOptions);
  // Toggle recording. startRecording is async (mic-permission prompt); it is
  // fired with `void` since failures surface through the hook's `error` state.
  const handleClick = (): void => {
    if (isRecording) {
      stopRecording();
    } else {
      void startRecording();
    }
  };
  // While recording, show the live partial transcript; afterwards the final one.
  const displayText = isRecording ? partialTranscript : transcript;
  return (
    <div className={`flex flex-col items-center gap-3 ${className}`}>
      {/* Mic button with recording indicator */}
      <div className="relative flex items-center gap-2">
        {/* Pulsing ring animation when recording */}
        {isRecording && (
          <div
            className="absolute inset-0 -m-1 rounded-full bg-sky-400/20 animate-pulse"
            aria-hidden="true"
          />
        )}
        <button
          type="button"
          onClick={handleClick}
          disabled={disabled}
          aria-label={isRecording ? "Stop voice input" : "Start voice input"}
          className={`
            relative z-10 flex items-center justify-center
            w-10 h-10 rounded-full transition-all duration-200
            focus:outline-none focus:ring-2 focus:ring-sky-400 focus:ring-offset-2
            disabled:opacity-50 disabled:cursor-not-allowed
            ${
              isRecording
                ? "bg-sky-500 text-white hover:bg-sky-600 shadow-md"
                : "bg-slate-100 text-slate-600 hover:bg-slate-200 dark:bg-slate-700 dark:text-slate-300 dark:hover:bg-slate-600"
            }
          `}
        >
          {/* Icons are decorative; the aria-label on the button carries meaning */}
          {isRecording ? (
            <MicOff className="w-5 h-5" aria-hidden="true" />
          ) : (
            <Mic className="w-5 h-5" aria-hidden="true" />
          )}
        </button>
        {/* Audio level visualizer - shown during recording */}
        {isRecording && (
          <AudioVisualizer audioLevel={audioLevel} isActive={isRecording} barCount={5} />
        )}
      </div>
      {/* Recording status indicator */}
      {isRecording && (
        <div className="flex items-center gap-1.5 text-xs text-sky-600 dark:text-sky-400">
          <span className="w-2 h-2 rounded-full bg-sky-500 animate-pulse" aria-hidden="true" />
          <span>Listening...</span>
        </div>
      )}
      {/* Transcription text display */}
      {displayText && (
        <p
          className={`
            text-sm max-w-md text-center px-3 py-1.5 rounded-lg
            ${
              isRecording
                ? "text-slate-500 dark:text-slate-400 bg-slate-50 dark:bg-slate-800/50 italic"
                : "text-slate-700 dark:text-slate-200 bg-slate-100 dark:bg-slate-800"
            }
          `}
        >
          {displayText}
        </p>
      )}
      {/* Error display - calm, non-aggressive (amber, not red, per PDA-friendly design) */}
      {error && (
        <p
          className="text-sm text-amber-700 dark:text-amber-400 bg-amber-50 dark:bg-amber-900/20 px-3 py-1.5 rounded-lg max-w-md text-center"
          role="alert"
        >
          {error}
        </p>
      )}
    </div>
  );
}

View File

@@ -0,0 +1,8 @@
// Barrel file for the speech UI building blocks.
// Speech-to-text (microphone capture + visualization)
export { VoiceInput } from "./VoiceInput";
export type { VoiceInputProps } from "./VoiceInput";
export { AudioVisualizer } from "./AudioVisualizer";
export type { AudioVisualizerProps } from "./AudioVisualizer";
// Text-to-speech (playback controls)
export { AudioPlayer } from "./AudioPlayer";
export type { AudioPlayerProps } from "./AudioPlayer";
export { TextToSpeechButton } from "./TextToSpeechButton";
export type { TextToSpeechButtonProps } from "./TextToSpeechButton";

View File

@@ -0,0 +1,285 @@
/**
* @file useTextToSpeech.test.ts
* @description Tests for the useTextToSpeech hook that manages TTS API integration
*/
import { renderHook, act } from "@testing-library/react";
import { describe, it, expect, beforeEach, vi, afterEach } from "vitest";
import { useTextToSpeech } from "./useTextToSpeech";
import * as speechApi from "@/lib/api/speech";
// Mock the speech API module so no network requests are made
vi.mock("@/lib/api/speech", () => ({
  synthesizeSpeech: vi.fn(),
  getVoices: vi.fn(),
}));
// Mock URL.createObjectURL and URL.revokeObjectURL (not implemented in the
// test DOM environment); reinstalled before each test in case a test replaces them
const mockCreateObjectURL = vi.fn().mockReturnValue("blob:mock-audio-url");
const mockRevokeObjectURL = vi.fn();
beforeEach(() => {
  global.URL.createObjectURL = mockCreateObjectURL;
  global.URL.revokeObjectURL = mockRevokeObjectURL;
});
/**
 * Minimal stand-in for HTMLAudioElement.
 * Mirrors only the surface the hook under test touches: src/time/duration
 * state, play/pause, and the four events registered via addEventListener
 * (stored on the matching on* property so tests can fire them manually).
 */
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 120;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;
  play(): Promise<void> {
    // Resolve immediately; the real element's async autoplay checks are irrelevant here.
    this.paused = false;
    return Promise.resolve();
  }
  pause(): void {
    this.paused = true;
  }
  addEventListener(event: string, handler: () => void): void {
    switch (event) {
      case "ended":
        this.onended = handler;
        break;
      case "timeupdate":
        this.ontimeupdate = handler;
        break;
      case "loadedmetadata":
        this.onloadedmetadata = handler;
        break;
      case "error":
        this.onerror = handler;
        break;
    }
  }
  removeEventListener(): void {
    // Intentionally empty — these tests never assert listener removal.
  }
}
// Replace the global Audio constructor with the mock for every test in this file
vi.stubGlobal("Audio", MockAudio);
// Typed handle to the mocked API function for configuring per-test resolutions
const mockSynthesizeSpeech = speechApi.synthesizeSpeech as ReturnType<typeof vi.fn>;
describe("useTextToSpeech", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    // clearAllMocks wipes mockReturnValue, so re-arm the blob URL stub
    mockCreateObjectURL.mockReturnValue("blob:mock-audio-url");
  });
  afterEach(() => {
    vi.restoreAllMocks();
  });
  // The hook's full public surface before any synthesis happens
  describe("initial state", () => {
    it("should return correct initial interface", () => {
      const { result } = renderHook(() => useTextToSpeech());
      expect(result.current.synthesize).toBeTypeOf("function");
      expect(result.current.play).toBeTypeOf("function");
      expect(result.current.pause).toBeTypeOf("function");
      expect(result.current.stop).toBeTypeOf("function");
      expect(result.current.audioUrl).toBeNull();
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
      expect(result.current.isPlaying).toBe(false);
      expect(result.current.duration).toBe(0);
      expect(result.current.currentTime).toBe(0);
    });
  });
  // API invocation, option pass-through, loading flag, errors, and caching
  describe("synthesize", () => {
    it("should call API and return audio blob URL", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello world");
      });
      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello world",
      });
      expect(result.current.audioUrl).toBe("blob:mock-audio-url");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
    });
    it("should pass voice and tier options to API", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello", {
          voice: "alloy",
          tier: "premium",
          speed: 1.5,
        });
      });
      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello",
        voice: "alloy",
        tier: "premium",
        speed: 1.5,
      });
    });
    it("should set loading state while synthesizing", async () => {
      // Hold the API promise open manually so the in-flight state is observable
      let resolvePromise: ((value: Blob) => void) | undefined;
      const pendingPromise = new Promise<Blob>((resolve) => {
        resolvePromise = resolve;
      });
      mockSynthesizeSpeech.mockReturnValueOnce(pendingPromise);
      const { result } = renderHook(() => useTextToSpeech());
      act(() => {
        void result.current.synthesize("Hello");
      });
      expect(result.current.isLoading).toBe(true);
      await act(async () => {
        resolvePromise?.(new Blob(["audio"], { type: "audio/mpeg" }));
        await pendingPromise;
      });
      expect(result.current.isLoading).toBe(false);
    });
    it("should handle API errors gracefully", async () => {
      mockSynthesizeSpeech.mockRejectedValueOnce(new Error("Synthesis failed"));
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      // The hook surfaces the message as state rather than rethrowing
      expect(result.current.error).toBe("Synthesis failed");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.audioUrl).toBeNull();
    });
    it("should cache audio for repeated synthesis of same text", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValue(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      // First call
      await act(async () => {
        await result.current.synthesize("Hello world");
      });
      // Second call with same text
      await act(async () => {
        await result.current.synthesize("Hello world");
      });
      // API should only be called once due to caching
      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(1);
    });
    it("should not cache when options differ", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValue(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello", { voice: "alloy" });
      });
      await act(async () => {
        await result.current.synthesize("Hello", { voice: "nova" });
      });
      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(2);
    });
  });
  // play/pause/stop state transitions (backed by the MockAudio stub above)
  describe("playback controls", () => {
    it("should play audio after synthesis", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      await act(async () => {
        await result.current.play();
      });
      expect(result.current.isPlaying).toBe(true);
    });
    it("should pause audio playback", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      await act(async () => {
        await result.current.play();
      });
      act(() => {
        result.current.pause();
      });
      expect(result.current.isPlaying).toBe(false);
    });
    it("should stop and reset playback", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      await act(async () => {
        await result.current.play();
      });
      act(() => {
        result.current.stop();
      });
      expect(result.current.isPlaying).toBe(false);
      expect(result.current.currentTime).toBe(0);
    });
  });
  // Blob URLs must be released when the consuming component unmounts
  describe("cleanup", () => {
    it("should revoke object URLs on unmount", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);
      const { result, unmount } = renderHook(() => useTextToSpeech());
      await act(async () => {
        await result.current.synthesize("Hello");
      });
      unmount();
      expect(mockRevokeObjectURL).toHaveBeenCalled();
    });
  });
});

View File

@@ -0,0 +1,239 @@
/**
* useTextToSpeech hook
* Manages TTS API integration with synthesis, caching, and playback state
*/
import { useState, useCallback, useRef, useEffect } from "react";
import { synthesizeSpeech } from "@/lib/api/speech";
/** Optional parameters forwarded verbatim to the TTS synthesis endpoint. */
export interface SynthesizeOptions {
  /** Voice identifier (e.g. "alloy", "nova"); server default when omitted */
  voice?: string;
  /** Playback speed multiplier requested from the synthesizer */
  speed?: number;
  /** Audio format — presumably a container/codec name; confirm against the speech API */
  format?: string;
  /** Quality/cost tier (e.g. "premium") */
  tier?: string;
}
/** Public API returned by the useTextToSpeech hook. */
export interface UseTextToSpeechReturn {
  /** Synthesize text to speech audio */
  synthesize: (text: string, options?: SynthesizeOptions) => Promise<void>;
  /** The URL of the synthesized audio blob (null until a synthesis succeeds) */
  audioUrl: string | null;
  /** Whether synthesis is in progress */
  isLoading: boolean;
  /** Error message if synthesis failed */
  error: string | null;
  /** Start or resume audio playback */
  play: () => Promise<void>;
  /** Pause audio playback */
  pause: () => void;
  /** Stop audio and reset to beginning */
  stop: () => void;
  /** Whether audio is currently playing */
  isPlaying: boolean;
  /** Total duration of the audio in seconds */
  duration: number;
  /** Current playback position in seconds */
  currentTime: number;
}
/**
 * Cache key generator for a text + options combination.
 *
 * Serializes the fields in a fixed order so that option objects with the same
 * values but different property insertion order (e.g. `{ voice, speed }` vs
 * `{ speed, voice }`) map to the same key. A plain
 * `JSON.stringify({ text, ...options })` follows insertion order and would
 * yield different keys — causing spurious cache misses and duplicate API calls.
 * Omitted options serialize as null, so `{}`, `undefined`, and
 * `{ voice: undefined }` all share one key (matching the previous behavior,
 * where JSON.stringify dropped undefined-valued properties).
 */
function getCacheKey(text: string, options?: SynthesizeOptions): string {
  return JSON.stringify([
    text,
    options?.voice ?? null,
    options?.speed ?? null,
    options?.format ?? null,
    options?.tier ?? null,
  ]);
}
/**
 * Hook for text-to-speech API integration with caching and playback controls.
 *
 * synthesize() fetches audio for a text (memoized per text+options in a
 * Map of blob URLs), exposes the resulting blob URL, and wires an
 * HTMLAudioElement for play/pause/stop with duration/currentTime state.
 * All blob URLs created during the session are revoked on unmount.
 */
export function useTextToSpeech(): UseTextToSpeechReturn {
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [isPlaying, setIsPlaying] = useState(false);
  const [duration, setDuration] = useState(0);
  const [currentTime, setCurrentTime] = useState(0);
  // Audio element ref for playback control
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Cache: maps cache key -> blob URL
  const cacheRef = useRef<Map<string, string>>(new Map());
  // Track all blob URLs for cleanup (superset of cache values)
  const blobUrlsRef = useRef<Set<string>>(new Set());
  /**
   * Clean up audio element event listeners and state.
   *
   * NOTE(review): this relies on hoisting of the handler function
   * declarations below, and — because the dependency list is empty — it
   * captures the first render's handler identities. setupAudio and the
   * unmount effect capture those same first-render identities, so the
   * add/remove pairs currently match, but this is fragile; consider moving
   * the handlers into stable useCallbacks before refactoring.
   */
  const cleanupAudio = useCallback(() => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.removeEventListener("ended", handleEnded);
      audio.removeEventListener("timeupdate", handleTimeUpdate);
      audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
      audioRef.current = null;
    }
    setIsPlaying(false);
  }, []);
  /**
   * Handle audio ended event: reset playing state and position.
   */
  function handleEnded(): void {
    setIsPlaying(false);
    setCurrentTime(0);
  }
  /**
   * Handle audio time update event: mirror the element's position into state.
   */
  function handleTimeUpdate(): void {
    const audio = audioRef.current;
    if (audio) {
      setCurrentTime(audio.currentTime);
    }
  }
  /**
   * Handle audio metadata loaded event.
   * Guards against non-finite durations (e.g. streams report Infinity).
   */
  function handleLoadedMetadata(): void {
    const audio = audioRef.current;
    if (audio && isFinite(audio.duration)) {
      setDuration(audio.duration);
    }
  }
  /**
   * Set up a new Audio element for a given URL, tearing down any previous one.
   */
  const setupAudio = useCallback(
    (url: string) => {
      cleanupAudio();
      const audio = new Audio(url);
      audio.addEventListener("ended", handleEnded);
      audio.addEventListener("timeupdate", handleTimeUpdate);
      audio.addEventListener("loadedmetadata", handleLoadedMetadata);
      audioRef.current = audio;
    },
    [cleanupAudio]
  );
  /**
   * Synthesize text to speech.
   * Cache hits skip the API call (and the loading state) entirely;
   * failures are captured into `error` state rather than rethrown.
   */
  const synthesize = useCallback(
    async (text: string, options?: SynthesizeOptions): Promise<void> => {
      setError(null);
      // Check cache first
      const cacheKey = getCacheKey(text, options);
      const cachedUrl = cacheRef.current.get(cacheKey);
      if (cachedUrl) {
        setAudioUrl(cachedUrl);
        setupAudio(cachedUrl);
        return;
      }
      setIsLoading(true);
      try {
        // Spread options conditionally so undefined values are omitted from
        // the request payload rather than sent as explicit undefined keys.
        const blob = await synthesizeSpeech({
          text,
          ...(options?.voice !== undefined && { voice: options.voice }),
          ...(options?.speed !== undefined && { speed: options.speed }),
          ...(options?.format !== undefined && { format: options.format }),
          ...(options?.tier !== undefined && { tier: options.tier }),
        });
        const url = URL.createObjectURL(blob);
        // Store in cache and track for cleanup
        cacheRef.current.set(cacheKey, url);
        blobUrlsRef.current.add(url);
        setAudioUrl(url);
        setupAudio(url);
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : "Speech synthesis failed";
        setError(errorMsg);
        setAudioUrl(null);
      } finally {
        setIsLoading(false);
      }
    },
    [setupAudio]
  );
  /**
   * Start or resume audio playback. No-op when nothing has been synthesized.
   */
  const play = useCallback(async (): Promise<void> => {
    const audio = audioRef.current;
    if (audio) {
      await audio.play();
      setIsPlaying(true);
    }
  }, []);
  /**
   * Pause audio playback, keeping the current position.
   */
  const pause = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      setIsPlaying(false);
    }
  }, []);
  /**
   * Stop audio and reset to beginning.
   */
  const stop = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.currentTime = 0;
      setIsPlaying(false);
      setCurrentTime(0);
    }
  }, []);
  // Cleanup on unmount: revoke all blob URLs and clean up audio
  useEffect((): (() => void) => {
    return (): void => {
      // Clean up audio element (inlined rather than calling cleanupAudio to
      // avoid setState calls during unmount)
      const audio = audioRef.current;
      if (audio) {
        audio.pause();
        audio.removeEventListener("ended", handleEnded);
        audio.removeEventListener("timeupdate", handleTimeUpdate);
        audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
        audioRef.current = null;
      }
      // Revoke all blob URLs
      for (const url of blobUrlsRef.current) {
        URL.revokeObjectURL(url);
      }
      blobUrlsRef.current.clear();
      cacheRef.current.clear();
    };
  }, []);
  return {
    synthesize,
    audioUrl,
    isLoading,
    error,
    play,
    pause,
    stop,
    isPlaying,
    duration,
    currentTime,
  };
}

View File

@@ -0,0 +1,362 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { renderHook, act, waitFor } from "@testing-library/react";
import { useVoiceInput } from "./useVoiceInput";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";
// Mock socket.io-client
vi.mock("socket.io-client");
// Mock MediaRecorder: one shared object standing in for the recorder instance.
// addEventListener stores handlers on the matching on* property so tests can
// invoke them manually.
const mockMediaRecorder = {
  start: vi.fn(),
  stop: vi.fn(),
  pause: vi.fn(),
  resume: vi.fn(),
  state: "inactive" as RecordingState,
  ondataavailable: null as ((event: BlobEvent) => void) | null,
  onstop: null as (() => void) | null,
  onerror: null as ((event: Event) => void) | null,
  addEventListener: vi.fn((event: string, handler: EventListenerOrEventListenerObject) => {
    if (event === "dataavailable") {
      mockMediaRecorder.ondataavailable = handler as (event: BlobEvent) => void;
    } else if (event === "stop") {
      mockMediaRecorder.onstop = handler as () => void;
    } else if (event === "error") {
      mockMediaRecorder.onerror = handler as (event: Event) => void;
    }
  }),
  removeEventListener: vi.fn(),
  stream: {
    getTracks: vi.fn(() => [{ stop: vi.fn() }]),
  },
};
// Mock AnalyserNode with getByteFrequencyData for audio-level monitoring
const mockAnalyserNode = {
  fftSize: 256,
  frequencyBinCount: 128,
  getByteFrequencyData: vi.fn((array: Uint8Array) => {
    // Simulate some audio data (constant mid-level signal)
    for (let i = 0; i < array.length; i++) {
      array[i] = 128;
    }
  }),
  connect: vi.fn(),
  disconnect: vi.fn(),
};
const mockMediaStreamSource = {
  connect: vi.fn(),
  disconnect: vi.fn(),
};
const mockAudioContext = {
  createAnalyser: vi.fn(() => mockAnalyserNode),
  createMediaStreamSource: vi.fn(() => mockMediaStreamSource),
  close: vi.fn(),
  state: "running",
};
// Mock getUserMedia (resolution configured per-test in beforeEach)
const mockGetUserMedia = vi.fn();
// Set up global mocks: navigator.mediaDevices does not exist in the test DOM
Object.defineProperty(global.navigator, "mediaDevices", {
  value: {
    getUserMedia: mockGetUserMedia,
  },
  writable: true,
  configurable: true,
});
// Mock AudioContext
vi.stubGlobal(
  "AudioContext",
  vi.fn(() => mockAudioContext)
);
// Mock MediaRecorder constructor
vi.stubGlobal(
  "MediaRecorder",
  vi.fn(() => mockMediaRecorder)
);
// Add isTypeSupported static method (vi.fn() constructors lack statics)
(
  global.MediaRecorder as unknown as { isTypeSupported: (type: string) => boolean }
).isTypeSupported = vi.fn(() => true);
describe("useVoiceInput", (): void => {
  let mockSocket: Partial<Socket>;
  // Handlers the hook registers via socket.on, keyed by event name, so tests
  // can simulate server-sent transcription events.
  let socketEventHandlers: Record<string, (data: unknown) => void>;
  beforeEach((): void => {
    socketEventHandlers = {};
    mockSocket = {
      on: vi.fn((event: string, handler: (...args: unknown[]) => void) => {
        socketEventHandlers[event] = handler;
        return mockSocket;
      }) as unknown as Socket["on"],
      off: vi.fn(() => mockSocket) as unknown as Socket["off"],
      emit: vi.fn() as unknown as Socket["emit"],
      connect: vi.fn(),
      disconnect: vi.fn(),
      connected: true,
    };
    (io as unknown as ReturnType<typeof vi.fn>).mockReturnValue(mockSocket);
    // Reset MediaRecorder mock state
    mockMediaRecorder.state = "inactive";
    mockMediaRecorder.ondataavailable = null;
    mockMediaRecorder.onstop = null;
    mockMediaRecorder.onerror = null;
    // Default: getUserMedia succeeds
    const mockStream = {
      getTracks: vi.fn(() => [{ stop: vi.fn() }]),
    } as unknown as MediaStream;
    mockGetUserMedia.mockResolvedValue(mockStream);
  });
  afterEach((): void => {
    vi.clearAllMocks();
  });
  it("should return the correct interface", (): void => {
    const { result } = renderHook(() => useVoiceInput());
    expect(result.current).toHaveProperty("isRecording");
    expect(result.current).toHaveProperty("startRecording");
    expect(result.current).toHaveProperty("stopRecording");
    expect(result.current).toHaveProperty("transcript");
    expect(result.current).toHaveProperty("partialTranscript");
    expect(result.current).toHaveProperty("error");
    expect(result.current).toHaveProperty("audioLevel");
  });
  it("should start with default state", (): void => {
    const { result } = renderHook(() => useVoiceInput());
    expect(result.current.isRecording).toBe(false);
    expect(result.current.transcript).toBe("");
    expect(result.current.partialTranscript).toBe("");
    expect(result.current.error).toBeNull();
    expect(result.current.audioLevel).toBe(0);
  });
  it("should start recording when startRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(result.current.isRecording).toBe(true);
    // Pins the exact audio constraints the hook requests from the browser
    expect(mockGetUserMedia).toHaveBeenCalledWith({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 16000,
      },
    });
  });
  it("should stop recording when stopRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(result.current.isRecording).toBe(true);
    act(() => {
      result.current.stopRecording();
    });
    expect(result.current.isRecording).toBe(false);
  });
  it("should set error when microphone access is denied", async (): Promise<void> => {
    mockGetUserMedia.mockRejectedValueOnce(
      new DOMException("Permission denied", "NotAllowedError")
    );
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(result.current.isRecording).toBe(false);
    expect(result.current.error).toBeTruthy();
    // Only asserts the message mentions the microphone, not exact wording
    expect(result.current.error).toContain("microphone");
  });
  it("should connect to speech WebSocket namespace", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(io).toHaveBeenCalledWith(
      expect.any(String),
      expect.objectContaining({
        path: "/socket.io",
      })
    );
  });
  it("should emit start-transcription when recording begins", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    expect(mockSocket.emit).toHaveBeenCalledWith(
      "start-transcription",
      expect.objectContaining({
        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
        format: expect.any(String),
      })
    );
  });
  it("should emit stop-transcription when recording stops", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      result.current.stopRecording();
    });
    expect(mockSocket.emit).toHaveBeenCalledWith("stop-transcription");
  });
  it("should handle partial transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    // Fire the handler the hook registered for the partial-result event
    act(() => {
      socketEventHandlers["transcription-partial"]?.({
        text: "hello world",
      });
    });
    await waitFor(() => {
      expect(result.current.partialTranscript).toBe("hello world");
    });
  });
  it("should handle final transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "hello world final",
      });
    });
    await waitFor(() => {
      expect(result.current.transcript).toBe("hello world final");
    });
  });
  it("should handle transcription error events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      socketEventHandlers["transcription-error"]?.({
        message: "Transcription failed",
      });
    });
    await waitFor(() => {
      expect(result.current.error).toBe("Transcription failed");
    });
  });
  it("should call onTranscript callback when final transcription received", async (): Promise<void> => {
    const onTranscript = vi.fn();
    const { result } = renderHook(() => useVoiceInput({ onTranscript }));
    await act(async () => {
      await result.current.startRecording();
    });
    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "final text",
      });
    });
    await waitFor(() => {
      expect(onTranscript).toHaveBeenCalledWith("final text");
    });
  });
  it("should clean up on unmount", async (): Promise<void> => {
    const { result, unmount } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    unmount();
    expect(mockSocket.disconnect).toHaveBeenCalled();
  });
  it("should not start recording if already recording", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());
    await act(async () => {
      await result.current.startRecording();
    });
    // Reset the call count
    mockGetUserMedia.mockClear();
    await act(async () => {
      await result.current.startRecording();
    });
    // Should not have called getUserMedia again
    expect(mockGetUserMedia).not.toHaveBeenCalled();
  });
  describe("REST fallback", (): void => {
    it("should fall back to REST when WebSocket is unavailable", async (): Promise<void> => {
      // Simulate socket not connecting
      (mockSocket as { connected: boolean }).connected = false;
      const { result } = renderHook(() => useVoiceInput({ useWebSocket: false }));
      // Should still be able to start recording (REST mode)
      await act(async () => {
        await result.current.startRecording();
      });
      expect(result.current.isRecording).toBe(true);
    });
  });
});

View File

@@ -0,0 +1,409 @@
/**
* useVoiceInput hook
*
* Custom hook for microphone capture and real-time transcription.
* Supports WebSocket streaming for real-time partial transcriptions
* with REST upload fallback when WebSocket is unavailable.
*/
import { useState, useCallback, useRef, useEffect } from "react";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";
import { API_BASE_URL } from "@/lib/config";
import { apiPostFormData } from "@/lib/api/client";
/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired when final transcription is received */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true); when false, audio is uploaded via REST after recording stops */
  useWebSocket?: boolean;
  /** Audio sample rate in Hz (default: 16000) */
  sampleRate?: number;
}
/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription (async: awaits mic permission) */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text (updates in real-time while recording) */
  partialTranscript: string;
  /** Error message if something went wrong */
  error: string | null;
  /** Current audio input level (0-1), for visualizers */
  audioLevel: number;
}
/** Payload of the "transcription-partial" WebSocket event. */
interface TranscriptionPartialPayload {
  text: string;
}
/** Payload of the "transcription-final" WebSocket event. */
interface TranscriptionFinalPayload {
  text: string;
}
/** Payload of the "transcription-error" WebSocket event. */
interface TranscriptionErrorPayload {
  message: string;
}
/** Response body shape of POST /api/speech/transcribe (REST fallback). */
interface TranscribeResponse {
  data: {
    text: string;
  };
}
/**
 * Pick the most suitable MIME type for MediaRecorder audio capture.
 *
 * Probes a preference-ordered list of opus/webm/ogg/mp4 types and returns the
 * first one the browser supports. Falls back to "audio/webm" when
 * MediaRecorder is unavailable (e.g. during SSR) or nothing matches.
 */
function getAudioMimeType(): string {
  const fallback = "audio/webm";
  if (typeof MediaRecorder === "undefined") {
    return fallback;
  }
  const preferred = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
  const supported = preferred.find((candidate) => MediaRecorder.isTypeSupported(candidate));
  return supported ?? fallback;
}
/**
* Hook for microphone capture and real-time speech-to-text transcription.
*
* Uses WebSocket streaming by default for real-time partial transcriptions.
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
* is disabled or unavailable.
*/
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
const [isRecording, setIsRecording] = useState(false);
const [transcript, setTranscript] = useState("");
const [partialTranscript, setPartialTranscript] = useState("");
const [error, setError] = useState<string | null>(null);
const [audioLevel, setAudioLevel] = useState(0);
// Refs to hold mutable state without re-renders
const socketRef = useRef<Socket | null>(null);
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
const streamRef = useRef<MediaStream | null>(null);
const audioContextRef = useRef<AudioContext | null>(null);
const analyserRef = useRef<AnalyserNode | null>(null);
const animationFrameRef = useRef<number | null>(null);
const onTranscriptRef = useRef(onTranscript);
const recordedChunksRef = useRef<Blob[]>([]);
const isRecordingRef = useRef(false);
// Keep callback ref up to date
useEffect(() => {
onTranscriptRef.current = onTranscript;
}, [onTranscript]);
/**
* Set up audio analysis for visualizing input level
*/
const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
try {
const audioContext = new AudioContext();
const analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream);
analyser.fftSize = 256;
source.connect(analyser);
audioContextRef.current = audioContext;
analyserRef.current = analyser;
// Start level monitoring
const dataArray = new Uint8Array(analyser.frequencyBinCount);
const updateLevel = (): void => {
if (!isRecordingRef.current) {
return;
}
analyser.getByteFrequencyData(dataArray);
// Calculate average level
let sum = 0;
for (const value of dataArray) {
sum += value;
}
const average = sum / dataArray.length / 255;
setAudioLevel(average);
animationFrameRef.current = requestAnimationFrame(updateLevel);
};
animationFrameRef.current = requestAnimationFrame(updateLevel);
} catch {
// Audio analysis is non-critical; continue without it
console.warn("Audio analysis not available");
}
}, []);
/**
* Clean up audio analysis resources
*/
const cleanupAudioAnalysis = useCallback((): void => {
if (animationFrameRef.current !== null) {
cancelAnimationFrame(animationFrameRef.current);
animationFrameRef.current = null;
}
if (audioContextRef.current) {
void audioContextRef.current.close();
audioContextRef.current = null;
}
analyserRef.current = null;
setAudioLevel(0);
}, []);
/**
* Connect to the speech WebSocket namespace
*/
const connectSocket = useCallback((): Socket => {
const socket = io(API_BASE_URL, {
path: "/socket.io",
transports: ["websocket", "polling"],
});
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
setPartialTranscript(data.text);
});
socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
setTranscript(data.text);
setPartialTranscript("");
onTranscriptRef.current?.(data.text);
});
socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
setError(data.message);
});
socketRef.current = socket;
return socket;
}, []);
/**
* Disconnect the WebSocket
*/
const disconnectSocket = useCallback((): void => {
if (socketRef.current) {
socketRef.current.off("transcription-partial");
socketRef.current.off("transcription-final");
socketRef.current.off("transcription-error");
socketRef.current.disconnect();
socketRef.current = null;
}
}, []);
/**
* Send recorded audio via REST API as fallback
*/
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
try {
const formData = new FormData();
formData.append("audio", audioBlob, "recording.webm");
const response = await apiPostFormData<TranscribeResponse>(
"/api/speech/transcribe",
formData
);
if (response.data.text) {
setTranscript(response.data.text);
setPartialTranscript("");
onTranscriptRef.current?.(response.data.text);
}
} catch (err) {
const message = err instanceof Error ? err.message : "Transcription request failed";
setError(message);
}
}, []);
/**
* Stop all media tracks on the stream
*/
const stopMediaTracks = useCallback((): void => {
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => {
track.stop();
});
streamRef.current = null;
}
}, []);
/**
* Start microphone capture and transcription
*/
const startRecording = useCallback(async (): Promise<void> => {
// Prevent double-start
if (isRecordingRef.current) {
return;
}
setError(null);
setPartialTranscript("");
recordedChunksRef.current = [];
try {
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
sampleRate,
},
});
streamRef.current = stream;
// Set up audio level visualization
setupAudioAnalysis(stream);
// Determine MIME type
const mimeType = getAudioMimeType();
// Create MediaRecorder
const mediaRecorder = new MediaRecorder(stream, { mimeType });
mediaRecorderRef.current = mediaRecorder;
// Connect WebSocket if enabled
let socket: Socket | null = null;
if (useWs) {
socket = connectSocket();
// Emit start-transcription event
socket.emit("start-transcription", {
format: mimeType,
sampleRate,
});
}
// Handle audio data chunks
mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
if (event.data.size > 0) {
if (socket?.connected) {
// Stream chunks via WebSocket
socket.emit("audio-chunk", event.data);
} else {
// Collect chunks for REST upload
recordedChunksRef.current.push(event.data);
}
}
});
// Handle recording stop
mediaRecorder.addEventListener("stop", () => {
// If using REST fallback, send collected audio
if (!useWs || !socket?.connected) {
if (recordedChunksRef.current.length > 0) {
const audioBlob = new Blob(recordedChunksRef.current, {
type: mimeType,
});
void sendAudioViaRest(audioBlob);
}
}
});
// Handle errors
mediaRecorder.addEventListener("error", () => {
setError("Recording encountered an issue. Please try again.");
setIsRecording(false);
isRecordingRef.current = false;
});
// Start recording with timeslice for streaming chunks (250ms intervals)
mediaRecorder.start(250);
setIsRecording(true);
isRecordingRef.current = true;
} catch (err) {
// Handle specific error types
if (err instanceof DOMException) {
if (err.name === "NotAllowedError") {
setError(
"Microphone access was not granted. Please allow microphone access to use voice input."
);
} else if (err.name === "NotFoundError") {
setError("No microphone found. Please connect a microphone and try again.");
} else {
setError("Unable to access the microphone. Please check your device settings.");
}
} else {
setError("Unable to start voice input. Please try again.");
}
// Clean up on failure
stopMediaTracks();
cleanupAudioAnalysis();
}
}, [
useWs,
sampleRate,
setupAudioAnalysis,
connectSocket,
sendAudioViaRest,
stopMediaTracks,
cleanupAudioAnalysis,
]);
/**
 * Stop microphone capture and transcription, releasing all resources.
 */
const stopRecording = useCallback((): void => {
  setIsRecording(false);
  isRecordingRef.current = false;
  // Halt the MediaRecorder if it is still active
  const recorder = mediaRecorderRef.current;
  if (recorder && recorder.state !== "inactive") {
    recorder.stop();
    mediaRecorderRef.current = null;
  }
  // Release the microphone and the level-visualization pipeline
  stopMediaTracks();
  cleanupAudioAnalysis();
  // Tell the server we are done, then tear the socket down after a short
  // grace period so the final chunk can still be processed
  const socket = socketRef.current;
  if (socket) {
    socket.emit("stop-transcription");
    setTimeout(() => {
      disconnectSocket();
    }, 500);
  }
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
// On unmount: stop any in-flight recording and release mic, analyser, socket.
useEffect(() => {
  const teardown = (): void => {
    isRecordingRef.current = false;
    const recorder = mediaRecorderRef.current;
    if (recorder && recorder.state !== "inactive") {
      recorder.stop();
    }
    stopMediaTracks();
    cleanupAudioAnalysis();
    disconnectSocket();
  };
  return teardown;
}, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);
// Hook return value: recording state + controls, the final and streaming
// (partial) transcripts, the latest error message, and the current mic
// audio level — NOTE(review): level range not visible here; presumably
// 0..1 from the analyser — confirm in setupAudioAnalysis.
return {
  isRecording,
  startRecording,
  stopRecording,
  transcript,
  partialTranscript,
  error,
  audioLevel,
};
}

View File

@@ -0,0 +1,58 @@
/**
* Speech API client
* Handles text-to-speech synthesis and voice listing via /api/speech
*/
import { apiGet } from "./client";
import { API_BASE_URL } from "../config";
/** Metadata for a single TTS voice as returned by GET /api/speech/voices. */
export interface VoiceInfo {
  /** Stable voice identifier; pass as SynthesizeOptions.voice. */
  id: string;
  /** Human-readable display name. */
  name: string;
  /** Language of the voice — NOTE(review): exact tag format (e.g. BCP-47) not visible here; confirm against the API. */
  language: string;
  gender?: string;
  /** URL of a short audio sample, when the provider supplies one. */
  preview_url?: string;
}
/** Request body for POST /api/speech/synthesize (sent as JSON). */
export interface SynthesizeOptions {
  /** Text to convert to speech. */
  text: string;
  /** Voice to use — presumably a VoiceInfo.id; server picks a default when omitted — confirm. */
  voice?: string;
  /** Playback speed multiplier — NOTE(review): valid range defined server-side; confirm. */
  speed?: number;
  /** Output audio format — NOTE(review): accepted values defined server-side; confirm. */
  format?: string;
  tier?: string;
}
/** Envelope for the voice-listing endpoint: the voices live under `data`. */
export interface VoicesResponse {
  data: VoiceInfo[];
}
/**
 * Retrieve the list of TTS voices offered by the speech service.
 */
export async function getVoices(): Promise<VoicesResponse> {
  const voices = await apiGet<VoicesResponse>("/api/speech/voices");
  return voices;
}
/**
 * Synthesize text to speech audio.
 *
 * POSTs the options as JSON to /api/speech/synthesize and returns the binary
 * audio payload as a Blob (the endpoint responds with raw audio, not JSON).
 *
 * @throws Error when the server responds with a non-2xx status; the message
 *   now includes the HTTP status code so callers/logs can distinguish 4xx
 *   (bad request / quota) from 5xx (provider failure).
 */
export async function synthesizeSpeech(options: SynthesizeOptions): Promise<Blob> {
  const url = `${API_BASE_URL}/api/speech/synthesize`;
  const response = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    // Send cookies so the session-authenticated API accepts the request
    credentials: "include",
    body: JSON.stringify(options),
  });
  if (!response.ok) {
    // Body text is best-effort: it may be empty, JSON, or an HTML error page
    const errorText = await response.text().catch(() => "Unknown error");
    throw new Error(`Speech synthesis failed (${response.status}): ${errorText}`);
  }
  return response.blob();
}