diff --git a/apps/web/src/components/speech/AudioPlayer.test.tsx b/apps/web/src/components/speech/AudioPlayer.test.tsx new file mode 100644 index 0000000..f185b09 --- /dev/null +++ b/apps/web/src/components/speech/AudioPlayer.test.tsx @@ -0,0 +1,178 @@ +/** + * @file AudioPlayer.test.tsx + * @description Tests for the AudioPlayer component that provides inline TTS audio playback + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { AudioPlayer } from "./AudioPlayer"; + +// Mock HTMLAudioElement +class MockAudio { + src = ""; + currentTime = 0; + duration = 60; + paused = true; + playbackRate = 1; + volume = 1; + onended: (() => void) | null = null; + ontimeupdate: (() => void) | null = null; + onloadedmetadata: (() => void) | null = null; + onerror: ((e: unknown) => void) | null = null; + + play(): Promise { + this.paused = false; + return Promise.resolve(); + } + + pause(): void { + this.paused = true; + } + + addEventListener(event: string, handler: () => void): void { + if (event === "ended") this.onended = handler; + if (event === "timeupdate") this.ontimeupdate = handler; + if (event === "loadedmetadata") this.onloadedmetadata = handler; + if (event === "error") this.onerror = handler; + } + + removeEventListener(): void { + // no-op for tests + } +} + +vi.stubGlobal("Audio", MockAudio); + +describe("AudioPlayer", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + describe("rendering", () => { + it("should render play button", () => { + render(); + + const playButton = screen.getByRole("button", { name: "Play audio" }); + expect(playButton).toBeInTheDocument(); + }); + + it("should render download button", () => { + render(); + + const downloadButton = screen.getByRole("button", { name: /download/i }); + expect(downloadButton).toBeInTheDocument(); + }); + + it("should render time display showing 0:00", () => { + 
render(); + + expect(screen.getByText("0:00")).toBeInTheDocument(); + }); + + it("should render speed control", () => { + render(); + + const speedButton = screen.getByRole("button", { name: "Playback speed" }); + expect(speedButton).toBeInTheDocument(); + }); + + it("should render progress bar", () => { + render(); + + const progressBar = screen.getByRole("progressbar"); + expect(progressBar).toBeInTheDocument(); + }); + + it("should not render when src is null", () => { + const { container } = render(); + + expect(container.firstChild).toBeNull(); + }); + }); + + describe("play/pause", () => { + it("should toggle to pause button when playing", async () => { + const user = userEvent.setup(); + render(); + + const playButton = screen.getByRole("button", { name: "Play audio" }); + await user.click(playButton); + + expect(screen.getByRole("button", { name: "Pause audio" })).toBeInTheDocument(); + }); + }); + + describe("speed control", () => { + it("should cycle through speed options on click", async () => { + const user = userEvent.setup(); + render(); + + const speedButton = screen.getByRole("button", { name: "Playback speed" }); + + // Default should be 1x + expect(speedButton).toHaveTextContent("1x"); + + // Click to go to 1.5x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("1.5x"); + + // Click to go to 2x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("2x"); + + // Click to go to 0.5x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("0.5x"); + + // Click to go back to 1x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("1x"); + }); + }); + + describe("accessibility", () => { + it("should have proper aria labels on controls", () => { + render(); + + expect(screen.getByRole("button", { name: "Play audio" })).toBeInTheDocument(); + expect(screen.getByRole("button", { name: /download/i })).toBeInTheDocument(); + expect(screen.getByRole("button", { name: "Playback 
speed" })).toBeInTheDocument(); + expect(screen.getByRole("progressbar")).toHaveAttribute("aria-label"); + }); + + it("should have region role on the player container", () => { + render(); + + expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument(); + }); + }); + + describe("design", () => { + it("should not use aggressive red colors", () => { + const { container } = render(); + + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = el.className; + if (typeof className === "string") { + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + } + }); + }); + }); + + describe("callbacks", () => { + it("should call onPlayStateChange when play state changes", async () => { + const onPlayStateChange = vi.fn(); + const user = userEvent.setup(); + + render(); + + const playButton = screen.getByRole("button", { name: "Play audio" }); + await user.click(playButton); + + expect(onPlayStateChange).toHaveBeenCalledWith(true); + }); + }); +}); diff --git a/apps/web/src/components/speech/AudioPlayer.tsx b/apps/web/src/components/speech/AudioPlayer.tsx new file mode 100644 index 0000000..d4a9a50 --- /dev/null +++ b/apps/web/src/components/speech/AudioPlayer.tsx @@ -0,0 +1,250 @@ +/** + * AudioPlayer Component + * Inline audio player for TTS content with play/pause, progress, + * speed control, download, and duration display. + * + * Follows PDA-friendly design: no aggressive colors, calm interface. + */ + +import { useState, useRef, useEffect, useCallback } from "react"; +import type { ReactElement } from "react"; + +/** Playback speed options */ +const SPEED_OPTIONS = [1, 1.5, 2, 0.5] as const; + +export interface AudioPlayerProps { + /** URL of the audio to play (blob URL or HTTP URL). If null, nothing renders. 
/**
 * Render a duration in seconds as an `M:SS` label for the time display.
 *
 * Minutes are not capped or padded (3600 seconds becomes "60:00"); the
 * seconds part is always two digits. Fractional seconds are truncated.
 *
 * @param seconds - Elapsed or total time in seconds.
 * @returns The formatted label, or "0:00" when `seconds` is negative,
 *   NaN, or infinite.
 */
function formatTime(seconds: number): string {
  // Anything that cannot be shown as a real position collapses to zero.
  const isDisplayable = isFinite(seconds) && seconds >= 0;
  if (!isDisplayable) {
    return "0:00";
  }

  const wholeMinutes = Math.floor(seconds / 60);
  const leftoverSeconds = Math.floor(seconds % 60);
  const paddedSeconds = leftoverSeconds.toString().padStart(2, "0");

  return wholeMinutes.toString() + ":" + paddedSeconds;
}
audio.removeEventListener("timeupdate", onTimeUpdate); + audio.removeEventListener("ended", onEnded); + audioRef.current = null; + }; + }, [src, autoPlay, onPlayStateChange]); + + /** + * Toggle play/pause + */ + const togglePlayPause = useCallback(async (): Promise => { + const audio = audioRef.current; + if (!audio) return; + + if (isPlaying) { + audio.pause(); + setIsPlaying(false); + onPlayStateChange?.(false); + } else { + await audio.play(); + setIsPlaying(true); + onPlayStateChange?.(true); + } + }, [isPlaying, onPlayStateChange]); + + /** + * Cycle through speed options + */ + const cycleSpeed = useCallback((): void => { + const nextIndex = (speedIndex + 1) % SPEED_OPTIONS.length; + setSpeedIndex(nextIndex); + + const audio = audioRef.current; + if (audio) { + audio.playbackRate = SPEED_OPTIONS[nextIndex] ?? 1; + } + }, [speedIndex]); + + /** + * Handle progress bar click for seeking + */ + const handleProgressClick = useCallback( + (event: React.MouseEvent): void => { + const audio = audioRef.current; + if (!audio || !duration) return; + + const rect = event.currentTarget.getBoundingClientRect(); + const clickX = event.clientX - rect.left; + const fraction = clickX / rect.width; + audio.currentTime = fraction * duration; + setCurrentTime(audio.currentTime); + }, + [duration] + ); + + /** + * Handle download + */ + const handleDownload = useCallback((): void => { + if (!src) return; + + const link = document.createElement("a"); + link.href = src; + link.download = "speech-audio.mp3"; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + }, [src]); + + // Don't render if no source + if (!src) return null; + + const progress = duration > 0 ? (currentTime / duration) * 100 : 0; + const currentSpeed = SPEED_OPTIONS[speedIndex] ?? 1; + + return ( +
+ {/* Play/Pause Button */} + + + {/* Time Display */} + + {formatTime(currentTime)} + {duration > 0 && / {formatTime(duration)}} + + + {/* Progress Bar */} +
+
+
+ + {/* Speed Control */} + + + {/* Download Button */} + +
+ ); +} + +export default AudioPlayer; diff --git a/apps/web/src/components/speech/AudioVisualizer.test.tsx b/apps/web/src/components/speech/AudioVisualizer.test.tsx new file mode 100644 index 0000000..6132f7e --- /dev/null +++ b/apps/web/src/components/speech/AudioVisualizer.test.tsx @@ -0,0 +1,70 @@ +import { describe, it, expect } from "vitest"; +import { render, screen } from "@testing-library/react"; +import { AudioVisualizer } from "./AudioVisualizer"; + +describe("AudioVisualizer", (): void => { + it("should render the visualizer container", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + expect(container).toBeInTheDocument(); + }); + + it("should render visualization bars", (): void => { + render(); + + const bars = screen.getAllByTestId("visualizer-bar"); + expect(bars.length).toBeGreaterThan(0); + }); + + it("should show inactive state when not active", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + expect(container).toBeInTheDocument(); + // Bars should be at minimum height when inactive + const bars = screen.getAllByTestId("visualizer-bar"); + bars.forEach((bar) => { + const style = bar.getAttribute("style"); + expect(style).toContain("height"); + }); + }); + + it("should reflect audio level in bar heights when active", (): void => { + render(); + + const bars = screen.getAllByTestId("visualizer-bar"); + // At least one bar should have non-minimal height + const hasActiveBars = bars.some((bar) => { + const style = bar.getAttribute("style") ?? ""; + const heightMatch = /height:\s*(\d+)/.exec(style); + return heightMatch?.[1] ? 
parseInt(heightMatch[1], 10) > 4 : false; + }); + expect(hasActiveBars).toBe(true); + }); + + it("should use calm colors (no aggressive reds)", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = (el as HTMLElement).className; + expect(className).not.toMatch(/bg-red-|text-red-/); + }); + }); + + it("should accept custom className", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + expect(container.className).toContain("custom-class"); + }); + + it("should render with configurable bar count", (): void => { + render(); + + const bars = screen.getAllByTestId("visualizer-bar"); + expect(bars).toHaveLength(8); + }); +}); diff --git a/apps/web/src/components/speech/AudioVisualizer.tsx b/apps/web/src/components/speech/AudioVisualizer.tsx new file mode 100644 index 0000000..e215fd0 --- /dev/null +++ b/apps/web/src/components/speech/AudioVisualizer.tsx @@ -0,0 +1,87 @@ +/** + * AudioVisualizer component + * + * Displays a simple audio level visualization using bars. + * Uses the Web Audio API's AnalyserNode data (passed as audioLevel) + * to show microphone input levels during recording. + * + * Design: Calm, non-aggressive colors following PDA-friendly guidelines. + */ + +import { useMemo } from "react"; + +export interface AudioVisualizerProps { + /** Current audio level (0-1) */ + audioLevel: number; + /** Whether the visualizer is actively listening */ + isActive: boolean; + /** Number of bars to display (default: 5) */ + barCount?: number; + /** Additional CSS classes */ + className?: string; +} + +/** + * Generate bar heights based on audio level. + * Creates a natural-looking wave pattern where center bars are taller. 
/**
 * Generate bar heights (in pixels) for a given audio level.
 *
 * Produces a symmetric pattern in which the centre bar(s) are tallest and
 * the outermost bars receive half of the level-dependent height.
 *
 * @param level - Audio level, expected in the range 0-1. Values outside the
 *   range are clamped; NaN is treated as 0 so a bad reading never produces
 *   an invalid CSS height.
 * @param count - Number of bars to generate. A count of 0 or less yields an
 *   empty array.
 * @returns One rounded pixel height per bar, each between 4 and 24.
 */
function generateBarHeights(level: number, count: number): number[] {
  // Min height 4px, max height 24px when active
  const minHeight = 4;
  const maxHeight = 24;

  // Keep the level inside its documented range; NaN would otherwise
  // propagate into every bar height.
  const safeLevel = Number.isNaN(level) ? 0 : Math.min(1, Math.max(0, level));

  const center = (count - 1) / 2;
  const heights: number[] = [];

  for (let i = 0; i < count; i++) {
    // Distance from center (0-1). With a single bar `center` is 0, and the
    // plain division would be 0/0 = NaN, so that bar is treated as centred.
    const distFromCenter = center > 0 ? Math.abs(i - center) / center : 0;
    // Center bars are taller, edge bars shorter
    const multiplier = 1 - distFromCenter * 0.5;
    const height = minHeight + safeLevel * (maxHeight - minHeight) * multiplier;
    heights.push(Math.round(height));
  }

  return heights;
}
+ {barHeights.map((height, index) => ( +
+ ))} +
+ ); +} diff --git a/apps/web/src/components/speech/TextToSpeechButton.test.tsx b/apps/web/src/components/speech/TextToSpeechButton.test.tsx new file mode 100644 index 0000000..cd265c3 --- /dev/null +++ b/apps/web/src/components/speech/TextToSpeechButton.test.tsx @@ -0,0 +1,218 @@ +/** + * @file TextToSpeechButton.test.tsx + * @description Tests for the TextToSpeechButton "Read aloud" component + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { TextToSpeechButton } from "./TextToSpeechButton"; + +// Mock the useTextToSpeech hook +const mockSynthesize = vi.fn(); +const mockPlay = vi.fn(); +const mockPause = vi.fn(); +const mockStop = vi.fn(); + +vi.mock("@/hooks/useTextToSpeech", () => ({ + useTextToSpeech: vi.fn(() => ({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: false, + error: null, + isPlaying: false, + duration: 0, + currentTime: 0, + })), +})); + +// Import after mocking +import { useTextToSpeech } from "@/hooks/useTextToSpeech"; + +const mockUseTextToSpeech = useTextToSpeech as ReturnType; + +// Mock HTMLAudioElement for AudioPlayer used inside TextToSpeechButton +class MockAudio { + src = ""; + currentTime = 0; + duration = 60; + paused = true; + playbackRate = 1; + volume = 1; + onended: (() => void) | null = null; + ontimeupdate: (() => void) | null = null; + onloadedmetadata: (() => void) | null = null; + onerror: ((e: unknown) => void) | null = null; + + play(): Promise { + this.paused = false; + return Promise.resolve(); + } + + pause(): void { + this.paused = true; + } + + addEventListener(): void { + // no-op + } + + removeEventListener(): void { + // no-op + } +} + +vi.stubGlobal("Audio", MockAudio); + +describe("TextToSpeechButton", () => { + beforeEach(() => { + vi.clearAllMocks(); + mockUseTextToSpeech.mockReturnValue({ + synthesize: 
mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: false, + error: null, + isPlaying: false, + duration: 0, + currentTime: 0, + }); + }); + + describe("rendering", () => { + it("should render a read aloud button", () => { + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + expect(button).toBeInTheDocument(); + }); + + it("should not render AudioPlayer initially when no audio is synthesized", () => { + render(); + + expect(screen.queryByRole("region", { name: /audio player/i })).not.toBeInTheDocument(); + }); + }); + + describe("click behavior", () => { + it("should call synthesize with text on click", async () => { + const user = userEvent.setup(); + mockSynthesize.mockResolvedValueOnce(undefined); + + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + await user.click(button); + + expect(mockSynthesize).toHaveBeenCalledWith("Hello world", undefined); + }); + + it("should pass voice and tier options when provided", async () => { + const user = userEvent.setup(); + mockSynthesize.mockResolvedValueOnce(undefined); + + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + await user.click(button); + + expect(mockSynthesize).toHaveBeenCalledWith("Hello", { + voice: "alloy", + tier: "premium", + }); + }); + }); + + describe("loading state", () => { + it("should show loading indicator while synthesizing", () => { + mockUseTextToSpeech.mockReturnValue({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: true, + error: null, + isPlaying: false, + duration: 0, + currentTime: 0, + }); + + render(); + + const button = screen.getByRole("button", { name: /synthesizing/i }); + expect(button).toBeInTheDocument(); + expect(button).toBeDisabled(); + }); + }); + + describe("audio player integration", () => { + it("should show AudioPlayer when audio is available", () => { 
+ mockUseTextToSpeech.mockReturnValue({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: "blob:mock-url", + isLoading: false, + error: null, + isPlaying: false, + duration: 30, + currentTime: 0, + }); + + render(); + + expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument(); + }); + }); + + describe("error state", () => { + it("should display error message when synthesis fails", () => { + mockUseTextToSpeech.mockReturnValue({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: false, + error: "Synthesis failed", + isPlaying: false, + duration: 0, + currentTime: 0, + }); + + render(); + + expect(screen.getByText(/synthesis failed/i)).toBeInTheDocument(); + }); + }); + + describe("accessibility", () => { + it("should have proper aria label on button", () => { + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + expect(button).toBeInTheDocument(); + }); + }); + + describe("design", () => { + it("should not use aggressive colors", () => { + const { container } = render(); + + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = el.className; + if (typeof className === "string") { + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + } + }); + }); + }); +}); diff --git a/apps/web/src/components/speech/TextToSpeechButton.tsx b/apps/web/src/components/speech/TextToSpeechButton.tsx new file mode 100644 index 0000000..a8f97f7 --- /dev/null +++ b/apps/web/src/components/speech/TextToSpeechButton.tsx @@ -0,0 +1,126 @@ +/** + * TextToSpeechButton Component + * "Read aloud" button that synthesizes text and plays it via AudioPlayer. + * + * Accepts text as a prop, with optional voice and tier selection. + * Shows loading state during synthesis and integrates AudioPlayer for playback. 
+ * + * Follows PDA-friendly design: no aggressive colors, calm interface. + */ + +import { useCallback } from "react"; +import type { ReactElement } from "react"; +import { useTextToSpeech } from "@/hooks/useTextToSpeech"; +import type { SynthesizeOptions } from "@/hooks/useTextToSpeech"; +import { AudioPlayer } from "./AudioPlayer"; + +export interface TextToSpeechButtonProps { + /** The text to synthesize to speech */ + text: string; + /** Optional voice ID to use */ + voice?: string; + /** Optional tier (e.g. "standard", "premium") */ + tier?: string; + /** Optional className for the container */ + className?: string; +} + +/** + * TextToSpeechButton provides a "Read aloud" button that synthesizes + * the given text and displays an AudioPlayer for playback control. + */ +export function TextToSpeechButton({ + text, + voice, + tier, + className = "", +}: TextToSpeechButtonProps): ReactElement { + const { synthesize, audioUrl, isLoading, error } = useTextToSpeech(); + + /** + * Handle read aloud button click + */ + const handleClick = useCallback(async (): Promise => { + let options: SynthesizeOptions | undefined; + + if (voice !== undefined || tier !== undefined) { + options = {}; + if (voice !== undefined) options.voice = voice; + if (tier !== undefined) options.tier = tier; + } + + await synthesize(text, options); + }, [text, voice, tier, synthesize]); + + return ( +
+ {/* Read Aloud Button */} + + + {/* Error Display */} + {error && ( +

+ {error} +

+ )} + + {/* Audio Player (shown after synthesis) */} + {audioUrl && } +
+ ); +} + +export default TextToSpeechButton; diff --git a/apps/web/src/components/speech/VoiceInput.test.tsx b/apps/web/src/components/speech/VoiceInput.test.tsx new file mode 100644 index 0000000..74c1f44 --- /dev/null +++ b/apps/web/src/components/speech/VoiceInput.test.tsx @@ -0,0 +1,228 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { VoiceInput } from "./VoiceInput"; + +// Mock the useVoiceInput hook +const mockStartRecording = vi.fn(); +const mockStopRecording = vi.fn(); + +vi.mock("@/hooks/useVoiceInput", () => ({ + useVoiceInput: vi.fn(() => ({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0, + })), +})); + +// We need to import after mocking +import { useVoiceInput } from "@/hooks/useVoiceInput"; + +describe("VoiceInput", (): void => { + beforeEach((): void => { + vi.clearAllMocks(); + // Reset mock implementation to default + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0, + }); + }); + + it("should render a microphone button", (): void => { + render(); + + const button = screen.getByRole("button", { + name: /start voice input/i, + }); + expect(button).toBeInTheDocument(); + }); + + it("should have accessible aria label", (): void => { + render(); + + const button = screen.getByRole("button", { + name: /start voice input/i, + }); + expect(button).toHaveAttribute("aria-label", "Start voice input"); + }); + + it("should call startRecording when mic button is clicked", async (): Promise => { + const user = userEvent.setup(); + render(); + + const button = screen.getByRole("button", { + name: /start voice input/i, + }); + await 
user.click(button); + + expect(mockStartRecording).toHaveBeenCalledTimes(1); + }); + + it("should show recording state when isRecording is true", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0.5, + }); + + render(); + + const button = screen.getByRole("button", { + name: /stop voice input/i, + }); + expect(button).toBeInTheDocument(); + }); + + it("should call stopRecording when mic button is clicked while recording", async (): Promise => { + const user = userEvent.setup(); + + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0.5, + }); + + render(); + + const button = screen.getByRole("button", { + name: /stop voice input/i, + }); + await user.click(button); + + expect(mockStopRecording).toHaveBeenCalledTimes(1); + }); + + it("should display partial transcription text", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "hello worl", + error: null, + audioLevel: 0.3, + }); + + render(); + + expect(screen.getByText("hello worl")).toBeInTheDocument(); + }); + + it("should display final transcript text", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "hello world", + partialTranscript: "", + error: null, + audioLevel: 0, + }); + + render(); + + expect(screen.getByText("hello world")).toBeInTheDocument(); + }); + + it("should display error message", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + 
stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: "Microphone access not available", + audioLevel: 0, + }); + + render(); + + expect(screen.getByText("Microphone access not available")).toBeInTheDocument(); + }); + + it("should call onTranscript callback prop", (): void => { + const onTranscript = vi.fn(); + + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "final text", + partialTranscript: "", + error: null, + audioLevel: 0, + }); + + render(); + + // The onTranscript prop is passed to the hook - we verify the prop is accepted + expect(useVoiceInput).toHaveBeenCalledWith( + expect.objectContaining({ + onTranscript, + }) + ); + }); + + it("should use calm, non-aggressive design for recording indicator", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0.5, + }); + + render(); + + // Check there are no aggressive red colors in the recording state + const button = screen.getByRole("button", { name: /stop voice input/i }); + const className = button.className; + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + }); + + it("should use calm design for error display", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: "Something went wrong", + audioLevel: 0, + }); + + render(); + + const errorEl = screen.getByText("Something went wrong"); + const className = errorEl.className; + expect(className).not.toMatch(/text-red-600|bg-red-/); + }); + + it("should be disabled when disabled prop is true", (): void => { + render(); + + const button = screen.getByRole("button", { + name: /start voice 
input/i, + }); + expect(button).toBeDisabled(); + }); +}); diff --git a/apps/web/src/components/speech/VoiceInput.tsx b/apps/web/src/components/speech/VoiceInput.tsx new file mode 100644 index 0000000..fa74e53 --- /dev/null +++ b/apps/web/src/components/speech/VoiceInput.tsx @@ -0,0 +1,146 @@ +/** + * VoiceInput component + * + * Provides a microphone button with visual feedback for voice input. + * Click to start/stop recording with real-time transcription display. + * + * Design principles: + * - PDA-friendly: calm, non-aggressive colors + * - Gentle pulsing animation for recording state (blue/green) + * - Mobile-friendly touch interaction + * - Accessible with proper aria labels + */ + +import { useVoiceInput } from "@/hooks/useVoiceInput"; +import type { UseVoiceInputOptions } from "@/hooks/useVoiceInput"; +import { AudioVisualizer } from "./AudioVisualizer"; +import { Mic, MicOff } from "lucide-react"; + +export interface VoiceInputProps { + /** Callback fired when final transcription is received */ + onTranscript?: (text: string) => void; + /** Whether to use WebSocket streaming (default: true) */ + useWebSocket?: boolean; + /** Whether the input is disabled */ + disabled?: boolean; + /** Additional CSS classes for the container */ + className?: string; +} + +/** + * Voice input component with microphone capture and real-time transcription. + * Shows a mic button that toggles recording, with visual feedback + * and transcription text display. 
+ */ +export function VoiceInput({ + onTranscript, + useWebSocket: useWs, + disabled = false, + className = "", +}: VoiceInputProps): React.JSX.Element { + const hookOptions: UseVoiceInputOptions = {}; + if (onTranscript !== undefined) { + hookOptions.onTranscript = onTranscript; + } + if (useWs !== undefined) { + hookOptions.useWebSocket = useWs; + } + + const { + isRecording, + startRecording, + stopRecording, + transcript, + partialTranscript, + error, + audioLevel, + } = useVoiceInput(hookOptions); + + const handleClick = (): void => { + if (isRecording) { + stopRecording(); + } else { + void startRecording(); + } + }; + + const displayText = isRecording ? partialTranscript : transcript; + + return ( +
+ {/* Mic button with recording indicator */} +
+ {/* Pulsing ring animation when recording */} + {isRecording && ( + + + {/* Recording status indicator */} + {isRecording && ( +
+
+ )} + + {/* Transcription text display */} + {displayText && ( +

+ {displayText} +

+ )} + + {/* Error display - calm, non-aggressive */} + {error && ( +

+ {error} +

+ )} +
/**
 * Minimal stand-in for the browser `Audio` element used by the
 * useTextToSpeech tests.
 *
 * Only the members the hook touches are modelled. Listeners are stored in the
 * matching `on*` slot so a test can fire an event by calling that slot, and
 * `removeEventListener` clears the slot again so a test can verify that the
 * hook detaches its handlers during cleanup.
 */
class MockAudio {
  src: string;
  currentTime = 0;
  duration = 120;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;

  /**
   * @param src - URL handed to `new Audio(url)`; recorded so tests can assert on it.
   */
  constructor(src = "") {
    this.src = src;
  }

  /** Marks the element as playing and resolves immediately. */
  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }

  /** Marks the element as paused. */
  pause(): void {
    this.paused = true;
  }

  /** Stores `handler` in the slot for `event`; unknown event names are ignored. */
  addEventListener(event: string, handler: () => void): void {
    switch (event) {
      case "ended":
        this.onended = handler;
        break;
      case "timeupdate":
        this.ontimeupdate = handler;
        break;
      case "loadedmetadata":
        this.onloadedmetadata = handler;
        break;
      case "error":
        this.onerror = handler;
        break;
      default:
        break;
    }
  }

  /**
   * Clears the slot for `event` when it still holds `handler`.
   * Calling it without arguments stays a no-op, as before.
   */
  removeEventListener(event?: string, handler?: () => void): void {
    if (event === "ended" && this.onended === handler) this.onended = null;
    if (event === "timeupdate" && this.ontimeupdate === handler) this.ontimeupdate = null;
    if (event === "loadedmetadata" && this.onloadedmetadata === handler) {
      this.onloadedmetadata = null;
    }
    if (event === "error" && this.onerror === handler) this.onerror = null;
  }
}
// Behavioural tests for the useTextToSpeech hook. The speech API module, the
// global Audio constructor and URL.createObjectURL/revokeObjectURL are mocked
// by the setup code that precedes this suite.
describe("useTextToSpeech", () => {
  /** Fresh audio payload for a mocked synthesis response. */
  const makeAudioBlob = (): Blob => new Blob(["audio-data"], { type: "audio/mpeg" });

  beforeEach(() => {
    vi.clearAllMocks();
    mockCreateObjectURL.mockReturnValue("blob:mock-audio-url");
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  describe("initial state", () => {
    it("should return correct initial interface", () => {
      const { result } = renderHook(() => useTextToSpeech());

      expect(result.current.synthesize).toBeTypeOf("function");
      expect(result.current.play).toBeTypeOf("function");
      expect(result.current.pause).toBeTypeOf("function");
      expect(result.current.stop).toBeTypeOf("function");
      expect(result.current.audioUrl).toBeNull();
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
      expect(result.current.isPlaying).toBe(false);
      expect(result.current.duration).toBe(0);
      expect(result.current.currentTime).toBe(0);
    });
  });

  describe("synthesize", () => {
    it("should call API and return audio blob URL", async () => {
      mockSynthesizeSpeech.mockResolvedValueOnce(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello world");
      });

      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello world",
      });
      expect(result.current.audioUrl).toBe("blob:mock-audio-url");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
    });

    it("should pass voice and tier options to API", async () => {
      mockSynthesizeSpeech.mockResolvedValueOnce(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello", {
          voice: "alloy",
          tier: "premium",
          speed: 1.5,
        });
      });

      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello",
        voice: "alloy",
        tier: "premium",
        speed: 1.5,
      });
    });

    it("should set loading state while synthesizing", async () => {
      // Deferred promise: the test decides when the mocked API call settles.
      let resolveSynthesis: (value: Blob) => void = () => undefined;
      const pendingSynthesis = new Promise<Blob>((resolve) => {
        resolveSynthesis = resolve;
      });
      mockSynthesizeSpeech.mockReturnValueOnce(pendingSynthesis);

      const { result } = renderHook(() => useTextToSpeech());

      act(() => {
        void result.current.synthesize("Hello");
      });

      expect(result.current.isLoading).toBe(true);

      await act(async () => {
        resolveSynthesis(new Blob(["audio"], { type: "audio/mpeg" }));
        await pendingSynthesis;
      });

      expect(result.current.isLoading).toBe(false);
    });

    it("should handle API errors gracefully", async () => {
      mockSynthesizeSpeech.mockRejectedValueOnce(new Error("Synthesis failed"));

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      expect(result.current.error).toBe("Synthesis failed");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.audioUrl).toBeNull();
    });

    it("should cache audio for repeated synthesis of same text", async () => {
      mockSynthesizeSpeech.mockResolvedValue(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      // First call
      await act(async () => {
        await result.current.synthesize("Hello world");
      });

      // Second call with same text
      await act(async () => {
        await result.current.synthesize("Hello world");
      });

      // API should only be called once due to caching
      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(1);
    });

    it("should not cache when options differ", async () => {
      mockSynthesizeSpeech.mockResolvedValue(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello", { voice: "alloy" });
      });

      await act(async () => {
        await result.current.synthesize("Hello", { voice: "nova" });
      });

      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(2);
    });
  });

  describe("playback controls", () => {
    it("should play audio after synthesis", async () => {
      mockSynthesizeSpeech.mockResolvedValueOnce(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      await act(async () => {
        await result.current.play();
      });

      expect(result.current.isPlaying).toBe(true);
    });

    it("should pause audio playback", async () => {
      mockSynthesizeSpeech.mockResolvedValueOnce(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      await act(async () => {
        await result.current.play();
      });

      act(() => {
        result.current.pause();
      });

      expect(result.current.isPlaying).toBe(false);
    });

    it("should stop and reset playback", async () => {
      mockSynthesizeSpeech.mockResolvedValueOnce(makeAudioBlob());

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      await act(async () => {
        await result.current.play();
      });

      act(() => {
        result.current.stop();
      });

      expect(result.current.isPlaying).toBe(false);
      expect(result.current.currentTime).toBe(0);
    });
  });

  describe("cleanup", () => {
    it("should revoke object URLs on unmount", async () => {
      mockSynthesizeSpeech.mockResolvedValueOnce(makeAudioBlob());

      const { result, unmount } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      unmount();

      // The URL that was created for the clip is the one that must be released.
      expect(mockRevokeObjectURL).toHaveBeenCalledWith("blob:mock-audio-url");
    });
  });
});
/**
 * Build the cache key for a text + options combination.
 *
 * The key is independent of the order in which option properties were written
 * and ignores properties whose value is `undefined`, so
 * `{ voice: "a", speed: 1 }` and `{ speed: 1, voice: "a" }` share one cache
 * entry instead of triggering a second synthesis request.
 *
 * @param text - The text to be synthesized.
 * @param options - Optional synthesis settings that influence the audio.
 * @returns A deterministic string key.
 */
function getCacheKey(text: string, options?: SynthesizeOptions): string {
  const definedOptions = Object.entries(options ?? {})
    .filter(([, value]) => value !== undefined)
    // Sort by property name so insertion order cannot change the key.
    .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));

  return JSON.stringify([text, definedOptions]);
}
/**
 * Hook for text-to-speech API integration with caching and playback controls.
 *
 * - `synthesize` requests audio for a text/options pair, caches the resulting
 *   blob URL and prepares an audio element for it. When several calls overlap,
 *   only the most recent one is allowed to update the exposed state.
 * - `play` / `pause` / `stop` drive the prepared audio element.
 * - On unmount the audio element is torn down and every blob URL created by
 *   this hook instance is revoked.
 *
 * @returns Synthesis state, the current audio URL and playback controls.
 */
export function useTextToSpeech(): UseTextToSpeechReturn {
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [isPlaying, setIsPlaying] = useState(false);
  const [duration, setDuration] = useState(0);
  const [currentTime, setCurrentTime] = useState(0);

  // Audio element for playback control
  const audioRef = useRef<HTMLAudioElement | null>(null);

  // Cache: maps cache key -> blob URL
  const cacheRef = useRef<Map<string, string>>(new Map());

  // Every blob URL created by this instance, so all can be revoked on unmount
  const blobUrlsRef = useRef<Set<string>>(new Set());

  // Incremented per synthesize() call; a call whose id is no longer the latest is stale
  const requestIdRef = useRef(0);

  // False once the component has unmounted; late responses must not allocate or set state
  const isMountedRef = useRef(true);

  // The handlers are memoized so addEventListener and removeEventListener
  // always receive the same function identity.

  /** Audio finished playing: reset the playback state. */
  const handleEnded = useCallback((): void => {
    setIsPlaying(false);
    setCurrentTime(0);
  }, []);

  /** Mirror the element's playback position into state. */
  const handleTimeUpdate = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      setCurrentTime(audio.currentTime);
    }
  }, []);

  /** Publish the clip duration once metadata is available (ignores NaN/Infinity). */
  const handleLoadedMetadata = useCallback((): void => {
    const audio = audioRef.current;
    if (audio && Number.isFinite(audio.duration)) {
      setDuration(audio.duration);
    }
  }, []);

  /** The element could not load or decode the clip. */
  const handleAudioError = useCallback((): void => {
    setIsPlaying(false);
    setError("Audio playback failed");
  }, []);

  /** Pause the current element, detach its listeners and drop the reference. */
  const detachAudio = useCallback((): void => {
    const audio = audioRef.current;
    if (!audio) {
      return;
    }
    audio.pause();
    audio.removeEventListener("ended", handleEnded);
    audio.removeEventListener("timeupdate", handleTimeUpdate);
    audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
    audio.removeEventListener("error", handleAudioError);
    audioRef.current = null;
  }, [handleEnded, handleTimeUpdate, handleLoadedMetadata, handleAudioError]);

  /** Tear down the current element and reset all playback-related state. */
  const cleanupAudio = useCallback((): void => {
    detachAudio();
    setIsPlaying(false);
    // Position and duration belong to the clip that was just discarded.
    setCurrentTime(0);
    setDuration(0);
  }, [detachAudio]);

  /** Replace the current element with a new one for `url`. */
  const setupAudio = useCallback(
    (url: string): void => {
      cleanupAudio();

      const audio = new Audio(url);
      audio.addEventListener("ended", handleEnded);
      audio.addEventListener("timeupdate", handleTimeUpdate);
      audio.addEventListener("loadedmetadata", handleLoadedMetadata);
      audio.addEventListener("error", handleAudioError);
      audioRef.current = audio;
    },
    [cleanupAudio, handleEnded, handleTimeUpdate, handleLoadedMetadata, handleAudioError]
  );

  /**
   * Synthesize text to speech.
   * Failures are reported through `error`; the returned promise does not reject.
   */
  const synthesize = useCallback(
    async (text: string, options?: SynthesizeOptions): Promise<void> => {
      requestIdRef.current += 1;
      const requestId = requestIdRef.current;
      const isLatest = (): boolean => isMountedRef.current && requestId === requestIdRef.current;

      setError(null);

      // Check cache first
      const cacheKey = getCacheKey(text, options);
      const cachedUrl = cacheRef.current.get(cacheKey);

      if (cachedUrl !== undefined) {
        // An older request may still be in flight; it is stale now, so the
        // loading flag has to be cleared here.
        setIsLoading(false);
        setAudioUrl(cachedUrl);
        setupAudio(cachedUrl);
        return;
      }

      setIsLoading(true);

      try {
        const blob = await synthesizeSpeech({
          text,
          ...(options?.voice !== undefined && { voice: options.voice }),
          ...(options?.speed !== undefined && { speed: options.speed }),
          ...(options?.format !== undefined && { format: options.format }),
          ...(options?.tier !== undefined && { tier: options.tier }),
        });

        // After unmount nobody would revoke a new URL, so do not create one.
        if (!isMountedRef.current) {
          return;
        }

        const url = URL.createObjectURL(blob);

        // Store in cache and track for cleanup (useful even for a stale request)
        cacheRef.current.set(cacheKey, url);
        blobUrlsRef.current.add(url);

        if (!isLatest()) {
          return;
        }

        setAudioUrl(url);
        setupAudio(url);
      } catch (err) {
        if (!isLatest()) {
          return;
        }
        const errorMsg = err instanceof Error ? err.message : "Speech synthesis failed";
        // Drop the previous clip so play() cannot start audio that no longer
        // matches the (now null) audioUrl.
        cleanupAudio();
        setError(errorMsg);
        setAudioUrl(null);
      } finally {
        if (isLatest()) {
          setIsLoading(false);
        }
      }
    },
    [setupAudio, cleanupAudio]
  );

  /**
   * Start or resume audio playback.
   * Rejects when the element's own `play()` promise rejects.
   */
  const play = useCallback(async (): Promise<void> => {
    const audio = audioRef.current;
    if (!audio) {
      return;
    }
    await audio.play();
    // The element may have been replaced while play() was pending.
    if (audioRef.current === audio) {
      setIsPlaying(true);
    }
  }, []);

  /** Pause audio playback. */
  const pause = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      setIsPlaying(false);
    }
  }, []);

  /** Stop audio and reset to beginning. */
  const stop = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.currentTime = 0;
      setIsPlaying(false);
      setCurrentTime(0);
    }
  }, []);

  // Cleanup on unmount: tear down the audio element and revoke all blob URLs
  useEffect((): (() => void) => {
    isMountedRef.current = true;
    const blobUrls = blobUrlsRef.current;
    const cache = cacheRef.current;

    return (): void => {
      isMountedRef.current = false;
      detachAudio();

      for (const url of blobUrls) {
        URL.revokeObjectURL(url);
      }
      blobUrls.clear();
      cache.clear();
    };
  }, [detachAudio]);

  return {
    synthesize,
    audioUrl,
    isLoading,
    error,
    play,
    pause,
    stop,
    isPlaying,
    duration,
    currentTime,
  };
}
// Mock socket.io-client
vi.mock("socket.io-client");

// Mock MediaRecorder. start()/stop() track `state` like a real recorder so the
// hook's `state !== "inactive"` branch is exercised.
const mockMediaRecorder = {
  start: vi.fn((): void => {
    mockMediaRecorder.state = "recording";
  }),
  stop: vi.fn((): void => {
    mockMediaRecorder.state = "inactive";
  }),
  pause: vi.fn(),
  resume: vi.fn(),
  state: "inactive" as RecordingState,
  ondataavailable: null as ((event: BlobEvent) => void) | null,
  onstop: null as (() => void) | null,
  onerror: null as ((event: Event) => void) | null,
  addEventListener: vi.fn((event: string, handler: EventListenerOrEventListenerObject): void => {
    if (event === "dataavailable") {
      mockMediaRecorder.ondataavailable = handler as (event: BlobEvent) => void;
    } else if (event === "stop") {
      mockMediaRecorder.onstop = handler as () => void;
    } else if (event === "error") {
      mockMediaRecorder.onerror = handler as (event: Event) => void;
    }
  }),
  removeEventListener: vi.fn(),
  stream: {
    getTracks: vi.fn(() => [{ stop: vi.fn() }]),
  },
};

// Mock analyser node that reports a constant mid-level signal
const mockAnalyserNode = {
  fftSize: 256,
  frequencyBinCount: 128,
  getByteFrequencyData: vi.fn((array: Uint8Array): void => {
    // Simulate some audio data
    array.fill(128);
  }),
  connect: vi.fn(),
  disconnect: vi.fn(),
};

const mockMediaStreamSource = {
  connect: vi.fn(),
  disconnect: vi.fn(),
};

const mockAudioContext = {
  createAnalyser: vi.fn(() => mockAnalyserNode),
  createMediaStreamSource: vi.fn(() => mockMediaStreamSource),
  close: vi.fn(),
  state: "running",
};

// Mock getUserMedia
const mockGetUserMedia = vi.fn();

// Set up global mocks
Object.defineProperty(global.navigator, "mediaDevices", {
  value: {
    getUserMedia: mockGetUserMedia,
  },
  writable: true,
  configurable: true,
});

// Mock AudioContext
vi.stubGlobal(
  "AudioContext",
  vi.fn(() => mockAudioContext)
);

// Mock MediaRecorder constructor
vi.stubGlobal(
  "MediaRecorder",
  vi.fn(() => mockMediaRecorder)
);

// Add isTypeSupported static method
(
  global.MediaRecorder as unknown as { isTypeSupported: (type: string) => boolean }
).isTypeSupported = vi.fn(() => true);

describe("useVoiceInput", (): void => {
  let mockSocket: Partial<Socket>;
  // Handlers the hook registered via socket.on(), keyed by event name
  let socketEventHandlers: Record<string, (...args: unknown[]) => void>;

  beforeEach((): void => {
    socketEventHandlers = {};

    mockSocket = {
      on: vi.fn((event: string, handler: (...args: unknown[]) => void) => {
        socketEventHandlers[event] = handler;
        return mockSocket;
      }) as unknown as Socket["on"],
      off: vi.fn(() => mockSocket) as unknown as Socket["off"],
      emit: vi.fn() as unknown as Socket["emit"],
      connect: vi.fn(),
      disconnect: vi.fn(),
      connected: true,
    };

    (io as unknown as ReturnType<typeof vi.fn>).mockReturnValue(mockSocket);

    // Reset MediaRecorder mock state
    mockMediaRecorder.state = "inactive";
    mockMediaRecorder.ondataavailable = null;
    mockMediaRecorder.onstop = null;
    mockMediaRecorder.onerror = null;

    // Default: getUserMedia succeeds
    const mockStream = {
      getTracks: vi.fn(() => [{ stop: vi.fn() }]),
    } as unknown as MediaStream;
    mockGetUserMedia.mockResolvedValue(mockStream);
  });

  afterEach((): void => {
    vi.clearAllMocks();
  });

  it("should return the correct interface", (): void => {
    const { result } = renderHook(() => useVoiceInput());

    expect(result.current).toHaveProperty("isRecording");
    expect(result.current).toHaveProperty("startRecording");
    expect(result.current).toHaveProperty("stopRecording");
    expect(result.current).toHaveProperty("transcript");
    expect(result.current).toHaveProperty("partialTranscript");
    expect(result.current).toHaveProperty("error");
    expect(result.current).toHaveProperty("audioLevel");
  });

  it("should start with default state", (): void => {
    const { result } = renderHook(() => useVoiceInput());

    expect(result.current.isRecording).toBe(false);
    expect(result.current.transcript).toBe("");
    expect(result.current.partialTranscript).toBe("");
    expect(result.current.error).toBeNull();
    expect(result.current.audioLevel).toBe(0);
  });

  it("should start recording when startRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(result.current.isRecording).toBe(true);
    expect(mockGetUserMedia).toHaveBeenCalledWith({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 16000,
      },
    });
  });

  it("should stop recording when stopRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(result.current.isRecording).toBe(true);

    act(() => {
      result.current.stopRecording();
    });

    expect(result.current.isRecording).toBe(false);
    // The active recorder itself must be stopped, not just the state flag.
    expect(mockMediaRecorder.stop).toHaveBeenCalled();
  });

  it("should set error when microphone access is denied", async (): Promise<void> => {
    mockGetUserMedia.mockRejectedValueOnce(
      new DOMException("Permission denied", "NotAllowedError")
    );

    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(result.current.isRecording).toBe(false);
    expect(result.current.error).toBeTruthy();
    expect(result.current.error).toContain("microphone");
  });

  it("should connect to speech WebSocket namespace", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(io).toHaveBeenCalledWith(
      expect.any(String),
      expect.objectContaining({
        path: "/socket.io",
      })
    );
  });

  it("should emit start-transcription when recording begins", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(mockSocket.emit).toHaveBeenCalledWith(
      "start-transcription",
      expect.objectContaining({
        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
        format: expect.any(String),
      })
    );
  });

  it("should emit stop-transcription when recording stops", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      result.current.stopRecording();
    });

    expect(mockSocket.emit).toHaveBeenCalledWith("stop-transcription");
  });

  it("should handle partial transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-partial"]?.({
        text: "hello world",
      });
    });

    await waitFor(() => {
      expect(result.current.partialTranscript).toBe("hello world");
    });
  });

  it("should handle final transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "hello world final",
      });
    });

    await waitFor(() => {
      expect(result.current.transcript).toBe("hello world final");
    });
  });

  it("should handle transcription error events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-error"]?.({
        message: "Transcription failed",
      });
    });

    await waitFor(() => {
      expect(result.current.error).toBe("Transcription failed");
    });
  });

  it("should call onTranscript callback when final transcription received", async (): Promise<void> => {
    const onTranscript = vi.fn();
    const { result } = renderHook(() => useVoiceInput({ onTranscript }));

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "final text",
      });
    });

    await waitFor(() => {
      expect(onTranscript).toHaveBeenCalledWith("final text");
    });
  });

  it("should clean up on unmount", async (): Promise<void> => {
    const { result, unmount } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    unmount();

    expect(mockSocket.disconnect).toHaveBeenCalled();
  });

  it("should not start recording if already recording", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    // Reset the call count
    mockGetUserMedia.mockClear();

    await act(async () => {
      await result.current.startRecording();
    });

    // Should not have called getUserMedia again
    expect(mockGetUserMedia).not.toHaveBeenCalled();
  });

  describe("REST fallback", (): void => {
    it("should fall back to REST when WebSocket is unavailable", async (): Promise<void> => {
      // Simulate socket not connecting
      (mockSocket as { connected: boolean }).connected = false;

      const { result } = renderHook(() => useVoiceInput({ useWebSocket: false }));

      // Should still be able to start recording (REST mode)
      await act(async () => {
        await result.current.startRecording();
      });

      expect(result.current.isRecording).toBe(true);
      // REST mode must not open a socket at all.
      expect(io).not.toHaveBeenCalled();
    });
  });
});
/**
 * Determine the best MIME type for audio recording.
 *
 * Candidates are probed in order of preference and the first one the
 * environment's `MediaRecorder` reports as supported is returned.
 *
 * @returns The preferred supported MIME type, or `"audio/webm"` when
 *   `MediaRecorder` is missing, does not expose `isTypeSupported`, or
 *   supports none of the candidates.
 */
function getAudioMimeType(): string {
  const fallbackType = "audio/webm";

  // Some environments (SSR, older browsers, polyfills) have no MediaRecorder
  // or one without the static isTypeSupported(); probing would throw there.
  if (
    typeof MediaRecorder === "undefined" ||
    typeof MediaRecorder.isTypeSupported !== "function"
  ) {
    return fallbackType;
  }

  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
  const supported = candidates.find((type) => MediaRecorder.isTypeSupported(type));

  return supported ?? fallbackType;
}
/**
 * Hook for microphone capture and real-time speech-to-text transcription.
 *
 * Uses WebSocket streaming by default for real-time partial transcriptions.
 * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
 * is disabled or not connected when the recording stops.
 *
 * Resource handling:
 * - Every failure path (permission denied, recorder error, setup exception)
 *   releases the microphone, the audio analyser and the socket.
 * - After `stopRecording` the session's own socket is kept open briefly so the
 *   server can deliver the final result, then that specific socket is closed;
 *   a session started in the meantime is not affected.
 *
 * @param options - Transcript callback, transport selection and sample rate.
 * @returns Recording state, transcripts, error message and start/stop controls.
 */
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;

  // How long a stopped session's socket stays open to receive the final result
  const socketLingerMs = 500;

  const [isRecording, setIsRecording] = useState(false);
  const [transcript, setTranscript] = useState("");
  const [partialTranscript, setPartialTranscript] = useState("");
  const [error, setError] = useState<string | null>(null);
  const [audioLevel, setAudioLevel] = useState(0);

  // Refs to hold mutable state without re-renders
  const socketRef = useRef<Socket | null>(null);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const analyserRef = useRef<AnalyserNode | null>(null);
  const animationFrameRef = useRef<number | null>(null);
  const onTranscriptRef = useRef(onTranscript);
  const recordedChunksRef = useRef<Blob[]>([]);
  const isRecordingRef = useRef(false);
  // True while startRecording is awaiting setup; blocks concurrent starts
  const isStartingRef = useRef(false);
  const isMountedRef = useRef(true);
  // Sockets of stopped sessions that are waiting for their delayed disconnect
  const lingeringSocketsRef = useRef<Map<Socket, ReturnType<typeof setTimeout>>>(new Map());

  // Keep callback ref up to date
  useEffect(() => {
    onTranscriptRef.current = onTranscript;
  }, [onTranscript]);

  /**
   * Set up audio analysis for visualizing input level
   */
  const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
    try {
      const audioContext = new AudioContext();
      // Register the context immediately so cleanup closes it even if a later
      // step in this block throws.
      audioContextRef.current = audioContext;

      const analyser = audioContext.createAnalyser();
      const source = audioContext.createMediaStreamSource(stream);

      analyser.fftSize = 256;
      source.connect(analyser);
      analyserRef.current = analyser;

      // Start level monitoring
      const dataArray = new Uint8Array(analyser.frequencyBinCount);

      const updateLevel = (): void => {
        if (!isRecordingRef.current) {
          return;
        }

        analyser.getByteFrequencyData(dataArray);

        // Average of the byte magnitudes, normalized to 0-1
        let sum = 0;
        for (const value of dataArray) {
          sum += value;
        }
        setAudioLevel(sum / dataArray.length / 255);

        animationFrameRef.current = requestAnimationFrame(updateLevel);
      };

      animationFrameRef.current = requestAnimationFrame(updateLevel);
    } catch {
      // Audio analysis is non-critical; continue without it
      console.warn("Audio analysis not available");
    }
  }, []);

  /**
   * Clean up audio analysis resources
   */
  const cleanupAudioAnalysis = useCallback((): void => {
    if (animationFrameRef.current !== null) {
      cancelAnimationFrame(animationFrameRef.current);
      animationFrameRef.current = null;
    }
    if (audioContextRef.current) {
      void audioContextRef.current.close();
      audioContextRef.current = null;
    }
    analyserRef.current = null;
    setAudioLevel(0);
  }, []);

  /**
   * Connect to the speech WebSocket namespace
   */
  const connectSocket = useCallback((): Socket => {
    const socket = io(API_BASE_URL, {
      path: "/socket.io",
      transports: ["websocket", "polling"],
    });

    socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
      setPartialTranscript(data.text);
    });

    socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
      setTranscript(data.text);
      setPartialTranscript("");
      onTranscriptRef.current?.(data.text);
    });

    socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
      setError(data.message);
    });

    socketRef.current = socket;
    return socket;
  }, []);

  /**
   * Detach the transcription listeners from `socket` and disconnect it.
   * Only clears `socketRef` when it still points at this socket.
   */
  const releaseSocket = useCallback((socket: Socket): void => {
    socket.off("transcription-partial");
    socket.off("transcription-final");
    socket.off("transcription-error");
    socket.disconnect();
    if (socketRef.current === socket) {
      socketRef.current = null;
    }
  }, []);

  /**
   * Send recorded audio via REST API as fallback
   */
  const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
    try {
      // Name the upload after the container that was actually recorded.
      let extension = "webm";
      if (audioBlob.type.includes("ogg")) {
        extension = "ogg";
      } else if (audioBlob.type.includes("mp4")) {
        extension = "mp4";
      }

      const formData = new FormData();
      formData.append("audio", audioBlob, `recording.${extension}`);

      const response = await apiPostFormData<TranscribeResponse>(
        "/api/speech/transcribe",
        formData
      );

      if (response.data.text) {
        setTranscript(response.data.text);
        setPartialTranscript("");
        onTranscriptRef.current?.(response.data.text);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : "Transcription request failed";
      setError(message);
    }
  }, []);

  /**
   * Stop all media tracks on the stream
   */
  const stopMediaTracks = useCallback((): void => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => {
        track.stop();
      });
      streamRef.current = null;
    }
  }, []);

  /**
   * Release everything a session holds without the graceful socket linger.
   * Used when a session fails during setup or while recording.
   */
  const abortSession = useCallback((): void => {
    isRecordingRef.current = false;
    mediaRecorderRef.current = null;
    stopMediaTracks();
    cleanupAudioAnalysis();
    const socket = socketRef.current;
    if (socket) {
      releaseSocket(socket);
    }
  }, [stopMediaTracks, cleanupAudioAnalysis, releaseSocket]);

  /**
   * Start microphone capture and transcription
   */
  const startRecording = useCallback(async (): Promise<void> => {
    // Prevent double-start, including a second call while the first is still
    // waiting for microphone permission
    if (isRecordingRef.current || isStartingRef.current) {
      return;
    }
    isStartingRef.current = true;

    setError(null);
    setPartialTranscript("");
    recordedChunksRef.current = [];

    try {
      // Request microphone access
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          sampleRate,
        },
      });

      // The component may have unmounted while the permission prompt was open;
      // release the microphone right away in that case.
      if (!isMountedRef.current) {
        stream.getTracks().forEach((track) => {
          track.stop();
        });
        return;
      }

      streamRef.current = stream;

      // Set up audio level visualization
      setupAudioAnalysis(stream);

      // Determine MIME type
      const mimeType = getAudioMimeType();

      // Create MediaRecorder
      const mediaRecorder = new MediaRecorder(stream, { mimeType });
      mediaRecorderRef.current = mediaRecorder;

      // Connect WebSocket if enabled
      let socket: Socket | null = null;
      if (useWs) {
        socket = connectSocket();

        // Emit start-transcription event
        socket.emit("start-transcription", {
          format: mimeType,
          sampleRate,
        });
      }

      // Handle audio data chunks
      mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
        if (event.data.size === 0) {
          return;
        }
        if (socket?.connected) {
          // Stream chunks via WebSocket
          socket.emit("audio-chunk", event.data);
        } else {
          // Collect chunks for REST upload
          recordedChunksRef.current.push(event.data);
        }
      });

      // Handle recording stop
      mediaRecorder.addEventListener("stop", () => {
        // If using REST fallback, send collected audio
        const useRest = !useWs || !socket?.connected;
        if (useRest && recordedChunksRef.current.length > 0) {
          const audioBlob = new Blob(recordedChunksRef.current, {
            type: mimeType,
          });
          void sendAudioViaRest(audioBlob);
        }
      });

      // Handle errors: report, then free the microphone and the socket
      mediaRecorder.addEventListener("error", () => {
        setError("Recording encountered an issue. Please try again.");
        setIsRecording(false);
        abortSession();
      });

      // Start recording with timeslice for streaming chunks (250ms intervals)
      mediaRecorder.start(250);
      setIsRecording(true);
      isRecordingRef.current = true;
    } catch (err) {
      // Handle specific error types
      if (err instanceof DOMException) {
        if (err.name === "NotAllowedError") {
          setError(
            "Microphone access was not granted. Please allow microphone access to use voice input."
          );
        } else if (err.name === "NotFoundError") {
          setError("No microphone found. Please connect a microphone and try again.");
        } else {
          setError("Unable to access the microphone. Please check your device settings.");
        }
      } else {
        setError("Unable to start voice input. Please try again.");
      }

      // Clean up on failure (also closes a socket opened before the failure)
      abortSession();
    } finally {
      isStartingRef.current = false;
    }
  }, [useWs, sampleRate, setupAudioAnalysis, connectSocket, sendAudioViaRest, abortSession]);

  /**
   * Stop microphone capture and transcription
   */
  const stopRecording = useCallback((): void => {
    setIsRecording(false);
    isRecordingRef.current = false;

    // Stop MediaRecorder
    const recorder = mediaRecorderRef.current;
    mediaRecorderRef.current = null;
    if (recorder && recorder.state !== "inactive") {
      recorder.stop();
    }

    // Stop media tracks
    stopMediaTracks();

    // Clean up audio analysis
    cleanupAudioAnalysis();

    // Emit stop event and disconnect WebSocket
    const socket = socketRef.current;
    if (socket) {
      // Detach from the ref now so a session started during the linger period
      // gets its own socket and is not torn down by this timer.
      socketRef.current = null;
      socket.emit("stop-transcription");

      // Give the server a moment to process the final chunk before disconnecting
      const lingering = lingeringSocketsRef.current;
      const timer = setTimeout(() => {
        lingering.delete(socket);
        releaseSocket(socket);
      }, socketLingerMs);
      lingering.set(socket, timer);
    }
  }, [stopMediaTracks, cleanupAudioAnalysis, releaseSocket]);

  // Cleanup on unmount
  useEffect(() => {
    isMountedRef.current = true;
    const lingering = lingeringSocketsRef.current;

    return (): void => {
      isMountedRef.current = false;
      isRecordingRef.current = false;

      const recorder = mediaRecorderRef.current;
      mediaRecorderRef.current = null;
      if (recorder && recorder.state !== "inactive") {
        recorder.stop();
      }

      stopMediaTracks();
      cleanupAudioAnalysis();

      const socket = socketRef.current;
      if (socket) {
        releaseSocket(socket);
      }

      // Sockets still waiting for their delayed disconnect are closed now.
      for (const [pendingSocket, timer] of lingering) {
        clearTimeout(timer);
        releaseSocket(pendingSocket);
      }
      lingering.clear();
    };
  }, [stopMediaTracks, cleanupAudioAnalysis, releaseSocket]);

  return {
    isRecording,
    startRecording,
    stopRecording,
    transcript,
    partialTranscript,
    error,
    audioLevel,
  };
}
preview_url?: string;
}

/**
 * JSON body sent to `POST /api/speech/synthesize`.
 * Only `text` is required; the remaining fields are optional.
 */
export interface SynthesizeOptions {
  text: string;
  voice?: string;
  speed?: number;
  format?: string;
  tier?: string;
}

/**
 * Response envelope for the voice listing: the voices are carried in `data`.
 */
export interface VoicesResponse {
  data: VoiceInfo[];
}

/**
 * Fetch available TTS voices
 *
 * @returns The response of `GET /api/speech/voices`.
 */
export async function getVoices(): Promise<VoicesResponse> {
  return apiGet<VoicesResponse>("/api/speech/voices");
}

/**
 * Synthesize text to speech audio
 * Returns the audio as a Blob since the API returns binary audio data
 *
 * @param options - Synthesis request; serialized as the JSON request body.
 * @returns The binary audio returned by the API.
 * @throws Error when the server answers with a non-2xx status. The message
 *   carries the response body, or the HTTP status when the body is empty.
 */
export async function synthesizeSpeech(options: SynthesizeOptions): Promise<Blob> {
  const url = `${API_BASE_URL}/api/speech/synthesize`;

  const response = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    credentials: "include",
    body: JSON.stringify(options),
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => "Unknown error");
    // An empty error body would otherwise yield a message with no detail.
    const detail = errorText || `HTTP ${response.status}`;
    throw new Error(`Speech synthesis failed: ${detail}`);
  }

  return response.blob();
}