fix(#388): address PR review findings — fix WebSocket/REST bugs, improve error handling, fix types and comments
Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch for NotAllowedError and DOMException handling
- Replace bare catch block with named error parameter and descriptive message
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -34,7 +34,7 @@ speech/
|
|||||||
└── providers/
|
└── providers/
|
||||||
├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic)
|
├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic)
|
||||||
├── base-tts.provider.spec.ts
|
├── base-tts.provider.spec.ts
|
||||||
├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages)
|
├── kokoro-tts.provider.ts # Default tier (CPU, 53 voices, 8 languages)
|
||||||
├── kokoro-tts.provider.spec.ts
|
├── kokoro-tts.provider.spec.ts
|
||||||
├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control)
|
├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control)
|
||||||
├── chatterbox-tts.provider.spec.ts
|
├── chatterbox-tts.provider.spec.ts
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
* SynthesizeDto
|
* SynthesizeDto
|
||||||
*
|
*
|
||||||
* DTO for text-to-speech synthesis requests.
|
* DTO for text-to-speech synthesis requests.
|
||||||
* The text field is validated by TextValidationPipe for length/emptiness.
|
* Text and option fields are validated by class-validator decorators.
|
||||||
* Additional options control voice, speed, format, and tier selection.
|
* Additional options control voice, speed, format, and tier selection.
|
||||||
*
|
*
|
||||||
* Issue #398
|
* Issue #398
|
||||||
@@ -10,29 +10,13 @@
|
|||||||
|
|
||||||
import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
|
import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
|
||||||
import { Type } from "class-transformer";
|
import { Type } from "class-transformer";
|
||||||
|
import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
|
||||||
import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";
|
import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";
|
||||||
|
|
||||||
/**
|
|
||||||
* Valid audio output formats for TTS synthesis.
|
|
||||||
*/
|
|
||||||
const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [
|
|
||||||
"mp3",
|
|
||||||
"wav",
|
|
||||||
"opus",
|
|
||||||
"flac",
|
|
||||||
"aac",
|
|
||||||
"pcm",
|
|
||||||
] as const;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Valid TTS tiers for provider selection.
|
|
||||||
*/
|
|
||||||
const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const;
|
|
||||||
|
|
||||||
export class SynthesizeDto {
|
export class SynthesizeDto {
|
||||||
/**
|
/**
|
||||||
* Text to convert to speech.
|
* Text to convert to speech.
|
||||||
* Validated separately by TextValidationPipe for length and emptiness.
|
* Validated by class-validator decorators for type and maximum length.
|
||||||
*/
|
*/
|
||||||
@IsString({ message: "text must be a string" })
|
@IsString({ message: "text must be a string" })
|
||||||
@MaxLength(4096, { message: "text must not exceed 4096 characters" })
|
@MaxLength(4096, { message: "text must not exceed 4096 characters" })
|
||||||
@@ -66,8 +50,8 @@ export class SynthesizeDto {
|
|||||||
*/
|
*/
|
||||||
@IsOptional()
|
@IsOptional()
|
||||||
@IsString({ message: "format must be a string" })
|
@IsString({ message: "format must be a string" })
|
||||||
@IsIn(VALID_AUDIO_FORMATS, {
|
@IsIn(AUDIO_FORMATS, {
|
||||||
message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`,
|
message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`,
|
||||||
})
|
})
|
||||||
format?: AudioFormat;
|
format?: AudioFormat;
|
||||||
|
|
||||||
@@ -78,8 +62,8 @@ export class SynthesizeDto {
|
|||||||
*/
|
*/
|
||||||
@IsOptional()
|
@IsOptional()
|
||||||
@IsString({ message: "tier must be a string" })
|
@IsString({ message: "tier must be a string" })
|
||||||
@IsIn(VALID_SPEECH_TIERS, {
|
@IsIn(SPEECH_TIERS, {
|
||||||
message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`,
|
message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`,
|
||||||
})
|
})
|
||||||
tier?: SpeechTier;
|
tier?: SpeechTier;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
export type { ISTTProvider } from "./stt-provider.interface";
|
export type { ISTTProvider } from "./stt-provider.interface";
|
||||||
export type { ITTSProvider } from "./tts-provider.interface";
|
export type { ITTSProvider } from "./tts-provider.interface";
|
||||||
|
export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types";
|
||||||
export type {
|
export type {
|
||||||
SpeechTier,
|
SpeechTier,
|
||||||
AudioFormat,
|
AudioFormat,
|
||||||
|
|||||||
@@ -12,19 +12,21 @@
|
|||||||
// ==========================================
|
// ==========================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TTS provider tier.
|
* Canonical array of TTS provider tiers.
|
||||||
* Determines which TTS engine is used for synthesis.
|
* Determines which TTS engine is used for synthesis.
|
||||||
*
|
*
|
||||||
* - default: Primary TTS engine (e.g., Kokoro)
|
* - default: Primary TTS engine (e.g., Kokoro)
|
||||||
* - premium: Higher quality TTS engine (e.g., Chatterbox)
|
* - premium: Higher quality TTS engine (e.g., Chatterbox)
|
||||||
* - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
|
* - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
|
||||||
*/
|
*/
|
||||||
export type SpeechTier = "default" | "premium" | "fallback";
|
export const SPEECH_TIERS = ["default", "premium", "fallback"] as const;
|
||||||
|
export type SpeechTier = (typeof SPEECH_TIERS)[number];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Audio output format for TTS synthesis.
|
* Canonical array of audio output formats for TTS synthesis.
|
||||||
*/
|
*/
|
||||||
export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm";
|
export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const;
|
||||||
|
export type AudioFormat = (typeof AUDIO_FORMATS)[number];
|
||||||
|
|
||||||
// ==========================================
|
// ==========================================
|
||||||
// STT Types
|
// STT Types
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
|
|||||||
*
|
*
|
||||||
* @example
|
* @example
|
||||||
* ```typescript
|
* ```typescript
|
||||||
* class SpeachesProvider implements ISTTProvider {
|
* class SpeachesSttProvider implements ISTTProvider {
|
||||||
* readonly name = "speaches";
|
* readonly name = "speaches";
|
||||||
*
|
*
|
||||||
* async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
|
* async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
* CPU-based, always available, Apache 2.0 license.
|
* CPU-based, always available, Apache 2.0 license.
|
||||||
*
|
*
|
||||||
* Features:
|
* Features:
|
||||||
* - 54 built-in voices across 8 languages
|
* - 53 built-in voices across 8 languages
|
||||||
* - Speed control: 0.25x to 4.0x
|
* - Speed control: 0.25x to 4.0x
|
||||||
* - Output formats: mp3, wav, opus, flac
|
* - Output formats: mp3, wav, opus, flac
|
||||||
* - Voice metadata derived from ID prefix (language, gender, accent)
|
* - Voice metadata derived from ID prefix (language, gender, accent)
|
||||||
@@ -222,7 +222,7 @@ export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
|
|||||||
/**
|
/**
|
||||||
* Kokoro-FastAPI TTS provider (default tier).
|
* Kokoro-FastAPI TTS provider (default tier).
|
||||||
*
|
*
|
||||||
* CPU-based text-to-speech engine with 54 built-in voices across 8 languages.
|
* CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
|
||||||
* Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
|
* Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
|
||||||
*
|
*
|
||||||
* @example
|
* @example
|
||||||
@@ -254,7 +254,7 @@ export class KokoroTtsProvider extends BaseTTSProvider {
|
|||||||
/**
|
/**
|
||||||
* List all available Kokoro voices with metadata.
|
* List all available Kokoro voices with metadata.
|
||||||
*
|
*
|
||||||
* Returns the full catalog of 54 built-in voices with language, gender,
|
* Returns the full catalog of 53 built-in voices with language, gender,
|
||||||
* and accent information derived from voice ID prefixes.
|
* and accent information derived from voice ID prefixes.
|
||||||
*
|
*
|
||||||
* @returns Array of VoiceInfo objects for all Kokoro voices
|
* @returns Array of VoiceInfo objects for all Kokoro voices
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
* - OpenAI-compatible API via OpenedAI Speech server
|
* - OpenAI-compatible API via OpenedAI Speech server
|
||||||
* - 100+ Piper voices across 40+ languages
|
* - 100+ Piper voices across 40+ languages
|
||||||
* - 6 standard OpenAI voice names mapped to Piper voices
|
* - 6 standard OpenAI voice names mapped to Piper voices
|
||||||
* - Output formats: mp3, wav, opus, flac, aac, pcm
|
* - Output formats: mp3, wav, opus, flac
|
||||||
* - CPU-only, no GPU required
|
* - CPU-only, no GPU required
|
||||||
* - GPL license (via OpenedAI Speech)
|
* - GPL license (via OpenedAI Speech)
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
|
|||||||
import { KokoroTtsProvider } from "./kokoro-tts.provider";
|
import { KokoroTtsProvider } from "./kokoro-tts.provider";
|
||||||
import { PiperTtsProvider } from "./piper-tts.provider";
|
import { PiperTtsProvider } from "./piper-tts.provider";
|
||||||
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
|
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
|
||||||
import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
|
import type { SpeechTier } from "../interfaces/speech-types";
|
||||||
import type { SpeechConfig } from "../speech.config";
|
import type { SpeechConfig } from "../speech.config";
|
||||||
|
|
||||||
// ==========================================
|
// ==========================================
|
||||||
@@ -44,7 +44,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
|
|||||||
const provider = new KokoroTtsProvider(
|
const provider = new KokoroTtsProvider(
|
||||||
config.tts.default.url,
|
config.tts.default.url,
|
||||||
config.tts.default.voice,
|
config.tts.default.voice,
|
||||||
config.tts.default.format as AudioFormat
|
config.tts.default.format
|
||||||
);
|
);
|
||||||
providers.set("default", provider);
|
providers.set("default", provider);
|
||||||
logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`);
|
logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`);
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
import { registerAs } from "@nestjs/config";
|
import { registerAs } from "@nestjs/config";
|
||||||
|
import type { AudioFormat } from "./interfaces/speech-types";
|
||||||
|
|
||||||
// ==========================================
|
// ==========================================
|
||||||
// Default values
|
// Default values
|
||||||
@@ -58,7 +59,7 @@ export interface TtsDefaultConfig {
|
|||||||
enabled: boolean;
|
enabled: boolean;
|
||||||
url: string;
|
url: string;
|
||||||
voice: string;
|
voice: string;
|
||||||
format: string;
|
format: AudioFormat;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TtsPremiumConfig {
|
export interface TtsPremiumConfig {
|
||||||
@@ -247,7 +248,7 @@ export function getSpeechConfig(): SpeechConfig {
|
|||||||
enabled: isTtsEnabled(),
|
enabled: isTtsEnabled(),
|
||||||
url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url,
|
url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url,
|
||||||
voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice,
|
voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice,
|
||||||
format: process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format,
|
format: (process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format) as AudioFormat,
|
||||||
},
|
},
|
||||||
premium: {
|
premium: {
|
||||||
enabled: isTtsPremiumEnabled(),
|
enabled: isTtsPremiumEnabled(),
|
||||||
|
|||||||
@@ -100,6 +100,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
|||||||
const timeoutId = setTimeout(() => {
|
const timeoutId = setTimeout(() => {
|
||||||
if (!authenticatedClient.data.userId) {
|
if (!authenticatedClient.data.userId) {
|
||||||
this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`);
|
this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`);
|
||||||
|
authenticatedClient.emit("transcription-error", {
|
||||||
|
message: "Authentication timed out.",
|
||||||
|
});
|
||||||
authenticatedClient.disconnect();
|
authenticatedClient.disconnect();
|
||||||
}
|
}
|
||||||
}, this.CONNECTION_TIMEOUT_MS);
|
}, this.CONNECTION_TIMEOUT_MS);
|
||||||
@@ -109,6 +112,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
|||||||
|
|
||||||
if (!token) {
|
if (!token) {
|
||||||
this.logger.warn(`Client ${authenticatedClient.id} connected without token`);
|
this.logger.warn(`Client ${authenticatedClient.id} connected without token`);
|
||||||
|
authenticatedClient.emit("transcription-error", {
|
||||||
|
message: "Authentication failed: no token provided.",
|
||||||
|
});
|
||||||
authenticatedClient.disconnect();
|
authenticatedClient.disconnect();
|
||||||
clearTimeout(timeoutId);
|
clearTimeout(timeoutId);
|
||||||
return;
|
return;
|
||||||
@@ -118,6 +124,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
|||||||
|
|
||||||
if (!sessionData) {
|
if (!sessionData) {
|
||||||
this.logger.warn(`Client ${authenticatedClient.id} has invalid token`);
|
this.logger.warn(`Client ${authenticatedClient.id} has invalid token`);
|
||||||
|
authenticatedClient.emit("transcription-error", {
|
||||||
|
message: "Authentication failed: invalid or expired token.",
|
||||||
|
});
|
||||||
authenticatedClient.disconnect();
|
authenticatedClient.disconnect();
|
||||||
clearTimeout(timeoutId);
|
clearTimeout(timeoutId);
|
||||||
return;
|
return;
|
||||||
@@ -133,6 +142,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
|||||||
|
|
||||||
if (!workspaceMembership) {
|
if (!workspaceMembership) {
|
||||||
this.logger.warn(`User ${userId} has no workspace access`);
|
this.logger.warn(`User ${userId} has no workspace access`);
|
||||||
|
authenticatedClient.emit("transcription-error", {
|
||||||
|
message: "Authentication failed: no workspace access.",
|
||||||
|
});
|
||||||
authenticatedClient.disconnect();
|
authenticatedClient.disconnect();
|
||||||
clearTimeout(timeoutId);
|
clearTimeout(timeoutId);
|
||||||
return;
|
return;
|
||||||
@@ -151,6 +163,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
|
|||||||
`Authentication failed for speech client ${authenticatedClient.id}:`,
|
`Authentication failed for speech client ${authenticatedClient.id}:`,
|
||||||
error instanceof Error ? error.message : "Unknown error"
|
error instanceof Error ? error.message : "Unknown error"
|
||||||
);
|
);
|
||||||
|
authenticatedClient.emit("transcription-error", {
|
||||||
|
message: "Authentication failed: an unexpected error occurred.",
|
||||||
|
});
|
||||||
authenticatedClient.disconnect();
|
authenticatedClient.disconnect();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ export interface TextToSpeechButtonProps {
|
|||||||
text: string;
|
text: string;
|
||||||
/** Optional voice ID to use */
|
/** Optional voice ID to use */
|
||||||
voice?: string;
|
voice?: string;
|
||||||
/** Optional tier (e.g. "standard", "premium") */
|
/** Optional tier (e.g. "default", "premium", "fallback") */
|
||||||
tier?: string;
|
tier?: string;
|
||||||
/** Optional className for the container */
|
/** Optional className for the container */
|
||||||
className?: string;
|
className?: string;
|
||||||
|
|||||||
@@ -173,8 +173,17 @@ export function useTextToSpeech(): UseTextToSpeechReturn {
|
|||||||
const play = useCallback(async (): Promise<void> => {
|
const play = useCallback(async (): Promise<void> => {
|
||||||
const audio = audioRef.current;
|
const audio = audioRef.current;
|
||||||
if (audio) {
|
if (audio) {
|
||||||
await audio.play();
|
try {
|
||||||
setIsPlaying(true);
|
await audio.play();
|
||||||
|
setIsPlaying(true);
|
||||||
|
} catch (err) {
|
||||||
|
const message =
|
||||||
|
err instanceof DOMException && err.name === "NotAllowedError"
|
||||||
|
? "Playback was blocked by the browser. Try interacting with the page first."
|
||||||
|
: "Unable to play audio. The format may not be supported.";
|
||||||
|
setError(message);
|
||||||
|
setIsPlaying(false);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
/**
|
/**
|
||||||
* useVoiceInput hook
|
* useVoiceInput hook
|
||||||
*
|
*
|
||||||
* Custom hook for microphone capture and real-time transcription.
|
* Custom hook for microphone capture and speech-to-text transcription.
|
||||||
* Supports WebSocket streaming for real-time partial transcriptions
|
* Supports WebSocket streaming with batch transcription on stop,
|
||||||
* with REST upload fallback when WebSocket is unavailable.
|
* with REST upload fallback when WebSocket is unavailable.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@@ -20,6 +20,8 @@ export interface UseVoiceInputOptions {
|
|||||||
useWebSocket?: boolean;
|
useWebSocket?: boolean;
|
||||||
/** Audio sample rate in Hz (default: 16000) */
|
/** Audio sample rate in Hz (default: 16000) */
|
||||||
sampleRate?: number;
|
sampleRate?: number;
|
||||||
|
/** Authentication token for WebSocket connection */
|
||||||
|
token?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Return type for the useVoiceInput hook */
|
/** Return type for the useVoiceInput hook */
|
||||||
@@ -75,14 +77,14 @@ function getAudioMimeType(): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hook for microphone capture and real-time speech-to-text transcription.
|
* Hook for microphone capture and speech-to-text transcription.
|
||||||
*
|
*
|
||||||
* Uses WebSocket streaming by default for real-time partial transcriptions.
|
* Uses WebSocket streaming by default with batch transcription on stop.
|
||||||
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
|
* Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
|
||||||
* is disabled or unavailable.
|
* is disabled or unavailable.
|
||||||
*/
|
*/
|
||||||
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
|
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
|
||||||
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
|
const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;
|
||||||
|
|
||||||
const [isRecording, setIsRecording] = useState(false);
|
const [isRecording, setIsRecording] = useState(false);
|
||||||
const [transcript, setTranscript] = useState("");
|
const [transcript, setTranscript] = useState("");
|
||||||
@@ -143,9 +145,12 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
|||||||
};
|
};
|
||||||
|
|
||||||
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
animationFrameRef.current = requestAnimationFrame(updateLevel);
|
||||||
} catch {
|
} catch (err) {
|
||||||
// Audio analysis is non-critical; continue without it
|
// Audio analysis is non-critical; continue without it
|
||||||
console.warn("Audio analysis not available");
|
console.warn(
|
||||||
|
"Audio level visualization unavailable:",
|
||||||
|
err instanceof Error ? err.message : String(err)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
@@ -169,11 +174,14 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
|||||||
* Connect to the speech WebSocket namespace
|
* Connect to the speech WebSocket namespace
|
||||||
*/
|
*/
|
||||||
const connectSocket = useCallback((): Socket => {
|
const connectSocket = useCallback((): Socket => {
|
||||||
const socket = io(API_BASE_URL, {
|
const socket = io(`${API_BASE_URL}/speech`, {
|
||||||
path: "/socket.io",
|
path: "/socket.io",
|
||||||
transports: ["websocket", "polling"],
|
transports: ["websocket", "polling"],
|
||||||
|
...(token ? { auth: { token } } : {}),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Future use: the gateway does not currently emit transcription-partial,
|
||||||
|
// but the listener is registered for when real-time partial transcription is added.
|
||||||
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
|
socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
|
||||||
setPartialTranscript(data.text);
|
setPartialTranscript(data.text);
|
||||||
});
|
});
|
||||||
@@ -188,9 +196,19 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
|||||||
setError(data.message);
|
setError(data.message);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
socket.on("connect_error", (err: Error) => {
|
||||||
|
setError(`WebSocket connection failed: ${err.message}`);
|
||||||
|
});
|
||||||
|
|
||||||
|
socket.on("disconnect", (reason: string) => {
|
||||||
|
if (reason !== "io client disconnect") {
|
||||||
|
setError(`WebSocket disconnected unexpectedly: ${reason}`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
socketRef.current = socket;
|
socketRef.current = socket;
|
||||||
return socket;
|
return socket;
|
||||||
}, []);
|
}, [token]);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Disconnect the WebSocket
|
* Disconnect the WebSocket
|
||||||
@@ -200,6 +218,8 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
|||||||
socketRef.current.off("transcription-partial");
|
socketRef.current.off("transcription-partial");
|
||||||
socketRef.current.off("transcription-final");
|
socketRef.current.off("transcription-final");
|
||||||
socketRef.current.off("transcription-error");
|
socketRef.current.off("transcription-error");
|
||||||
|
socketRef.current.off("connect_error");
|
||||||
|
socketRef.current.off("disconnect");
|
||||||
socketRef.current.disconnect();
|
socketRef.current.disconnect();
|
||||||
socketRef.current = null;
|
socketRef.current = null;
|
||||||
}
|
}
|
||||||
@@ -211,7 +231,7 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
|||||||
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
|
const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
|
||||||
try {
|
try {
|
||||||
const formData = new FormData();
|
const formData = new FormData();
|
||||||
formData.append("audio", audioBlob, "recording.webm");
|
formData.append("file", audioBlob, "recording.webm");
|
||||||
|
|
||||||
const response = await apiPostFormData<TranscribeResponse>(
|
const response = await apiPostFormData<TranscribeResponse>(
|
||||||
"/api/speech/transcribe",
|
"/api/speech/transcribe",
|
||||||
@@ -315,10 +335,16 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Handle errors
|
// Handle errors
|
||||||
mediaRecorder.addEventListener("error", () => {
|
mediaRecorder.addEventListener("error", (event: Event) => {
|
||||||
setError("Recording encountered an issue. Please try again.");
|
let errorMessage = "Recording encountered an issue. Please try again.";
|
||||||
|
if ("error" in event && event.error instanceof DOMException) {
|
||||||
|
errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
|
||||||
|
}
|
||||||
|
setError(errorMessage);
|
||||||
setIsRecording(false);
|
setIsRecording(false);
|
||||||
isRecordingRef.current = false;
|
isRecordingRef.current = false;
|
||||||
|
stopMediaTracks();
|
||||||
|
cleanupAudioAnalysis();
|
||||||
});
|
});
|
||||||
|
|
||||||
// Start recording with timeslice for streaming chunks (250ms intervals)
|
// Start recording with timeslice for streaming chunks (250ms intervals)
|
||||||
|
|||||||
@@ -494,7 +494,7 @@ Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values de
|
|||||||
|
|
||||||
**Capabilities:**
|
**Capabilities:**
|
||||||
|
|
||||||
- 54 built-in voices across 8 languages
|
- 53 built-in voices across 8 languages
|
||||||
- Speed control: 0.25x to 4.0x
|
- Speed control: 0.25x to 4.0x
|
||||||
- Output formats: mp3, wav, opus, flac
|
- Output formats: mp3, wav, opus, flac
|
||||||
- Voice metadata derived from ID prefix (language, gender, accent)
|
- Voice metadata derived from ID prefix (language, gender, accent)
|
||||||
|
|||||||
Reference in New Issue
Block a user