Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch for NotAllowedError and DOMException handling
- Replace bare catch block with named error parameter and descriptive message
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
70 lines
2.2 KiB
TypeScript
70 lines
2.2 KiB
TypeScript
/**
 * SynthesizeDto
 *
 * DTO for text-to-speech synthesis requests.
 * Text and option fields are validated by class-validator decorators.
 * Additional options control voice, speed, format, and tier selection.
 *
 * Issue #398
 */
|
|
|
|
import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
|
|
import { Type } from "class-transformer";
|
|
import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
|
|
import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";
|
|
|
|
/**
 * Request payload for a text-to-speech synthesis call.
 *
 * Only `text` is required; `voice`, `speed`, `format`, and `tier` are
 * optional and are checked only when present. All checks are declared
 * with class-validator decorators on the individual properties.
 */
export class SynthesizeDto {
  /**
   * The text to be spoken.
   * Must be a string no longer than 4096 characters.
   */
  @IsString({ message: "text must be a string" })
  @MaxLength(4096, { message: "text must not exceed 4096 characters" })
  text!: string;

  /**
   * Identifier of the voice to synthesize with (string, at most 100 characters).
   * Which voices exist depends on the selected tier and provider.
   * When left out, the default voice from the speech config applies.
   */
  @IsOptional()
  @IsString({ message: "voice must be a string" })
  @MaxLength(100, { message: "voice must not exceed 100 characters" })
  voice?: string;

  /**
   * Playback speed multiplier, constrained to the range 0.5 to 2.0.
   * 1.0 is normal speed; lower values slow speech down, higher values speed it up.
   * The incoming value is converted with `Number` before it is validated.
   */
  @IsOptional()
  @Type(() => Number)
  @IsNumber({}, { message: "speed must be a number" })
  @Min(0.5, { message: "speed must be at least 0.5" })
  @Max(2.0, { message: "speed must not exceed 2.0" })
  speed?: number;

  /**
   * Requested audio output format; must be one of the entries in AUDIO_FORMATS.
   * Supported: mp3, wav, opus, flac, aac, pcm.
   * When left out, the default format from the speech config applies.
   */
  @IsOptional()
  @IsString({ message: "format must be a string" })
  @IsIn(AUDIO_FORMATS, {
    message: "format must be one of: " + AUDIO_FORMATS.join(", "),
  })
  format?: AudioFormat;

  /**
   * TTS tier to synthesize with; must be one of the entries in SPEECH_TIERS.
   * Selects the provider: default (Kokoro), premium (Chatterbox), or fallback (Piper).
   * If the chosen tier is unavailable, the service falls back to the next available tier.
   */
  @IsOptional()
  @IsString({ message: "tier must be a string" })
  @IsIn(SPEECH_TIERS, {
    message: "tier must be one of: " + SPEECH_TIERS.join(", "),
  })
  tier?: SpeechTier;
}
|