Files
stack/apps/api/src/speech/dto/synthesize.dto.ts
Jason Woltje af9c5799af
All checks were successful
ci/woodpecker/push/web Pipeline was successful
ci/woodpecker/push/api Pipeline was successful
fix(#388): address PR review findings — fix WebSocket/REST bugs, improve error handling, fix types and comments
Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch for NotAllowedError and DOMException handling
- Replace bare catch block with named error parameter and descriptive message
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 03:44:33 -06:00

70 lines
2.2 KiB
TypeScript

/**
* SynthesizeDto
*
* DTO for text-to-speech synthesis requests.
* Text and option fields are validated by class-validator decorators.
* Additional options control voice, speed, format, and tier selection.
*
* Issue #398
*/
import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
import { Type } from "class-transformer";
import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";
export class SynthesizeDto {
  /**
   * The text that should be spoken.
   *
   * Required. Must be a string no longer than 4096 characters; both rules
   * are enforced by the class-validator decorators below.
   */
  @IsString({ message: "text must be a string" })
  @MaxLength(4096, { message: "text must not exceed 4096 characters" })
  text!: string;

  /**
   * Identifier of the voice to synthesize with.
   *
   * Optional. When supplied it must be a string of at most 100 characters.
   * Which voice IDs actually exist depends on the tier/provider in use;
   * when omitted, the default voice from the speech config is expected to apply.
   */
  @IsOptional()
  @IsString({ message: "voice must be a string" })
  @MaxLength(100, { message: "voice must not exceed 100 characters" })
  voice?: string;

  /**
   * Playback speed multiplier, constrained to the range 0.5–2.0.
   *
   * Optional. 1.0 means normal speed; lower values slow speech down and
   * higher values speed it up. The `@Type` hint asks class-transformer to
   * convert the incoming value with `Number` before validation runs
   * (this presumes transformation is enabled where the DTO is consumed).
   */
  @IsOptional()
  @Type(() => Number)
  @IsNumber({}, { message: "speed must be a number" })
  @Min(0.5, { message: "speed must be at least 0.5" })
  @Max(2.0, { message: "speed must not exceed 2.0" })
  speed?: number;

  /**
   * Audio container/codec the caller wants back.
   *
   * Optional. Must be one of the entries of `AUDIO_FORMATS`, which is the
   * authoritative list (documented as mp3, wav, opus, flac, aac, pcm).
   * When omitted, the default format from the speech config is expected to apply.
   */
  @IsOptional()
  @IsString({ message: "format must be a string" })
  @IsIn(AUDIO_FORMATS, { message: `format must be one of: ${AUDIO_FORMATS.join(", ")}` })
  format?: AudioFormat;

  /**
   * Which TTS tier should handle the request.
   *
   * Optional. Must be one of the entries of `SPEECH_TIERS`. The tier selects
   * the provider: default (Kokoro), premium (Chatterbox), or fallback (Piper).
   * If the requested tier is unavailable, the service is documented to fall
   * back to the next available tier.
   */
  @IsOptional()
  @IsString({ message: "tier must be a string" })
  @IsIn(SPEECH_TIERS, { message: `tier must be one of: ${SPEECH_TIERS.join(", ")}` })
  tier?: SpeechTier;
}