/**
 * SpeechController
 *
 * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services.
 * Handles audio file uploads for transcription, text-to-speech synthesis,
 * voice listing, and provider health status.
 *
 * Endpoints:
 * - POST /api/speech/transcribe  - Transcribe uploaded audio file to text
 * - POST /api/speech/synthesize  - Synthesize text to audio (TTS)
 * - GET  /api/speech/voices      - List available TTS voices
 * - GET  /api/speech/health      - Check STT/TTS provider availability
 *
 * Issue #392, #396
 */
import {
  Controller,
  Post,
  Get,
  Body,
  Query,
  UseGuards,
  UseInterceptors,
  UploadedFile,
  StreamableFile,
} from "@nestjs/common";
import { FileInterceptor } from "@nestjs/platform-express";
import { SpeechService } from "./speech.service";
import { TranscribeDto } from "./dto/transcribe.dto";
import { SynthesizeDto } from "./dto/synthesize.dto";
import { AudioValidationPipe } from "./pipes/audio-validation.pipe";
import { AuthGuard } from "../auth/guards/auth.guard";
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
import { Workspace, Permission, RequirePermission } from "../common/decorators";
import { CurrentUser } from "../auth/decorators/current-user.decorator";
import type { AuthenticatedUser } from "../common/types/user.types";
import type {
  AudioFormat,
  SynthesizeOptions,
  TranscribeOptions,
  TranscriptionResult,
  VoiceInfo,
  SpeechTier,
} from "./interfaces/speech-types";

/**
 * Map audio format to MIME type for Content-Type header.
 *
 * Keyed by `AudioFormat` so that the lookup in `synthesize` is type-checked
 * against the format reported by the speech service.
 */
const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
  mp3: "audio/mpeg",
  wav: "audio/wav",
  opus: "audio/opus",
  flac: "audio/flac",
  aac: "audio/aac",
  pcm: "audio/pcm",
};

/**
 * Health status for a single speech provider category.
 */
interface ProviderHealth {
  available: boolean;
}

/**
 * Combined health status response for all speech providers.
 */
interface SpeechHealthResponse {
  data: {
    stt: ProviderHealth;
    tts: ProviderHealth;
  };
}

@Controller("speech")
@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
export class SpeechController {
  constructor(private readonly speechService: SpeechService) {}

  /**
   * POST /api/speech/transcribe
   *
   * Transcribe an uploaded audio file to text.
   * Accepts multipart form data with an audio file and optional transcription parameters.
   *
   * @param file - Uploaded audio file (validated by AudioValidationPipe)
   * @param dto - Optional transcription parameters (language, model, prompt, temperature)
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @param _user - Authenticated user (validated by AuthGuard)
   * @returns Transcription result wrapped in standard data envelope
   */
  @Post("transcribe")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  @UseInterceptors(FileInterceptor("file"))
  async transcribe(
    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
    @Body() dto: TranscribeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<{ data: TranscriptionResult }> {
    // The uploaded file's MIME type is always forwarded; every other option is
    // copied only when the client supplied it, so absent fields stay absent
    // rather than being passed on as explicit `undefined` values.
    const options: TranscribeOptions = { mimeType: file.mimetype };
    if (dto.language !== undefined) options.language = dto.language;
    if (dto.model !== undefined) options.model = dto.model;
    if (dto.prompt !== undefined) options.prompt = dto.prompt;
    if (dto.temperature !== undefined) options.temperature = dto.temperature;

    const result = await this.speechService.transcribe(file.buffer, options);
    return { data: result };
  }

  /**
   * GET /api/speech/health
   *
   * Check availability of STT and TTS providers.
   *
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @returns Health status of STT and TTS providers
   */
  @Get("health")
  @RequirePermission(Permission.WORKSPACE_ANY)
  health(@Workspace() _workspaceId: string): SpeechHealthResponse {
    return {
      data: {
        stt: { available: this.speechService.isSTTAvailable() },
        tts: { available: this.speechService.isTTSAvailable() },
      },
    };
  }

  /**
   * POST /api/speech/synthesize
   *
   * Synthesize text to audio using TTS providers.
   * Accepts JSON body with text and optional voice/format/speed/tier parameters.
   * Returns audio binary with appropriate Content-Type and Content-Disposition headers.
   *
   * Provider selection follows fallback chain: requested tier -> default -> fallback.
   *
   * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?)
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @param _user - Authenticated user (validated by AuthGuard)
   * @returns StreamableFile containing synthesized audio
   *
   * Issue #396
   */
  @Post("synthesize")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  async synthesize(
    @Body() dto: SynthesizeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<StreamableFile> {
    // Copy only the options the client actually provided.
    const options: SynthesizeOptions = {};
    if (dto.voice !== undefined) options.voice = dto.voice;
    if (dto.speed !== undefined) options.speed = dto.speed;
    if (dto.format !== undefined) options.format = dto.format;
    if (dto.tier !== undefined) options.tier = dto.tier;

    const result = await this.speechService.synthesize(dto.text, options);

    // Headers are derived from the format reported in the result, not from
    // the format requested in the DTO.
    const mimeType = AUDIO_FORMAT_MIME_TYPES[result.format];

    return new StreamableFile(result.audio, {
      type: mimeType,
      disposition: `attachment; filename="speech.${result.format}"`,
      length: result.audio.length,
    });
  }

  /**
   * GET /api/speech/voices
   *
   * List available TTS voices across all tiers.
   * Optionally filter by tier using the `tier` query parameter.
   *
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @param tier - Optional tier filter (default, premium, fallback)
   * @returns Voice information array wrapped in standard data envelope
   *
   * Issue #396
   */
  @Get("voices")
  @RequirePermission(Permission.WORKSPACE_ANY)
  async getVoices(
    @Workspace() _workspaceId: string,
    @Query("tier") tier?: SpeechTier
  ): Promise<{ data: VoiceInfo[] }> {
    const voices = await this.speechService.listVoices(tier);
    return { data: voices };
  }
}