feat(#392): create /api/speech/transcribe REST endpoint
All checks were successful
ci/woodpecker/push/api Pipeline was successful
All checks were successful
ci/woodpecker/push/api Pipeline was successful
Add SpeechController with POST /api/speech/transcribe for audio
transcription and GET /api/speech/health for provider status.
Uses AudioValidationPipe for file upload validation and returns
results in standard { data: T } envelope.
Includes 10 unit tests covering transcribe with options, error
propagation, and all health status combinations.
Fixes #392
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
193
apps/api/src/speech/speech.controller.ts
Normal file
193
apps/api/src/speech/speech.controller.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
/**
 * SpeechController
 *
 * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services.
 * Handles audio file uploads for transcription, text-to-speech synthesis,
 * voice listing, and provider health status.
 *
 * Endpoints:
 * - POST /api/speech/transcribe - Transcribe an uploaded audio file to text
 * - POST /api/speech/synthesize - Synthesize text to audio (TTS)
 * - GET  /api/speech/voices     - List available TTS voices
 * - GET  /api/speech/health     - Check STT/TTS provider availability
 *
 * Issues #392, #396
 */
|
||||
|
||||
import {
|
||||
Controller,
|
||||
Post,
|
||||
Get,
|
||||
Body,
|
||||
Query,
|
||||
UseGuards,
|
||||
UseInterceptors,
|
||||
UploadedFile,
|
||||
StreamableFile,
|
||||
} from "@nestjs/common";
|
||||
import { FileInterceptor } from "@nestjs/platform-express";
|
||||
import { SpeechService } from "./speech.service";
|
||||
import { TranscribeDto } from "./dto/transcribe.dto";
|
||||
import { SynthesizeDto } from "./dto/synthesize.dto";
|
||||
import { AudioValidationPipe } from "./pipes/audio-validation.pipe";
|
||||
import { AuthGuard } from "../auth/guards/auth.guard";
|
||||
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
|
||||
import { Workspace, Permission, RequirePermission } from "../common/decorators";
|
||||
import { CurrentUser } from "../auth/decorators/current-user.decorator";
|
||||
import type { AuthenticatedUser } from "../common/types/user.types";
|
||||
import type {
|
||||
AudioFormat,
|
||||
SynthesizeOptions,
|
||||
TranscribeOptions,
|
||||
TranscriptionResult,
|
||||
VoiceInfo,
|
||||
SpeechTier,
|
||||
} from "./interfaces/speech-types";
|
||||
|
||||
/**
 * Lookup table from a synthesized audio format to the MIME type that the
 * synthesize endpoint passes as the `type` of its StreamableFile response.
 *
 * Typed as `Record<AudioFormat, string>`, so the compiler requires an entry
 * for every member of `AudioFormat`.
 *
 * NOTE(review): "audio/pcm" does not appear to be an IANA-registered media
 * type — confirm that consumers of the endpoint accept it.
 */
const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
  // Compressed formats
  mp3: "audio/mpeg",
  // Uncompressed container
  wav: "audio/wav",
  opus: "audio/opus",
  flac: "audio/flac",
  aac: "audio/aac",
  // Raw samples
  pcm: "audio/pcm",
};
|
||||
|
||||
/**
 * Availability report for one category of speech provider (STT or TTS),
 * as returned inside the health endpoint's response envelope.
 */
interface ProviderHealth {
  /** Whether the speech service reports this provider category as available. */
  available: boolean;
}
|
||||
|
||||
/**
 * Response body of the health endpoint: one {@link ProviderHealth} entry per
 * provider category, wrapped in the `{ data: ... }` envelope.
 *
 * - `data.stt` — speech-to-text availability
 * - `data.tts` — text-to-speech availability
 */
interface SpeechHealthResponse {
  data: Record<"stt" | "tts", ProviderHealth>;
}
|
||||
|
||||
@Controller("speech")
@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
export class SpeechController {
  constructor(private readonly speechService: SpeechService) {}

  /**
   * POST /api/speech/transcribe
   *
   * Converts an uploaded audio file into text. The request is multipart form
   * data: the audio travels in the `file` field, and the remaining fields are
   * bound to the DTO as optional transcription settings.
   *
   * Only the settings the caller actually supplied are forwarded to the
   * service; absent ones are left off the options object entirely rather than
   * being passed as `undefined`.
   *
   * @param file - Uploaded audio file, passed through AudioValidationPipe
   * @param dto - Optional settings (language, model, prompt, temperature)
   * @param _workspaceId - Workspace context from the Workspace decorator (unused here)
   * @param _user - Authenticated user from the CurrentUser decorator (unused here)
   * @returns The service's transcription result in a `{ data }` envelope
   */
  @Post("transcribe")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  @UseInterceptors(FileInterceptor("file"))
  async transcribe(
    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
    @Body() dto: TranscribeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<{ data: TranscriptionResult }> {
    // The uploaded file's MIME type is always forwarded.
    const transcribeOptions: TranscribeOptions = { mimeType: file.mimetype };

    const { language, model, prompt, temperature } = dto;
    if (language !== undefined) {
      transcribeOptions.language = language;
    }
    if (model !== undefined) {
      transcribeOptions.model = model;
    }
    if (prompt !== undefined) {
      transcribeOptions.prompt = prompt;
    }
    if (temperature !== undefined) {
      transcribeOptions.temperature = temperature;
    }

    const transcription = await this.speechService.transcribe(file.buffer, transcribeOptions);
    return { data: transcription };
  }

  /**
   * GET /api/speech/health
   *
   * Reports whether the speech service currently considers its STT and TTS
   * providers available.
   *
   * @param _workspaceId - Workspace context from the Workspace decorator (unused here)
   * @returns Availability flags for STT and TTS in a `{ data }` envelope
   */
  @Get("health")
  @RequirePermission(Permission.WORKSPACE_ANY)
  health(@Workspace() _workspaceId: string): SpeechHealthResponse {
    const sttAvailable = this.speechService.isSTTAvailable();
    const ttsAvailable = this.speechService.isTTSAvailable();

    return {
      data: {
        stt: { available: sttAvailable },
        tts: { available: ttsAvailable },
      },
    };
  }

  /**
   * POST /api/speech/synthesize
   *
   * Turns text into audio. The JSON body carries the text plus optional
   * voice, speed, format and tier settings; only the settings that were
   * supplied are forwarded to the service.
   *
   * The audio is returned as a binary download: the MIME type is looked up
   * from the format the service reports for the result, and the suggested
   * file name is `speech.<format>`.
   *
   * Provider selection is delegated to SpeechService (the endpoint's original
   * notes describe a fallback chain: requested tier -> default -> fallback).
   *
   * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?)
   * @param _workspaceId - Workspace context from the Workspace decorator (unused here)
   * @param _user - Authenticated user from the CurrentUser decorator (unused here)
   * @returns StreamableFile wrapping the synthesized audio
   *
   * Issue #396
   */
  @Post("synthesize")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  async synthesize(
    @Body() dto: SynthesizeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<StreamableFile> {
    const { text, voice, speed, format, tier } = dto;

    const synthesizeOptions: SynthesizeOptions = {};
    if (voice !== undefined) {
      synthesizeOptions.voice = voice;
    }
    if (speed !== undefined) {
      synthesizeOptions.speed = speed;
    }
    if (format !== undefined) {
      synthesizeOptions.format = format;
    }
    if (tier !== undefined) {
      synthesizeOptions.tier = tier;
    }

    // Headers are derived from the format the service reports for the result,
    // not from the format that was requested.
    const { audio, format: outputFormat } = await this.speechService.synthesize(
      text,
      synthesizeOptions
    );

    return new StreamableFile(audio, {
      type: AUDIO_FORMAT_MIME_TYPES[outputFormat],
      disposition: `attachment; filename="speech.${outputFormat}"`,
      length: audio.length,
    });
  }

  /**
   * GET /api/speech/voices
   *
   * Lists the TTS voices known to the speech service, optionally narrowed to
   * one tier through the `tier` query parameter.
   *
   * NOTE(review): `tier` is taken from the query string and handed to the
   * service as-is; no runtime validation is visible in this controller —
   * confirm that unexpected values are handled downstream.
   *
   * @param _workspaceId - Workspace context from the Workspace decorator (unused here)
   * @param tier - Optional tier filter
   * @returns Voice list in a `{ data }` envelope
   *
   * Issue #396
   */
  @Get("voices")
  @RequirePermission(Permission.WORKSPACE_ANY)
  async getVoices(
    @Workspace() _workspaceId: string,
    @Query("tier") tier?: SpeechTier
  ): Promise<{ data: VoiceInfo[] }> {
    const availableVoices = await this.speechService.listVoices(tier);
    return { data: availableVoices };
  }
}
|
||||
Reference in New Issue
Block a user