feat(#392): create /api/speech/transcribe REST endpoint

Add SpeechController with POST /api/speech/transcribe for audio transcription and GET /api/speech/health for provider status. The controller uses AudioValidationPipe for file upload validation and returns results in the standard { data: T } envelope. Includes 10 unit tests covering transcription with options, error propagation, and all health status combinations. Fixes #392. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:47:52 -06:00
parent 6c465566f6
commit 527262af38
3 changed files with 632 additions and 0 deletions
--- a/apps/api/src/speech/speech.controller.ts
+++ b/apps/api/src/speech/speech.controller.ts
@@ -0,0 +1,193 @@
+/**
+ * SpeechController
+ *
+ * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services.
+ * Handles audio file uploads for transcription, text-to-speech synthesis,
+ * voice listing, and provider health status.
+ *
+ * Endpoints:
+ * - POST /api/speech/transcribe   - Transcribe uploaded audio file to text
+ * - POST /api/speech/synthesize   - Synthesize text to audio (TTS)
+ * - GET  /api/speech/voices       - List available TTS voices
+ * - GET  /api/speech/health       - Check STT/TTS provider availability
+ *
+ * Issue #392, #396
+ */
+
+import {
+  Controller,
+  Post,
+  Get,
+  Body,
+  Query,
+  UseGuards,
+  UseInterceptors,
+  UploadedFile,
+  StreamableFile,
+} from "@nestjs/common";
+import { FileInterceptor } from "@nestjs/platform-express";
+import { SpeechService } from "./speech.service";
+import { TranscribeDto } from "./dto/transcribe.dto";
+import { SynthesizeDto } from "./dto/synthesize.dto";
+import { AudioValidationPipe } from "./pipes/audio-validation.pipe";
+import { AuthGuard } from "../auth/guards/auth.guard";
+import { WorkspaceGuard, PermissionGuard } from "../common/guards";
+import { Workspace, Permission, RequirePermission } from "../common/decorators";
+import { CurrentUser } from "../auth/decorators/current-user.decorator";
+import type { AuthenticatedUser } from "../common/types/user.types";
+import type {
+  AudioFormat,
+  SynthesizeOptions,
+  TranscribeOptions,
+  TranscriptionResult,
+  VoiceInfo,
+  SpeechTier,
+} from "./interfaces/speech-types";
+
+/**
+ * Lookup table from audio format to the MIME type string used for the
+ * Content-Type of synthesized audio. The `synthesize` handler indexes this
+ * table with the format reported by the speech service and passes the value
+ * as the `type` of the returned StreamableFile.
+ *
+ * Typed as Record<AudioFormat, string>. Presumably AudioFormat is a union of
+ * exactly these six literals, in which case the compiler requires one entry
+ * per format; confirm against ./interfaces/speech-types.
+ */
+const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
+  mp3: "audio/mpeg",
+  wav: "audio/wav",
+  opus: "audio/opus",
+  flac: "audio/flac",
+  aac: "audio/aac",
+  pcm: "audio/pcm",
+};
+
+/**
+ * Health status for a single speech provider category (STT or TTS).
+ *
+ * `available` carries the boolean returned by the speech service's
+ * availability check for that category (`isSTTAvailable()` or
+ * `isTTSAvailable()` in the health endpoint of this file).
+ */
+interface ProviderHealth {
+  available: boolean;
+}
+
+/**
+ * Combined health status response for all speech providers.
+ *
+ * This is the return shape of the GET health endpoint in this file. The
+ * payload sits under a top-level `data` key, with one ProviderHealth entry
+ * for speech-to-text (`stt`) and one for text-to-speech (`tts`).
+ */
+interface SpeechHealthResponse {
+  data: {
+    stt: ProviderHealth;
+    tts: ProviderHealth;
+  };
+}
+
+@Controller("speech")
+@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
+export class SpeechController {
+  constructor(private readonly speechService: SpeechService) {}
+
+  /**
+   * POST /api/speech/transcribe
+   *
+   * Turns an uploaded audio file into text. The request is multipart form
+   * data: the audio arrives in the "file" field and the remaining fields
+   * carry optional transcription settings.
+   *
+   * @param file - Uploaded audio file, passed through AudioValidationPipe before the handler runs
+   * @param dto - Optional transcription settings (language, model, prompt, temperature)
+   * @param _workspaceId - Workspace context from the @Workspace() decorator; not read by the handler
+   * @param _user - Authenticated user from the @CurrentUser() decorator; not read by the handler
+   * @returns The service's transcription result under a `data` key
+   */
+  @Post("transcribe")
+  @RequirePermission(Permission.WORKSPACE_MEMBER)
+  @UseInterceptors(FileInterceptor("file"))
+  async transcribe(
+    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
+    @Body() dto: TranscribeDto,
+    @Workspace() _workspaceId: string,
+    @CurrentUser() _user: AuthenticatedUser
+  ): Promise<{ data: TranscriptionResult }> {
+    // The upload's MIME type is always forwarded; everything else is opt-in.
+    const transcribeOptions: TranscribeOptions = { mimeType: file.mimetype };
+
+    // Copy only the settings the client supplied so absent keys stay absent
+    // instead of being forwarded as explicit `undefined` values.
+    const { language, model, prompt, temperature } = dto;
+    if (language !== undefined) {
+      transcribeOptions.language = language;
+    }
+    if (model !== undefined) {
+      transcribeOptions.model = model;
+    }
+    if (prompt !== undefined) {
+      transcribeOptions.prompt = prompt;
+    }
+    if (temperature !== undefined) {
+      transcribeOptions.temperature = temperature;
+    }
+
+    const transcription = await this.speechService.transcribe(file.buffer, transcribeOptions);
+    return { data: transcription };
+  }
+
+  /**
+   * GET /api/speech/health
+   *
+   * Reports whether the speech service currently considers its STT and TTS
+   * capabilities available.
+   *
+   * @param _workspaceId - Workspace context from the @Workspace() decorator; not read by the handler
+   * @returns Availability flags for STT and TTS under a `data` key
+   */
+  @Get("health")
+  @RequirePermission(Permission.WORKSPACE_ANY)
+  health(@Workspace() _workspaceId: string): SpeechHealthResponse {
+    const stt = { available: this.speechService.isSTTAvailable() };
+    const tts = { available: this.speechService.isTTSAvailable() };
+
+    return { data: { stt, tts } };
+  }
+
+  /**
+   * POST /api/speech/synthesize
+   *
+   * Converts text to audio through the speech service. The JSON body holds
+   * the text plus optional voice, speed, format and tier settings. The
+   * response is the audio binary, with Content-Type taken from the format the
+   * service reports and a Content-Disposition that names the download
+   * `speech.<format>`.
+   *
+   * Provider selection is delegated entirely to SpeechService.
+   *
+   * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?)
+   * @param _workspaceId - Workspace context from the @Workspace() decorator; not read by the handler
+   * @param _user - Authenticated user from the @CurrentUser() decorator; not read by the handler
+   * @returns StreamableFile wrapping the synthesized audio
+   *
+   * Issue #396
+   */
+  @Post("synthesize")
+  @RequirePermission(Permission.WORKSPACE_MEMBER)
+  async synthesize(
+    @Body() dto: SynthesizeDto,
+    @Workspace() _workspaceId: string,
+    @CurrentUser() _user: AuthenticatedUser
+  ): Promise<StreamableFile> {
+    const synthesizeOptions: SynthesizeOptions = {};
+
+    // Forward only the settings present in the request body.
+    const { voice, speed, format, tier } = dto;
+    if (voice !== undefined) {
+      synthesizeOptions.voice = voice;
+    }
+    if (speed !== undefined) {
+      synthesizeOptions.speed = speed;
+    }
+    if (format !== undefined) {
+      synthesizeOptions.format = format;
+    }
+    if (tier !== undefined) {
+      synthesizeOptions.tier = tier;
+    }
+
+    const synthesis = await this.speechService.synthesize(dto.text, synthesizeOptions);
+    const { audio, format: outputFormat } = synthesis;
+
+    // Headers are derived from the format the service actually produced,
+    // not from the format that was requested.
+    return new StreamableFile(audio, {
+      type: AUDIO_FORMAT_MIME_TYPES[outputFormat],
+      disposition: `attachment; filename="speech.${outputFormat}"`,
+      length: audio.length,
+    });
+  }
+
+  /**
+   * GET /api/speech/voices
+   *
+   * Lists the TTS voices known to the speech service. When the `tier` query
+   * parameter is present it is handed to the service as a filter; the value
+   * is not validated in this handler.
+   *
+   * @param _workspaceId - Workspace context from the @Workspace() decorator; not read by the handler
+   * @param tier - Optional tier filter taken from the `tier` query parameter
+   * @returns Array of voice descriptions under a `data` key
+   *
+   * Issue #396
+   */
+  @Get("voices")
+  @RequirePermission(Permission.WORKSPACE_ANY)
+  async getVoices(
+    @Workspace() _workspaceId: string,
+    @Query("tier") tier?: SpeechTier
+  ): Promise<{ data: VoiceInfo[] }> {
+    return { data: await this.speechService.listVoices(tier) };
+  }
+}