feat(#392): create /api/speech/transcribe REST endpoint

Add SpeechController with POST /api/speech/transcribe for audio transcription and GET /api/speech/health for provider status. Uses AudioValidationPipe for file upload validation and returns results in standard { data: T } envelope. Includes 10 unit tests covering transcribe with options, error propagation, and all health status combinations. Fixes #392 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 02:47:52 -06:00
parent 6c465566f6
commit 527262af38
3 changed files with 632 additions and 0 deletions
--- a/apps/api/src/speech/speech.controller.spec.ts
+++ b/apps/api/src/speech/speech.controller.spec.ts
@@ -0,0 +1,437 @@
 import { describe, it, expect, beforeEach, vi } from "vitest";
 import { StreamableFile, ServiceUnavailableException } from "@nestjs/common";
 import { SpeechController } from "./speech.controller";
 import { SpeechService } from "./speech.service";
 import type { TranscribeDto } from "./dto/transcribe.dto";
 import type { SynthesizeDto } from "./dto/synthesize.dto";
 import type { TranscriptionResult, SynthesisResult, VoiceInfo } from "./interfaces/speech-types";
 describe("SpeechController", () => {
  let controller: SpeechController;
  let service: SpeechService;

  // Stand-in for SpeechService: one vi.fn() per service method the controller calls.
  const mockSpeechService = {
    transcribe: vi.fn(),
    synthesize: vi.fn(),
    listVoices: vi.fn(),
    isSTTAvailable: vi.fn(),
    isTTSAvailable: vi.fn(),
  };

  // Fixed identifiers passed as the workspace/user arguments of every endpoint call.
  const mockWorkspaceId = "550e8400-e29b-41d4-a716-446655440001";
  const mockUserId = "550e8400-e29b-41d4-a716-446655440002";
  const mockUser = {
    id: mockUserId,
    email: "test@example.com",
    name: "Test User",
    workspaceId: mockWorkspaceId,
  };

  // In-memory upload fixture. The controller's transcribe() reads only `buffer`
  // and `mimetype`; the remaining fields exist to satisfy Express.Multer.File.
  const mockFile: Express.Multer.File = {
    buffer: Buffer.from("fake-audio-data"),
    mimetype: "audio/wav",
    size: 1024,
    originalname: "test.wav",
    fieldname: "file",
    encoding: "7bit",
    // No readable stream is needed by these tests.
    stream: null as never,
    destination: "",
    filename: "",
    path: "",
  };

  // Canned service response returned by the stubbed transcribe() in the tests below.
  const mockTranscriptionResult: TranscriptionResult = {
    text: "Hello, world!",
    language: "en",
    durationSeconds: 2.5,
    confidence: 0.95,
  };

  beforeEach(() => {
    // resetAllMocks (not clearAllMocks) also drops stubbed implementations, so a
    // return value or rejection configured by one test cannot leak into the next.
    vi.resetAllMocks();
    service = mockSpeechService as unknown as SpeechService;
    controller = new SpeechController(service);
  });

  it("should be defined", () => {
    expect(controller).toBeDefined();
  });
  describe("transcribe", () => {
    /** Stubs a successful transcription, then calls the endpoint with `dto`. */
    const transcribeWith = async (dto: TranscribeDto) => {
      mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);
      return controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser);
    };

    it("should transcribe audio file and return data wrapper", async () => {
      const response = await transcribeWith({});

      expect(response).toEqual({ data: mockTranscriptionResult });
      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
        mimeType: "audio/wav",
      });
    });

    // Each DTO below must reach the service unchanged, with the upload's MIME type added.
    const passThroughCases: { title: string; dto: TranscribeDto }[] = [
      {
        title: "should pass language override from DTO to service",
        dto: { language: "fr" },
      },
      {
        title: "should pass model override from DTO to service",
        dto: { model: "whisper-large-v3" },
      },
      {
        title: "should pass all DTO options to service",
        dto: {
          language: "de",
          model: "whisper-large-v3",
          prompt: "Meeting notes",
          temperature: 0.5,
        },
      },
    ];

    for (const { title, dto } of passThroughCases) {
      it(title, async () => {
        await transcribeWith(dto);

        expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
          ...dto,
          mimeType: "audio/wav",
        });
      });
    }

    it("should propagate service errors", async () => {
      mockSpeechService.transcribe.mockRejectedValue(new Error("STT unavailable"));

      const pending = controller.transcribe(mockFile, {}, mockWorkspaceId, mockUser);

      await expect(pending).rejects.toThrow("STT unavailable");
    });
  });
  describe("health", () => {
    // controller.health() is synchronous (it returns the response object, not a
    // Promise), so the tests call it directly instead of awaiting a non-thenable.
    // The four STT/TTS availability combinations share one table-driven body.
    const availabilityCases: { title: string; stt: boolean; tts: boolean }[] = [
      {
        title: "should return health status with both providers available",
        stt: true,
        tts: true,
      },
      {
        title: "should return health status with STT unavailable",
        stt: false,
        tts: true,
      },
      {
        title: "should return health status with TTS unavailable",
        stt: true,
        tts: false,
      },
      {
        title: "should return health status with both providers unavailable",
        stt: false,
        tts: false,
      },
    ];

    for (const { title, stt, tts } of availabilityCases) {
      it(title, () => {
        mockSpeechService.isSTTAvailable.mockReturnValue(stt);
        mockSpeechService.isTTSAvailable.mockReturnValue(tts);

        const result = controller.health(mockWorkspaceId);

        expect(result).toEqual({
          data: {
            stt: { available: stt },
            tts: { available: tts },
          },
        });
      });
    }
  });
  // ==============================================
  // POST /api/speech/synthesize (Issue #396)
  // ==============================================
  describe("synthesize", () => {
    const audioBytes = Buffer.from("fake-audio-data");

    // Baseline service response; individual tests override `format` where needed.
    const mp3Result: SynthesisResult = {
      audio: audioBytes,
      format: "mp3",
      voice: "af_heart",
      tier: "default",
      durationSeconds: 2.5,
    };

    /** Stubs the service to resolve with `serviceResult`, then calls the endpoint with `dto`. */
    const synthesizeWith = async (dto: SynthesizeDto, serviceResult: SynthesisResult) => {
      mockSpeechService.synthesize.mockResolvedValue(serviceResult);
      return controller.synthesize(dto, mockWorkspaceId, mockUser);
    };

    it("should synthesize text and return a StreamableFile", async () => {
      const file = await synthesizeWith({ text: "Hello world" }, mp3Result);

      expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Hello world", {});
      expect(file).toBeInstanceOf(StreamableFile);
    });

    it("should pass voice, speed, format, and tier options to the service", async () => {
      const premiumWav: SynthesisResult = {
        audio: audioBytes,
        format: "wav",
        voice: "af_heart",
        tier: "premium",
      };

      const file = await synthesizeWith(
        {
          text: "Test with options",
          voice: "af_heart",
          speed: 1.5,
          format: "wav",
          tier: "premium",
        },
        premiumWav
      );

      expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Test with options", {
        voice: "af_heart",
        speed: 1.5,
        format: "wav",
        tier: "premium",
      });
      expect(file).toBeInstanceOf(StreamableFile);
    });

    it("should set correct Content-Type for mp3 format", async () => {
      const file = await synthesizeWith({ text: "Hello", format: "mp3" }, mp3Result);

      expect(file).toBeInstanceOf(StreamableFile);
      expect(file.getHeaders().type).toBe("audio/mpeg");
    });

    // Remaining formats: the service reports `format`, the response must carry `mime`.
    const contentTypeCases: {
      title: string;
      format: SynthesisResult["format"];
      mime: string;
    }[] = [
      { title: "should set correct Content-Type for wav format", format: "wav", mime: "audio/wav" },
      {
        title: "should set correct Content-Type for opus format",
        format: "opus",
        mime: "audio/opus",
      },
      {
        title: "should set correct Content-Type for flac format",
        format: "flac",
        mime: "audio/flac",
      },
      { title: "should set correct Content-Type for aac format", format: "aac", mime: "audio/aac" },
      { title: "should set correct Content-Type for pcm format", format: "pcm", mime: "audio/pcm" },
    ];

    for (const { title, format, mime } of contentTypeCases) {
      it(title, async () => {
        const file = await synthesizeWith({ text: "Hello" }, { ...mp3Result, format });

        expect(file.getHeaders().type).toBe(mime);
      });
    }

    it("should set Content-Disposition header for download with correct extension", async () => {
      const file = await synthesizeWith({ text: "Hello" }, mp3Result);
      const { disposition } = file.getHeaders();

      expect(disposition).toContain("attachment");
      expect(disposition).toContain("speech.mp3");
    });

    it("should set Content-Disposition with correct file extension for wav", async () => {
      const file = await synthesizeWith({ text: "Hello" }, { ...mp3Result, format: "wav" });

      expect(file.getHeaders().disposition).toContain("speech.wav");
    });

    it("should set Content-Length header based on audio buffer size", async () => {
      const file = await synthesizeWith({ text: "Hello" }, mp3Result);

      expect(file.getHeaders().length).toBe(audioBytes.length);
    });

    it("should propagate ServiceUnavailableException from service", async () => {
      mockSpeechService.synthesize.mockRejectedValue(
        new ServiceUnavailableException("No TTS providers are available")
      );

      const pending = controller.synthesize({ text: "Hello" }, mockWorkspaceId, mockUser);

      await expect(pending).rejects.toThrow(ServiceUnavailableException);
    });
  });
  // ==============================================
  // GET /api/speech/voices (Issue #396)
  // ==============================================
  describe("getVoices", () => {
    // Two default-tier voices and one premium-tier voice.
    const heartVoice: VoiceInfo = {
      id: "af_heart",
      name: "Heart",
      language: "en",
      tier: "default",
      isDefault: true,
    };
    const skyVoice: VoiceInfo = {
      id: "af_sky",
      name: "Sky",
      language: "en",
      tier: "default",
      isDefault: false,
    };
    const chatterboxVoice: VoiceInfo = {
      id: "chatterbox-voice",
      name: "Chatterbox Default",
      language: "en",
      tier: "premium",
      isDefault: true,
    };
    const allVoices: VoiceInfo[] = [heartVoice, skyVoice, chatterboxVoice];

    /** Makes the stubbed service resolve with `voices` on the next listVoices call(s). */
    const serviceReturns = (voices: VoiceInfo[]): void => {
      mockSpeechService.listVoices.mockResolvedValue(voices);
    };

    it("should return all voices when no tier filter is provided", async () => {
      serviceReturns(allVoices);

      const response = await controller.getVoices(mockWorkspaceId);

      expect(mockSpeechService.listVoices).toHaveBeenCalledWith(undefined);
      expect(response).toEqual({ data: allVoices });
    });

    // The controller forwards the tier verbatim and wraps whatever the service returns.
    const tierCases = [
      { title: "should filter voices by default tier", tier: "default" },
      { title: "should filter voices by premium tier", tier: "premium" },
    ] as const;

    for (const { title, tier } of tierCases) {
      it(title, async () => {
        const matching = allVoices.filter((voice) => voice.tier === tier);
        serviceReturns(matching);

        const response = await controller.getVoices(mockWorkspaceId, tier);

        expect(mockSpeechService.listVoices).toHaveBeenCalledWith(tier);
        expect(response).toEqual({ data: matching });
      });
    }

    it("should return empty array when no voices are available", async () => {
      serviceReturns([]);

      const response = await controller.getVoices(mockWorkspaceId);

      expect(response).toEqual({ data: [] });
    });

    it("should return empty array when filtering by tier with no matching voices", async () => {
      serviceReturns([]);

      const response = await controller.getVoices(mockWorkspaceId, "fallback");

      expect(mockSpeechService.listVoices).toHaveBeenCalledWith("fallback");
      expect(response).toEqual({ data: [] });
    });
  });
 });
--- a/apps/api/src/speech/speech.controller.ts
+++ b/apps/api/src/speech/speech.controller.ts
@@ -0,0 +1,193 @@
 /**
 * SpeechController
 *
 * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services.
 * Handles audio file uploads for transcription, text-to-speech synthesis,
 * voice listing, and provider health status.
 *
 * Endpoints:
 * - POST /api/speech/transcribe   - Transcribe uploaded audio file to text
 * - POST /api/speech/synthesize   - Synthesize text to audio (TTS)
 * - GET  /api/speech/voices       - List available TTS voices
 * - GET  /api/speech/health       - Check STT/TTS provider availability
 *
 * Issue #392, #396
 */
 import {
  Controller,
  Post,
  Get,
  Body,
  Query,
  UseGuards,
  UseInterceptors,
  UploadedFile,
  StreamableFile,
  BadRequestException,
 } from "@nestjs/common";
 import { FileInterceptor } from "@nestjs/platform-express";
 import { SpeechService } from "./speech.service";
 import { TranscribeDto } from "./dto/transcribe.dto";
 import { SynthesizeDto } from "./dto/synthesize.dto";
 import { AudioValidationPipe } from "./pipes/audio-validation.pipe";
 import { AuthGuard } from "../auth/guards/auth.guard";
 import { WorkspaceGuard, PermissionGuard } from "../common/guards";
 import { Workspace, Permission, RequirePermission } from "../common/decorators";
 import { CurrentUser } from "../auth/decorators/current-user.decorator";
 import type { AuthenticatedUser } from "../common/types/user.types";
 import type {
  AudioFormat,
  SynthesizeOptions,
  TranscribeOptions,
  TranscriptionResult,
  VoiceInfo,
  SpeechTier,
 } from "./interfaces/speech-types";
 /**
 * Lookup table from a synthesized audio format to the MIME type used as the
 * response Content-Type.
 */
 const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
  aac: "audio/aac",
  flac: "audio/flac",
  mp3: "audio/mpeg",
  opus: "audio/opus",
  pcm: "audio/pcm",
  wav: "audio/wav",
 };
 /**
 * Availability flag reported for one speech provider category.
 */
 type ProviderHealth = {
  available: boolean;
 };
 /**
 * Payload returned by the health endpoint: one availability entry for
 * speech-to-text (`stt`) and one for text-to-speech (`tts`), wrapped in `data`.
 */
 interface SpeechHealthResponse {
  data: Record<"stt" | "tts", ProviderHealth>;
 }
@Controller("speech")
@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
 export class SpeechController {
  /**
   * Tier values accepted by the `tier` query parameter of GET /api/speech/voices.
   * NOTE(review): keep in sync with the SpeechTier type in ./interfaces/speech-types.
   */
  private static readonly VOICE_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"];

  constructor(private readonly speechService: SpeechService) {}

  /**
   * POST /api/speech/transcribe
   *
   * Transcribe an uploaded audio file to text.
   * Accepts multipart form data with an audio file (field name `file`) and
   * optional transcription parameters.
   *
   * @param file - Uploaded audio file, passed through AudioValidationPipe
   * @param dto - Optional transcription parameters (language, model, prompt, temperature)
   * @param _workspaceId - Workspace context from the @Workspace() decorator (unused here)
   * @param _user - Authenticated user from the @CurrentUser() decorator (unused here)
   * @returns Transcription result wrapped in standard data envelope
   */
  @Post("transcribe")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  @UseInterceptors(FileInterceptor("file"))
  async transcribe(
    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
    @Body() dto: TranscribeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<{ data: TranscriptionResult }> {
    // The upload's MIME type is always forwarded; DTO fields are copied only when
    // present so the service never receives explicit `undefined` overrides.
    const options: TranscribeOptions = { mimeType: file.mimetype };
    if (dto.language !== undefined) options.language = dto.language;
    if (dto.model !== undefined) options.model = dto.model;
    if (dto.prompt !== undefined) options.prompt = dto.prompt;
    if (dto.temperature !== undefined) options.temperature = dto.temperature;

    const result = await this.speechService.transcribe(file.buffer, options);
    return { data: result };
  }

  /**
   * GET /api/speech/health
   *
   * Report availability of STT and TTS providers as seen by SpeechService.
   * Synchronous: it only reads the service's availability flags.
   *
   * @param _workspaceId - Workspace context from the @Workspace() decorator (unused here)
   * @returns Health status of STT and TTS providers
   */
  @Get("health")
  @RequirePermission(Permission.WORKSPACE_ANY)
  health(@Workspace() _workspaceId: string): SpeechHealthResponse {
    return {
      data: {
        stt: { available: this.speechService.isSTTAvailable() },
        tts: { available: this.speechService.isTTSAvailable() },
      },
    };
  }

  /**
   * POST /api/speech/synthesize
   *
   * Synthesize text to audio using TTS providers.
   * Accepts JSON body with text and optional voice/format/speed/tier parameters.
   * Returns audio binary with Content-Type, Content-Disposition and Content-Length
   * derived from the service result.
   *
   * Provider selection is delegated to SpeechService.
   * NOTE(review): earlier docs describe a fallback chain
   * (requested tier -> default -> fallback) — confirm against the service.
   *
   * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?)
   * @param _workspaceId - Workspace context from the @Workspace() decorator (unused here)
   * @param _user - Authenticated user from the @CurrentUser() decorator (unused here)
   * @returns StreamableFile containing synthesized audio
   *
   * Issue #396
   */
  @Post("synthesize")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  async synthesize(
    @Body() dto: SynthesizeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<StreamableFile> {
    // Copy only the overrides the caller supplied; an empty object means "service defaults".
    const options: SynthesizeOptions = {};
    if (dto.voice !== undefined) options.voice = dto.voice;
    if (dto.speed !== undefined) options.speed = dto.speed;
    if (dto.format !== undefined) options.format = dto.format;
    if (dto.tier !== undefined) options.tier = dto.tier;

    const result = await this.speechService.synthesize(dto.text, options);

    // Headers follow the format the service actually produced, which is not
    // necessarily the format that was requested.
    const mimeType = AUDIO_FORMAT_MIME_TYPES[result.format];
    return new StreamableFile(result.audio, {
      type: mimeType,
      disposition: `attachment; filename="speech.${result.format}"`,
      length: result.audio.length,
    });
  }

  /**
   * GET /api/speech/voices
   *
   * List available TTS voices across all tiers.
   * Optionally filter by tier using the `tier` query parameter.
   *
   * @param _workspaceId - Workspace context from the @Workspace() decorator (unused here)
   * @param tier - Optional tier filter (default, premium, fallback)
   * @returns Voice information array wrapped in standard data envelope
   * @throws BadRequestException when `tier` is present but not a recognised tier
   *
   * Issue #396
   */
  @Get("voices")
  @RequirePermission(Permission.WORKSPACE_ANY)
  async getVoices(
    @Workspace() _workspaceId: string,
    @Query("tier") tier?: SpeechTier
  ): Promise<{ data: VoiceInfo[] }> {
    // Query strings are untrusted: the SpeechTier annotation is erased at runtime,
    // so an arbitrary string (or an array, for a repeated parameter) can arrive here.
    // Reject anything outside the known tiers before it reaches the service.
    if (tier !== undefined && !SpeechController.VOICE_TIERS.includes(tier)) {
      throw new BadRequestException(
        `Invalid tier "${String(tier)}". Expected one of: ${SpeechController.VOICE_TIERS.join(", ")}`
      );
    }

    const voices = await this.speechService.listVoices(tier);
    return { data: voices };
  }
 }
--- a/apps/api/src/speech/speech.module.ts
+++ b/apps/api/src/speech/speech.module.ts
@@ -31,12 +31,14 @@ import {
  type SpeechConfig,
 } from "./speech.config";
 import { SpeechService } from "./speech.service";
 import { SpeechController } from "./speech.controller";
 import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
 import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
 import { createTTSProviders } from "./providers/tts-provider.factory";
@Module({
  imports: [ConfigModule.forFeature(speechConfig)],
  controllers: [SpeechController],
  providers: [
    SpeechService,
    // STT provider: conditionally register SpeachesSttProvider when STT is enabled