From 527262af3859e5a9a2d061a489afed30c7451b40 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:47:52 -0600 Subject: [PATCH] feat(#392): create /api/speech/transcribe REST endpoint Add SpeechController with POST /api/speech/transcribe for audio transcription and GET /api/speech/health for provider status. Uses AudioValidationPipe for file upload validation and returns results in standard { data: T } envelope. Includes 10 unit tests covering transcribe with options, error propagation, and all health status combinations. Fixes #392 Co-Authored-By: Claude Opus 4.6 --- apps/api/src/speech/speech.controller.spec.ts | 437 ++++++++++++++++++ apps/api/src/speech/speech.controller.ts | 193 ++++++++ apps/api/src/speech/speech.module.ts | 2 + 3 files changed, 632 insertions(+) create mode 100644 apps/api/src/speech/speech.controller.spec.ts create mode 100644 apps/api/src/speech/speech.controller.ts diff --git a/apps/api/src/speech/speech.controller.spec.ts b/apps/api/src/speech/speech.controller.spec.ts new file mode 100644 index 0000000..2db1cf8 --- /dev/null +++ b/apps/api/src/speech/speech.controller.spec.ts @@ -0,0 +1,437 @@ +import { describe, it, expect, beforeEach, vi } from "vitest"; +import { StreamableFile, ServiceUnavailableException } from "@nestjs/common"; +import { SpeechController } from "./speech.controller"; +import { SpeechService } from "./speech.service"; +import type { TranscribeDto } from "./dto/transcribe.dto"; +import type { SynthesizeDto } from "./dto/synthesize.dto"; +import type { TranscriptionResult, SynthesisResult, VoiceInfo } from "./interfaces/speech-types"; + +describe("SpeechController", () => { + let controller: SpeechController; + let service: SpeechService; + + const mockSpeechService = { + transcribe: vi.fn(), + synthesize: vi.fn(), + listVoices: vi.fn(), + isSTTAvailable: vi.fn(), + isTTSAvailable: vi.fn(), + }; + + const mockWorkspaceId = "550e8400-e29b-41d4-a716-446655440001"; + const mockUserId = 
"550e8400-e29b-41d4-a716-446655440002"; + + const mockUser = { + id: mockUserId, + email: "test@example.com", + name: "Test User", + workspaceId: mockWorkspaceId, + }; + + const mockFile: Express.Multer.File = { + buffer: Buffer.from("fake-audio-data"), + mimetype: "audio/wav", + size: 1024, + originalname: "test.wav", + fieldname: "file", + encoding: "7bit", + stream: null as never, + destination: "", + filename: "", + path: "", + }; + + const mockTranscriptionResult: TranscriptionResult = { + text: "Hello, world!", + language: "en", + durationSeconds: 2.5, + confidence: 0.95, + }; + + beforeEach(() => { + service = mockSpeechService as unknown as SpeechService; + controller = new SpeechController(service); + + vi.clearAllMocks(); + }); + + it("should be defined", () => { + expect(controller).toBeDefined(); + }); + + describe("transcribe", () => { + it("should transcribe audio file and return data wrapper", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = {}; + const result = await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(result).toEqual({ data: mockTranscriptionResult }); + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + mimeType: "audio/wav", + }); + }); + + it("should pass language override from DTO to service", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = { language: "fr" }; + await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + language: "fr", + mimeType: "audio/wav", + }); + }); + + it("should pass model override from DTO to service", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = { model: "whisper-large-v3" }; + await controller.transcribe(mockFile, dto, mockWorkspaceId, 
mockUser); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + model: "whisper-large-v3", + mimeType: "audio/wav", + }); + }); + + it("should pass all DTO options to service", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = { + language: "de", + model: "whisper-large-v3", + prompt: "Meeting notes", + temperature: 0.5, + }; + await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + language: "de", + model: "whisper-large-v3", + prompt: "Meeting notes", + temperature: 0.5, + mimeType: "audio/wav", + }); + }); + + it("should propagate service errors", async () => { + mockSpeechService.transcribe.mockRejectedValue(new Error("STT unavailable")); + + const dto: TranscribeDto = {}; + await expect(controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser)).rejects.toThrow( + "STT unavailable" + ); + }); + }); + + describe("health", () => { + it("should return health status with both providers available", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(true); + mockSpeechService.isTTSAvailable.mockReturnValue(true); + + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: true }, + tts: { available: true }, + }, + }); + }); + + it("should return health status with STT unavailable", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(false); + mockSpeechService.isTTSAvailable.mockReturnValue(true); + + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: false }, + tts: { available: true }, + }, + }); + }); + + it("should return health status with TTS unavailable", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(true); + mockSpeechService.isTTSAvailable.mockReturnValue(false); + + const result = await 
controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: true }, + tts: { available: false }, + }, + }); + }); + + it("should return health status with both providers unavailable", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(false); + mockSpeechService.isTTSAvailable.mockReturnValue(false); + + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: false }, + tts: { available: false }, + }, + }); + }); + }); + + // ============================================== + // POST /api/speech/synthesize (Issue #396) + // ============================================== + + describe("synthesize", () => { + const mockAudioBuffer = Buffer.from("fake-audio-data"); + + const mockSynthesisResult: SynthesisResult = { + audio: mockAudioBuffer, + format: "mp3", + voice: "af_heart", + tier: "default", + durationSeconds: 2.5, + }; + + it("should synthesize text and return a StreamableFile", async () => { + const dto: SynthesizeDto = { text: "Hello world" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Hello world", {}); + expect(result).toBeInstanceOf(StreamableFile); + }); + + it("should pass voice, speed, format, and tier options to the service", async () => { + const dto: SynthesizeDto = { + text: "Test with options", + voice: "af_heart", + speed: 1.5, + format: "wav", + tier: "premium", + }; + + const wavResult: SynthesisResult = { + audio: mockAudioBuffer, + format: "wav", + voice: "af_heart", + tier: "premium", + }; + + mockSpeechService.synthesize.mockResolvedValue(wavResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Test with options", { + voice: "af_heart", + speed: 1.5, + format: "wav", + 
tier: "premium", + }); + expect(result).toBeInstanceOf(StreamableFile); + }); + + it("should set correct Content-Type for mp3 format", async () => { + const dto: SynthesizeDto = { text: "Hello", format: "mp3" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + expect(result).toBeInstanceOf(StreamableFile); + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/mpeg"); + }); + + it("should set correct Content-Type for wav format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const wavResult: SynthesisResult = { ...mockSynthesisResult, format: "wav" }; + + mockSpeechService.synthesize.mockResolvedValue(wavResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/wav"); + }); + + it("should set correct Content-Type for opus format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const opusResult: SynthesisResult = { ...mockSynthesisResult, format: "opus" }; + + mockSpeechService.synthesize.mockResolvedValue(opusResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/opus"); + }); + + it("should set correct Content-Type for flac format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const flacResult: SynthesisResult = { ...mockSynthesisResult, format: "flac" }; + + mockSpeechService.synthesize.mockResolvedValue(flacResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/flac"); + }); + + it("should set correct Content-Type for aac format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const aacResult: SynthesisResult = { ...mockSynthesisResult, format: 
"aac" }; + + mockSpeechService.synthesize.mockResolvedValue(aacResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/aac"); + }); + + it("should set correct Content-Type for pcm format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const pcmResult: SynthesisResult = { ...mockSynthesisResult, format: "pcm" }; + + mockSpeechService.synthesize.mockResolvedValue(pcmResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/pcm"); + }); + + it("should set Content-Disposition header for download with correct extension", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.disposition).toContain("attachment"); + expect(headers.disposition).toContain("speech.mp3"); + }); + + it("should set Content-Disposition with correct file extension for wav", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const wavResult: SynthesisResult = { ...mockSynthesisResult, format: "wav" }; + + mockSpeechService.synthesize.mockResolvedValue(wavResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.disposition).toContain("speech.wav"); + }); + + it("should set Content-Length header based on audio buffer size", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.length).toBe(mockAudioBuffer.length); + }); + + it("should 
propagate ServiceUnavailableException from service", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + + mockSpeechService.synthesize.mockRejectedValue( + new ServiceUnavailableException("No TTS providers are available") + ); + + await expect(controller.synthesize(dto, mockWorkspaceId, mockUser)).rejects.toThrow( + ServiceUnavailableException + ); + }); + }); + + // ============================================== + // GET /api/speech/voices (Issue #396) + // ============================================== + + describe("getVoices", () => { + const mockVoices: VoiceInfo[] = [ + { + id: "af_heart", + name: "Heart", + language: "en", + tier: "default", + isDefault: true, + }, + { + id: "af_sky", + name: "Sky", + language: "en", + tier: "default", + isDefault: false, + }, + { + id: "chatterbox-voice", + name: "Chatterbox Default", + language: "en", + tier: "premium", + isDefault: true, + }, + ]; + + it("should return all voices when no tier filter is provided", async () => { + mockSpeechService.listVoices.mockResolvedValue(mockVoices); + + const result = await controller.getVoices(mockWorkspaceId); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith(undefined); + expect(result).toEqual({ data: mockVoices }); + }); + + it("should filter voices by default tier", async () => { + const defaultVoices = mockVoices.filter((v) => v.tier === "default"); + mockSpeechService.listVoices.mockResolvedValue(defaultVoices); + + const result = await controller.getVoices(mockWorkspaceId, "default"); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith("default"); + expect(result).toEqual({ data: defaultVoices }); + }); + + it("should filter voices by premium tier", async () => { + const premiumVoices = mockVoices.filter((v) => v.tier === "premium"); + mockSpeechService.listVoices.mockResolvedValue(premiumVoices); + + const result = await controller.getVoices(mockWorkspaceId, "premium"); + + 
expect(mockSpeechService.listVoices).toHaveBeenCalledWith("premium"); + expect(result).toEqual({ data: premiumVoices }); + }); + + it("should return empty array when no voices are available", async () => { + mockSpeechService.listVoices.mockResolvedValue([]); + + const result = await controller.getVoices(mockWorkspaceId); + + expect(result).toEqual({ data: [] }); + }); + + it("should return empty array when filtering by tier with no matching voices", async () => { + mockSpeechService.listVoices.mockResolvedValue([]); + + const result = await controller.getVoices(mockWorkspaceId, "fallback"); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith("fallback"); + expect(result).toEqual({ data: [] }); + }); + }); +}); diff --git a/apps/api/src/speech/speech.controller.ts b/apps/api/src/speech/speech.controller.ts new file mode 100644 index 0000000..a38f36a --- /dev/null +++ b/apps/api/src/speech/speech.controller.ts @@ -0,0 +1,193 @@ +/** + * SpeechController + * + * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services. + * Handles audio file uploads for transcription, text-to-speech synthesis, + * voice listing, and provider health status. 
+ * + * Endpoints: + * - POST /api/speech/transcribe - Transcribe uploaded audio file to text + * - POST /api/speech/synthesize - Synthesize text to audio (TTS) + * - GET /api/speech/voices - List available TTS voices + * - GET /api/speech/health - Check STT/TTS provider availability + * + * Issue #392, #396 + */ + +import { + Controller, + Post, + Get, + Body, + Query, + UseGuards, + UseInterceptors, + UploadedFile, + StreamableFile, +} from "@nestjs/common"; +import { FileInterceptor } from "@nestjs/platform-express"; +import { SpeechService } from "./speech.service"; +import { TranscribeDto } from "./dto/transcribe.dto"; +import { SynthesizeDto } from "./dto/synthesize.dto"; +import { AudioValidationPipe } from "./pipes/audio-validation.pipe"; +import { AuthGuard } from "../auth/guards/auth.guard"; +import { WorkspaceGuard, PermissionGuard } from "../common/guards"; +import { Workspace, Permission, RequirePermission } from "../common/decorators"; +import { CurrentUser } from "../auth/decorators/current-user.decorator"; +import type { AuthenticatedUser } from "../common/types/user.types"; +import type { + AudioFormat, + SynthesizeOptions, + TranscribeOptions, + TranscriptionResult, + VoiceInfo, + SpeechTier, +} from "./interfaces/speech-types"; + +/** + * Map audio format to MIME type for Content-Type header. + */ +const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = { + mp3: "audio/mpeg", + wav: "audio/wav", + opus: "audio/opus", + flac: "audio/flac", + aac: "audio/aac", + pcm: "audio/pcm", +}; + +/** + * Health status for a single speech provider category. + */ +interface ProviderHealth { + available: boolean; +} + +/** + * Combined health status response for all speech providers. 
+ */ +interface SpeechHealthResponse { + data: { + stt: ProviderHealth; + tts: ProviderHealth; + }; +} + +@Controller("speech") +@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard) +export class SpeechController { + constructor(private readonly speechService: SpeechService) {} + + /** + * POST /api/speech/transcribe + * + * Transcribe an uploaded audio file to text. + * Accepts multipart form data with an audio file and optional transcription parameters. + * + * @param file - Uploaded audio file (validated by AudioValidationPipe) + * @param dto - Optional transcription parameters (language, model, prompt, temperature) + * @param _workspaceId - Workspace context (validated by WorkspaceGuard) + * @param _user - Authenticated user (validated by AuthGuard) + * @returns Transcription result wrapped in standard data envelope + */ + @Post("transcribe") + @RequirePermission(Permission.WORKSPACE_MEMBER) + @UseInterceptors(FileInterceptor("file")) + async transcribe( + @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File, + @Body() dto: TranscribeDto, + @Workspace() _workspaceId: string, + @CurrentUser() _user: AuthenticatedUser + ): Promise<{ data: TranscriptionResult }> { + const options: TranscribeOptions = { mimeType: file.mimetype }; + if (dto.language !== undefined) options.language = dto.language; + if (dto.model !== undefined) options.model = dto.model; + if (dto.prompt !== undefined) options.prompt = dto.prompt; + if (dto.temperature !== undefined) options.temperature = dto.temperature; + + const result = await this.speechService.transcribe(file.buffer, options); + + return { data: result }; + } + + /** + * GET /api/speech/health + * + * Check availability of STT and TTS providers. 
+ * + * @param _workspaceId - Workspace context (validated by WorkspaceGuard) + * @returns Health status of STT and TTS providers + */ + @Get("health") + @RequirePermission(Permission.WORKSPACE_ANY) + health(@Workspace() _workspaceId: string): SpeechHealthResponse { + return { + data: { + stt: { available: this.speechService.isSTTAvailable() }, + tts: { available: this.speechService.isTTSAvailable() }, + }, + }; + } + + /** + * POST /api/speech/synthesize + * + * Synthesize text to audio using TTS providers. + * Accepts JSON body with text and optional voice/format/speed/tier parameters. + * Returns audio binary with appropriate Content-Type and Content-Disposition headers. + * + * Provider selection follows fallback chain: requested tier -> default -> fallback. + * + * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?) + * @param _workspaceId - Workspace context (validated by WorkspaceGuard) + * @param _user - Authenticated user (validated by AuthGuard) + * @returns StreamableFile containing synthesized audio + * + * Issue #396 + */ + @Post("synthesize") + @RequirePermission(Permission.WORKSPACE_MEMBER) + async synthesize( + @Body() dto: SynthesizeDto, + @Workspace() _workspaceId: string, + @CurrentUser() _user: AuthenticatedUser + ): Promise<StreamableFile> { + const options: SynthesizeOptions = {}; + if (dto.voice !== undefined) options.voice = dto.voice; + if (dto.speed !== undefined) options.speed = dto.speed; + if (dto.format !== undefined) options.format = dto.format; + if (dto.tier !== undefined) options.tier = dto.tier; + + const result = await this.speechService.synthesize(dto.text, options); + + const mimeType = AUDIO_FORMAT_MIME_TYPES[result.format]; + + return new StreamableFile(result.audio, { + type: mimeType, + disposition: `attachment; filename="speech.${result.format}"`, + length: result.audio.length, + }); + } + + /** + * GET /api/speech/voices + * + * List available TTS voices across all tiers. 
+ * Optionally filter by tier using the `tier` query parameter. + * + * @param _workspaceId - Workspace context (validated by WorkspaceGuard) + * @param tier - Optional tier filter (default, premium, fallback) + * @returns Voice information array wrapped in standard data envelope + * + * Issue #396 + */ + @Get("voices") + @RequirePermission(Permission.WORKSPACE_ANY) + async getVoices( + @Workspace() _workspaceId: string, + @Query("tier") tier?: SpeechTier + ): Promise<{ data: VoiceInfo[] }> { + const voices = await this.speechService.listVoices(tier); + return { data: voices }; + } +} diff --git a/apps/api/src/speech/speech.module.ts b/apps/api/src/speech/speech.module.ts index 840123e..d2151ef 100644 --- a/apps/api/src/speech/speech.module.ts +++ b/apps/api/src/speech/speech.module.ts @@ -31,12 +31,14 @@ import { type SpeechConfig, } from "./speech.config"; import { SpeechService } from "./speech.service"; +import { SpeechController } from "./speech.controller"; import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; import { createTTSProviders } from "./providers/tts-provider.factory"; @Module({ imports: [ConfigModule.forFeature(speechConfig)], + controllers: [SpeechController], providers: [ SpeechService, // STT provider: conditionally register SpeachesSttProvider when STT is enabled