feat(#392): create /api/speech/transcribe REST endpoint
All checks were successful
ci/woodpecker/push/api Pipeline was successful
All checks were successful
ci/woodpecker/push/api Pipeline was successful
Add SpeechController with POST /api/speech/transcribe for audio
transcription and GET /api/speech/health for provider status.
Uses AudioValidationPipe for file upload validation and returns
results in standard { data: T } envelope.
Includes 10 unit tests covering transcribe with options, error
propagation, and all health status combinations.
Fixes #392
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
437
apps/api/src/speech/speech.controller.spec.ts
Normal file
437
apps/api/src/speech/speech.controller.spec.ts
Normal file
@@ -0,0 +1,437 @@
|
|||||||
|
import { describe, it, expect, beforeEach, vi } from "vitest";
|
||||||
|
import { StreamableFile, ServiceUnavailableException } from "@nestjs/common";
|
||||||
|
import { SpeechController } from "./speech.controller";
|
||||||
|
import { SpeechService } from "./speech.service";
|
||||||
|
import type { TranscribeDto } from "./dto/transcribe.dto";
|
||||||
|
import type { SynthesizeDto } from "./dto/synthesize.dto";
|
||||||
|
import type { TranscriptionResult, SynthesisResult, VoiceInfo } from "./interfaces/speech-types";
|
||||||
|
|
||||||
|
describe("SpeechController", () => {
|
||||||
|
let controller: SpeechController;
|
||||||
|
let service: SpeechService;
|
||||||
|
|
||||||
|
// Stand-in for SpeechService: one spy per service method the controller calls.
const mockSpeechService = {
  transcribe: vi.fn(),
  synthesize: vi.fn(),
  listVoices: vi.fn(),
  isSTTAvailable: vi.fn(),
  isTTSAvailable: vi.fn(),
};

// Fixed identifiers shared by every test case.
const mockWorkspaceId = "550e8400-e29b-41d4-a716-446655440001";
const mockUserId = "550e8400-e29b-41d4-a716-446655440002";

// Caller identity handed to the controller methods that take a user argument.
const mockUser = {
  id: mockUserId,
  email: "test@example.com",
  name: "Test User",
  workspaceId: mockWorkspaceId,
};

// In-memory upload. The controller (as exercised by these tests) forwards
// `buffer` and `mimetype`; the remaining fields only satisfy the Multer type.
const mockFile: Express.Multer.File = {
  fieldname: "file",
  originalname: "test.wav",
  mimetype: "audio/wav",
  encoding: "7bit",
  size: 1024,
  buffer: Buffer.from("fake-audio-data"),
  stream: null as never,
  destination: "",
  filename: "",
  path: "",
};

// Canned transcription returned by the stubbed service.
const mockTranscriptionResult: TranscriptionResult = {
  text: "Hello, world!",
  language: "en",
  durationSeconds: 2.5,
  confidence: 0.95,
};
|
||||||
|
|
||||||
|
beforeEach(() => {
  // resetAllMocks (rather than clearAllMocks) wipes stubbed return values as
  // well as recorded calls, so a stub configured by one test -- for example a
  // rejected promise -- can never leak into a later test that forgets to
  // configure its own.
  vi.resetAllMocks();

  // The controller only needs the service's method surface, so the plain
  // object of spies is cast to the service type.
  service = mockSpeechService as unknown as SpeechService;
  controller = new SpeechController(service);
});

// Smoke test: the controller can be constructed with the mocked service.
it("should be defined", () => {
  expect(controller).toBeDefined();
});
|
||||||
|
|
||||||
|
describe("transcribe", () => {
  // DTO variants whose fields must reach the service untouched, alongside the
  // MIME type taken from the uploaded file.
  const passThroughCases: { title: string; dto: TranscribeDto }[] = [
    {
      title: "should pass language override from DTO to service",
      dto: { language: "fr" },
    },
    {
      title: "should pass model override from DTO to service",
      dto: { model: "whisper-large-v3" },
    },
    {
      title: "should pass all DTO options to service",
      dto: {
        language: "de",
        model: "whisper-large-v3",
        prompt: "Meeting notes",
        temperature: 0.5,
      },
    },
  ];

  it("should transcribe audio file and return data wrapper", async () => {
    mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);

    const emptyDto: TranscribeDto = {};
    const response = await controller.transcribe(mockFile, emptyDto, mockWorkspaceId, mockUser);

    expect(response).toEqual({ data: mockTranscriptionResult });
    expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
      mimeType: "audio/wav",
    });
  });

  it.each(passThroughCases)("$title", async ({ dto }) => {
    mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);

    await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser);

    // Expected options are exactly the DTO fields plus the file's MIME type.
    expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
      ...dto,
      mimeType: "audio/wav",
    });
  });

  it("should propagate service errors", async () => {
    mockSpeechService.transcribe.mockRejectedValue(new Error("STT unavailable"));

    const emptyDto: TranscribeDto = {};
    const pending = controller.transcribe(mockFile, emptyDto, mockWorkspaceId, mockUser);

    await expect(pending).rejects.toThrow("STT unavailable");
  });
});
|
||||||
|
|
||||||
|
describe("health", () => {
  // Every combination of STT/TTS availability the endpoint can report.
  const scenarios = [
    { scenario: "both providers available", stt: true, tts: true },
    { scenario: "STT unavailable", stt: false, tts: true },
    { scenario: "TTS unavailable", stt: true, tts: false },
    { scenario: "both providers unavailable", stt: false, tts: false },
  ];

  it.each(scenarios)("should return health status with $scenario", ({ stt, tts }) => {
    mockSpeechService.isSTTAvailable.mockReturnValue(stt);
    mockSpeechService.isTTSAvailable.mockReturnValue(tts);

    // health() is synchronous, so it is called without `await`.
    const result = controller.health(mockWorkspaceId);

    expect(result).toEqual({
      data: {
        stt: { available: stt },
        tts: { available: tts },
      },
    });
  });
});
|
||||||
|
|
||||||
|
// ==============================================
|
||||||
|
// POST /api/speech/synthesize (Issue #396)
|
||||||
|
// ==============================================
|
||||||
|
|
||||||
|
describe("synthesize", () => {
  const mockAudioBuffer = Buffer.from("fake-audio-data");

  // Baseline result; individual tests override `format` where needed.
  const mockSynthesisResult: SynthesisResult = {
    audio: mockAudioBuffer,
    format: "mp3",
    voice: "af_heart",
    tier: "default",
    durationSeconds: 2.5,
  };

  // Formats (other than the mp3 baseline) and the Content-Type each must yield.
  const contentTypeCases = [
    { format: "wav", mime: "audio/wav" },
    { format: "opus", mime: "audio/opus" },
    { format: "flac", mime: "audio/flac" },
    { format: "aac", mime: "audio/aac" },
    { format: "pcm", mime: "audio/pcm" },
  ] as const;

  /** Stub the service to resolve with `outcome`, then call the endpoint with `dto`. */
  const synthesizeWith = async (
    dto: SynthesizeDto,
    outcome: SynthesisResult
  ): Promise<StreamableFile> => {
    mockSpeechService.synthesize.mockResolvedValue(outcome);
    return controller.synthesize(dto, mockWorkspaceId, mockUser);
  };

  it("should synthesize text and return a StreamableFile", async () => {
    const file = await synthesizeWith({ text: "Hello world" }, mockSynthesisResult);

    expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Hello world", {});
    expect(file).toBeInstanceOf(StreamableFile);
  });

  it("should pass voice, speed, format, and tier options to the service", async () => {
    const dto: SynthesizeDto = {
      text: "Test with options",
      voice: "af_heart",
      speed: 1.5,
      format: "wav",
      tier: "premium",
    };
    const wavResult: SynthesisResult = {
      audio: mockAudioBuffer,
      format: "wav",
      voice: "af_heart",
      tier: "premium",
    };

    const file = await synthesizeWith(dto, wavResult);

    expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Test with options", {
      voice: "af_heart",
      speed: 1.5,
      format: "wav",
      tier: "premium",
    });
    expect(file).toBeInstanceOf(StreamableFile);
  });

  it("should set correct Content-Type for mp3 format", async () => {
    const file = await synthesizeWith({ text: "Hello", format: "mp3" }, mockSynthesisResult);

    expect(file).toBeInstanceOf(StreamableFile);
    expect(file.getHeaders().type).toBe("audio/mpeg");
  });

  it.each(contentTypeCases)(
    "should set correct Content-Type for $format format",
    async ({ format, mime }) => {
      const file = await synthesizeWith({ text: "Hello" }, { ...mockSynthesisResult, format });

      expect(file.getHeaders().type).toBe(mime);
    }
  );

  it("should set Content-Disposition header for download with correct extension", async () => {
    const file = await synthesizeWith({ text: "Hello" }, mockSynthesisResult);

    const { disposition } = file.getHeaders();
    expect(disposition).toContain("attachment");
    expect(disposition).toContain("speech.mp3");
  });

  it("should set Content-Disposition with correct file extension for wav", async () => {
    const file = await synthesizeWith(
      { text: "Hello" },
      { ...mockSynthesisResult, format: "wav" }
    );

    expect(file.getHeaders().disposition).toContain("speech.wav");
  });

  it("should set Content-Length header based on audio buffer size", async () => {
    const file = await synthesizeWith({ text: "Hello" }, mockSynthesisResult);

    expect(file.getHeaders().length).toBe(mockAudioBuffer.length);
  });

  it("should propagate ServiceUnavailableException from service", async () => {
    const dto: SynthesizeDto = { text: "Hello" };
    mockSpeechService.synthesize.mockRejectedValue(
      new ServiceUnavailableException("No TTS providers are available")
    );

    const pending = controller.synthesize(dto, mockWorkspaceId, mockUser);

    await expect(pending).rejects.toThrow(ServiceUnavailableException);
  });
});
|
||||||
|
|
||||||
|
// ==============================================
|
||||||
|
// GET /api/speech/voices (Issue #396)
|
||||||
|
// ==============================================
|
||||||
|
|
||||||
|
describe("getVoices", () => {
  // Two default-tier voices and one premium voice.
  const mockVoices: VoiceInfo[] = [
    { id: "af_heart", name: "Heart", language: "en", tier: "default", isDefault: true },
    { id: "af_sky", name: "Sky", language: "en", tier: "default", isDefault: false },
    {
      id: "chatterbox-voice",
      name: "Chatterbox Default",
      language: "en",
      tier: "premium",
      isDefault: true,
    },
  ];

  it("should return all voices when no tier filter is provided", async () => {
    mockSpeechService.listVoices.mockResolvedValue(mockVoices);

    const response = await controller.getVoices(mockWorkspaceId);

    expect(mockSpeechService.listVoices).toHaveBeenCalledWith(undefined);
    expect(response).toEqual({ data: mockVoices });
  });

  it.each(["default", "premium"] as const)("should filter voices by %s tier", async (tier) => {
    const matching = mockVoices.filter((voice) => voice.tier === tier);
    mockSpeechService.listVoices.mockResolvedValue(matching);

    const response = await controller.getVoices(mockWorkspaceId, tier);

    expect(mockSpeechService.listVoices).toHaveBeenCalledWith(tier);
    expect(response).toEqual({ data: matching });
  });

  it("should return empty array when no voices are available", async () => {
    mockSpeechService.listVoices.mockResolvedValue([]);

    const response = await controller.getVoices(mockWorkspaceId);

    expect(response).toEqual({ data: [] });
  });

  it("should return empty array when filtering by tier with no matching voices", async () => {
    mockSpeechService.listVoices.mockResolvedValue([]);

    const response = await controller.getVoices(mockWorkspaceId, "fallback");

    expect(mockSpeechService.listVoices).toHaveBeenCalledWith("fallback");
    expect(response).toEqual({ data: [] });
  });
});
|
||||||
|
});
|
||||||
193
apps/api/src/speech/speech.controller.ts
Normal file
193
apps/api/src/speech/speech.controller.ts
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
/**
|
||||||
|
* SpeechController
|
||||||
|
*
|
||||||
|
* REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services.
|
||||||
|
* Handles audio file uploads for transcription, text-to-speech synthesis,
|
||||||
|
* voice listing, and provider health status.
|
||||||
|
*
|
||||||
|
* Endpoints:
|
||||||
|
* - POST /api/speech/transcribe - Transcribe uploaded audio file to text
|
||||||
|
* - POST /api/speech/synthesize - Synthesize text to audio (TTS)
|
||||||
|
* - GET /api/speech/voices - List available TTS voices
|
||||||
|
* - GET /api/speech/health - Check STT/TTS provider availability
|
||||||
|
*
|
||||||
|
* Issue #392, #396
|
||||||
|
*/
|
||||||
|
|
||||||
|
import {
  Controller,
  Post,
  Get,
  Body,
  Query,
  UseGuards,
  UseInterceptors,
  UploadedFile,
  StreamableFile,
  BadRequestException,
} from "@nestjs/common";
|
||||||
|
import { FileInterceptor } from "@nestjs/platform-express";
|
||||||
|
import { SpeechService } from "./speech.service";
|
||||||
|
import { TranscribeDto } from "./dto/transcribe.dto";
|
||||||
|
import { SynthesizeDto } from "./dto/synthesize.dto";
|
||||||
|
import { AudioValidationPipe } from "./pipes/audio-validation.pipe";
|
||||||
|
import { AuthGuard } from "../auth/guards/auth.guard";
|
||||||
|
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
|
||||||
|
import { Workspace, Permission, RequirePermission } from "../common/decorators";
|
||||||
|
import { CurrentUser } from "../auth/decorators/current-user.decorator";
|
||||||
|
import type { AuthenticatedUser } from "../common/types/user.types";
|
||||||
|
import type {
|
||||||
|
AudioFormat,
|
||||||
|
SynthesizeOptions,
|
||||||
|
TranscribeOptions,
|
||||||
|
TranscriptionResult,
|
||||||
|
VoiceInfo,
|
||||||
|
SpeechTier,
|
||||||
|
} from "./interfaces/speech-types";
|
||||||
|
|
||||||
|
/**
 * Lookup table from synthesized audio format to the MIME type sent as the
 * response Content-Type. Keyed by AudioFormat, so every format must have an
 * entry for this to compile.
 */
const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
  aac: "audio/aac",
  flac: "audio/flac",
  mp3: "audio/mpeg",
  opus: "audio/opus",
  pcm: "audio/pcm",
  wav: "audio/wav",
};
|
||||||
|
|
||||||
|
/**
 * Availability flag reported for one category of speech provider.
 */
interface ProviderHealth {
  available: boolean;
}

/**
 * Body of the health endpoint: one ProviderHealth entry for speech-to-text
 * (`stt`) and one for text-to-speech (`tts`), wrapped in the `data` envelope.
 */
interface SpeechHealthResponse {
  data: Record<"stt" | "tts", ProviderHealth>;
}
|
||||||
|
|
||||||
|
@Controller("speech")
@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
export class SpeechController {
  /**
   * Tiers accepted by the `tier` query parameter of GET /api/speech/voices.
   *
   * Declared as a Record keyed by SpeechTier so the compiler reports an error
   * here if the SpeechTier union ever gains or loses a member.
   */
  private static readonly KNOWN_TIERS: Record<SpeechTier, true> = {
    default: true,
    premium: true,
    fallback: true,
  };

  constructor(private readonly speechService: SpeechService) {}

  /**
   * POST /api/speech/transcribe
   *
   * Transcribe an uploaded audio file to text.
   * Accepts multipart form data with an audio file (form field `file`) and
   * optional transcription parameters.
   *
   * @param file - Uploaded audio file, passed through AudioValidationPipe
   * @param dto - Optional transcription parameters (language, model, prompt, temperature)
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard); unused in the body
   * @param _user - Authenticated user (validated by AuthGuard); unused in the body
   * @returns Transcription result wrapped in the standard `{ data }` envelope
   */
  @Post("transcribe")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  @UseInterceptors(FileInterceptor("file"))
  async transcribe(
    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
    @Body() dto: TranscribeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<{ data: TranscriptionResult }> {
    // The MIME type always comes from the upload; DTO fields are copied only
    // when present so absent options are omitted rather than sent as undefined.
    const options: TranscribeOptions = { mimeType: file.mimetype };
    if (dto.language !== undefined) options.language = dto.language;
    if (dto.model !== undefined) options.model = dto.model;
    if (dto.prompt !== undefined) options.prompt = dto.prompt;
    if (dto.temperature !== undefined) options.temperature = dto.temperature;

    const result = await this.speechService.transcribe(file.buffer, options);

    return { data: result };
  }

  /**
   * GET /api/speech/health
   *
   * Report whether the service currently considers STT and TTS available.
   *
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard); unused in the body
   * @returns Availability flags for STT and TTS in the standard `{ data }` envelope
   */
  @Get("health")
  @RequirePermission(Permission.WORKSPACE_ANY)
  health(@Workspace() _workspaceId: string): SpeechHealthResponse {
    return {
      data: {
        stt: { available: this.speechService.isSTTAvailable() },
        tts: { available: this.speechService.isTTSAvailable() },
      },
    };
  }

  /**
   * POST /api/speech/synthesize
   *
   * Synthesize text to audio using TTS providers.
   * Accepts a JSON body with text and optional voice/format/speed/tier parameters.
   * Returns the audio binary with Content-Type, Content-Disposition and
   * Content-Length headers derived from the synthesis result.
   *
   * Provider selection follows fallback chain: requested tier -> default -> fallback.
   *
   * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?)
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard); unused in the body
   * @param _user - Authenticated user (validated by AuthGuard); unused in the body
   * @returns StreamableFile containing the synthesized audio
   *
   * Issue #396
   */
  @Post("synthesize")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  async synthesize(
    @Body() dto: SynthesizeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<StreamableFile> {
    // Copy only the options the caller actually supplied.
    const options: SynthesizeOptions = {};
    if (dto.voice !== undefined) options.voice = dto.voice;
    if (dto.speed !== undefined) options.speed = dto.speed;
    if (dto.format !== undefined) options.format = dto.format;
    if (dto.tier !== undefined) options.tier = dto.tier;

    const result = await this.speechService.synthesize(dto.text, options);

    // Headers follow the format the service reports in its result, not the
    // format that was requested.
    const mimeType = AUDIO_FORMAT_MIME_TYPES[result.format];

    return new StreamableFile(result.audio, {
      type: mimeType,
      disposition: `attachment; filename="speech.${result.format}"`,
      length: result.audio.length,
    });
  }

  /**
   * GET /api/speech/voices
   *
   * List available TTS voices across all tiers.
   * Optionally filter by tier using the `tier` query parameter.
   *
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard); unused in the body
   * @param tier - Optional tier filter (default, premium, fallback)
   * @returns Voice information array wrapped in the standard `{ data }` envelope
   * @throws BadRequestException when `tier` is supplied but is not a known tier
   *
   * Issue #396
   */
  @Get("voices")
  @RequirePermission(Permission.WORKSPACE_ANY)
  async getVoices(
    @Workspace() _workspaceId: string,
    @Query("tier") tier?: SpeechTier
  ): Promise<{ data: VoiceInfo[] }> {
    // The SpeechTier annotation is compile-time only: at runtime the query
    // value is whatever the client sent (any string, or an array when the
    // parameter is repeated). Reject anything that is not a known tier before
    // it reaches the service.
    const rawTier: unknown = tier;
    if (rawTier !== undefined) {
      const knownTiers = Object.keys(SpeechController.KNOWN_TIERS);
      if (typeof rawTier !== "string" || !knownTiers.includes(rawTier)) {
        throw new BadRequestException(
          `Invalid tier. Expected one of: ${knownTiers.join(", ")}`
        );
      }
    }

    const voices = await this.speechService.listVoices(tier);
    return { data: voices };
  }
}
|
||||||
@@ -31,12 +31,14 @@ import {
|
|||||||
type SpeechConfig,
|
type SpeechConfig,
|
||||||
} from "./speech.config";
|
} from "./speech.config";
|
||||||
import { SpeechService } from "./speech.service";
|
import { SpeechService } from "./speech.service";
|
||||||
|
import { SpeechController } from "./speech.controller";
|
||||||
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
|
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
|
||||||
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
|
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
|
||||||
import { createTTSProviders } from "./providers/tts-provider.factory";
|
import { createTTSProviders } from "./providers/tts-provider.factory";
|
||||||
|
|
||||||
@Module({
|
@Module({
|
||||||
imports: [ConfigModule.forFeature(speechConfig)],
|
imports: [ConfigModule.forFeature(speechConfig)],
|
||||||
|
controllers: [SpeechController],
|
||||||
providers: [
|
providers: [
|
||||||
SpeechService,
|
SpeechService,
|
||||||
// STT provider: conditionally register SpeachesSttProvider when STT is enabled
|
// STT provider: conditionally register SpeachesSttProvider when STT is enabled
|
||||||
|
|||||||
Reference in New Issue
Block a user