diff --git a/apps/api/src/speech/speech.integration.spec.ts b/apps/api/src/speech/speech.integration.spec.ts new file mode 100644 index 0000000..033a4e9 --- /dev/null +++ b/apps/api/src/speech/speech.integration.spec.ts @@ -0,0 +1,933 @@ +/** + * Speech Services E2E Integration Tests + * + * Tests the full speech pipeline from API endpoints through to mocked external providers. + * Covers REST transcription, synthesis, provider fallback, WebSocket streaming, + * audio validation, file size limits, authentication, voice listing, and health checks. + * + * Uses NestJS testing module with supertest for HTTP testing and direct gateway + * invocation for WebSocket streaming tests. + * + * Issue #405 + */ + +import { describe, it, expect, beforeAll, beforeEach, afterAll, vi } from "vitest"; +import { Test } from "@nestjs/testing"; +import { + type INestApplication, + type CanActivate, + type ExecutionContext, + UnauthorizedException, + ValidationPipe, +} from "@nestjs/common"; +import request from "supertest"; +import type { App } from "supertest/types"; + +import { SpeechController } from "./speech.controller"; +import { SpeechService } from "./speech.service"; +import { SpeechGateway } from "./speech.gateway"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { speechConfig } from "./speech.config"; +import type { SpeechConfig } from "./speech.config"; +import type { ISTTProvider } from "./interfaces/stt-provider.interface"; +import type { ITTSProvider } from "./interfaces/tts-provider.interface"; +import type { + TranscriptionResult, + SynthesisResult, + VoiceInfo, + SpeechTier, +} from "./interfaces/speech-types"; +import { AuthGuard } from "../auth/guards/auth.guard"; +import { WorkspaceGuard, PermissionGuard } from "../common/guards"; +import { AuthService } from "../auth/auth.service"; +import { PrismaService } from "../prisma/prisma.service"; + +// ========================================== +// Test Fixtures +// 
========================================== + +/** + * Small WAV file header (44 bytes) + minimal data. + * Not a real audio file, but has the correct structure for testing. + */ +const TEST_AUDIO_BUFFER = Buffer.alloc(1024, 0); + +const MOCK_WORKSPACE_ID = "550e8400-e29b-41d4-a716-446655440001"; +const MOCK_USER_ID = "550e8400-e29b-41d4-a716-446655440002"; + +const MOCK_USER = { + id: MOCK_USER_ID, + email: "test@example.com", + name: "Test User", + workspaceId: MOCK_WORKSPACE_ID, +}; + +const MOCK_TRANSCRIPTION_RESULT: TranscriptionResult = { + text: "Hello, this is a test transcription.", + language: "en", + durationSeconds: 3.2, + confidence: 0.97, + segments: [ + { text: "Hello, this is a test transcription.", start: 0, end: 3.2, confidence: 0.97 }, + ], +}; + +const MOCK_SYNTHESIS_RESULT: SynthesisResult = { + audio: Buffer.from("fake-synthesized-audio-data-mp3"), + format: "mp3", + voice: "af_heart", + tier: "default" as SpeechTier, + durationSeconds: 2.1, +}; + +const MOCK_VOICES: VoiceInfo[] = [ + { id: "af_heart", name: "Heart", language: "en", tier: "default", isDefault: true }, + { id: "af_sky", name: "Sky", language: "en", tier: "default", isDefault: false }, + { + id: "chatterbox-default", + name: "Chatterbox", + language: "en", + tier: "premium", + isDefault: true, + }, +]; + +const MOCK_SPEECH_CONFIG: SpeechConfig = { + stt: { + enabled: true, + baseUrl: "http://speaches:8000/v1", + model: "test-model", + language: "en", + }, + tts: { + default: { enabled: true, url: "http://kokoro:8880/v1", voice: "af_heart", format: "mp3" }, + premium: { enabled: true, url: "http://chatterbox:8881/v1" }, + fallback: { enabled: true, url: "http://openedai:8000/v1" }, + }, + limits: { + maxUploadSize: 25_000_000, + maxDurationSeconds: 600, + maxTextLength: 4096, + }, +}; + +// ========================================== +// Mock Providers +// ========================================== + +function createMockSTTProvider(): ISTTProvider { + return { + name: "mock-stt", + 
transcribe: vi.fn().mockResolvedValue(MOCK_TRANSCRIPTION_RESULT),
+    isHealthy: vi.fn().mockResolvedValue(true),
+  };
+}
+
+function createMockTTSProvider(tier: SpeechTier, name: string): ITTSProvider {
+  const voices = MOCK_VOICES.filter((v) => v.tier === tier);
+  return {
+    name,
+    tier,
+    synthesize: vi.fn().mockResolvedValue({
+      ...MOCK_SYNTHESIS_RESULT,
+      tier,
+    }),
+    listVoices: vi.fn().mockResolvedValue(voices),
+    isHealthy: vi.fn().mockResolvedValue(true),
+  };
+}
+
+// ==========================================
+// Test Guards
+// ==========================================
+
+/**
+ * Conditional auth guard for testing.
+ * Authenticates requests that carry `Authorization: Bearer test-token`.
+ * Rejects all others with UnauthorizedException.
+ */
+class TestAuthGuard implements CanActivate {
+  canActivate(context: ExecutionContext): boolean {
+    const req = context.switchToHttp().getRequest<{
+      headers: Record<string, string>;
+      user?: typeof MOCK_USER;
+      cookies?: Record<string, string>;
+    }>();
+    const authHeader = req.headers.authorization;
+    const cookieToken = req.cookies?.["better-auth.session_token"];
+
+    if (authHeader === "Bearer test-token" || cookieToken === "test-token") {
+      req.user = { ...MOCK_USER };
+      return true;
+    }
+
+    throw new UnauthorizedException("No authentication token provided");
+  }
+}
+
+/**
+ * Test workspace guard that attaches a mock workspace to the request.
+ */
+class TestWorkspaceGuard implements CanActivate {
+  canActivate(context: ExecutionContext): boolean {
+    const req = context.switchToHttp().getRequest<{
+      workspace?: { id: string };
+      headers: Record<string, string>;
+    }>();
+    const workspaceId = req.headers["x-workspace-id"] ?? MOCK_WORKSPACE_ID;
+    req.workspace = { id: workspaceId as string };
+    return true;
+  }
+}
+
+/**
+ * Test permission guard that always allows access.
+ */
+class TestPermissionGuard implements CanActivate {
+  canActivate(): boolean {
+    return true;
+  }
+}
+
+// ==========================================
+// Tests
+// ==========================================
+
+describe("Speech Services E2E Integration", () => {
+  let app: INestApplication;
+  let mockSTTProvider: ISTTProvider;
+  let defaultTTSProvider: ITTSProvider;
+  let premiumTTSProvider: ITTSProvider;
+  let fallbackTTSProvider: ITTSProvider;
+  let ttsProvidersMap: Map<SpeechTier, ITTSProvider>;
+
+  // WebSocket gateway test dependencies
+  let speechGateway: SpeechGateway;
+  let mockSpeechService: SpeechService;
+
+  beforeAll(async () => {
+    // Create mock providers
+    mockSTTProvider = createMockSTTProvider();
+    defaultTTSProvider = createMockTTSProvider("default", "mock-kokoro");
+    premiumTTSProvider = createMockTTSProvider("premium", "mock-chatterbox");
+    fallbackTTSProvider = createMockTTSProvider("fallback", "mock-piper");
+
+    ttsProvidersMap = new Map<SpeechTier, ITTSProvider>([
+      ["default", defaultTTSProvider],
+      ["premium", premiumTTSProvider],
+      ["fallback", fallbackTTSProvider],
+    ]);
+
+    const moduleRef = await Test.createTestingModule({
+      controllers: [SpeechController],
+      providers: [
+        SpeechService,
+        {
+          provide: speechConfig.KEY,
+          useValue: MOCK_SPEECH_CONFIG,
+        },
+        {
+          provide: STT_PROVIDER,
+          useValue: mockSTTProvider,
+        },
+        {
+          provide: TTS_PROVIDERS,
+          useValue: ttsProvidersMap,
+        },
+        // Gateway dependencies (not tested via HTTP but needed for DI)
+        {
+          provide: SpeechGateway,
+          useFactory: (
+            authService: AuthService,
+            prisma: PrismaService,
+            speechService: SpeechService,
+            config: SpeechConfig
+          ): SpeechGateway => {
+            return new SpeechGateway(authService, prisma, speechService, config);
+          },
+          inject: [AuthService, PrismaService, SpeechService, speechConfig.KEY],
+        },
+        {
+          provide: AuthService,
+          useValue: {
+            verifySession: vi.fn().mockResolvedValue({
+              user: { id: MOCK_USER_ID, email: "test@example.com", name: "Test User" },
+              session: { id: "test-session" },
+            }),
+          },
+        },
+        {
+          provide: PrismaService,
+          useValue: {
+            workspaceMember: {
+              findFirst: vi.fn().mockResolvedValue({
+                userId: MOCK_USER_ID,
+                workspaceId: MOCK_WORKSPACE_ID,
+                role: "MEMBER",
+              }),
+            },
+          },
+        },
+      ],
+    })
+      .overrideGuard(AuthGuard)
+      .useClass(TestAuthGuard)
+      .overrideGuard(WorkspaceGuard)
+      .useClass(TestWorkspaceGuard)
+      .overrideGuard(PermissionGuard)
+      .useClass(TestPermissionGuard)
+      .compile();
+
+    app = moduleRef.createNestApplication();
+    app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true }));
+    await app.init();
+
+    // Capture references for WebSocket tests
+    speechGateway = moduleRef.get(SpeechGateway);
+    mockSpeechService = moduleRef.get(SpeechService);
+  });
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+
+    // Reset default mock behaviors
+    (mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
+      MOCK_TRANSCRIPTION_RESULT
+    );
+    (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+      ...MOCK_SYNTHESIS_RESULT,
+      tier: "default",
+    });
+    (premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+      ...MOCK_SYNTHESIS_RESULT,
+      tier: "premium",
+    });
+    (fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+      ...MOCK_SYNTHESIS_RESULT,
+      tier: "fallback",
+    });
+    (defaultTTSProvider.listVoices as ReturnType<typeof vi.fn>).mockResolvedValue(
+      MOCK_VOICES.filter((v) => v.tier === "default")
+    );
+    (premiumTTSProvider.listVoices as ReturnType<typeof vi.fn>).mockResolvedValue(
+      MOCK_VOICES.filter((v) => v.tier === "premium")
+    );
+    (fallbackTTSProvider.listVoices as ReturnType<typeof vi.fn>).mockResolvedValue([]);
+  });
+
+  afterAll(async () => {
+    if (app) {
+      await app.close();
+    }
+  });
+
+  // ==========================================
+  // Scenario 1: REST Transcription
+  // ==========================================
+  describe("Scenario 1: REST Transcription (POST /speech/transcribe)", () => {
+    it("should transcribe an uploaded audio file and return the transcription result", async () => {
+      const response = await
request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", TEST_AUDIO_BUFFER, { + filename: "test.wav", + contentType: "audio/wav", + }) + .expect(201); + + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toMatchObject({ + text: MOCK_TRANSCRIPTION_RESULT.text, + language: MOCK_TRANSCRIPTION_RESULT.language, + durationSeconds: MOCK_TRANSCRIPTION_RESULT.durationSeconds, + confidence: MOCK_TRANSCRIPTION_RESULT.confidence, + }); + expect(response.body.data.segments).toBeDefined(); + expect(response.body.data.segments).toHaveLength(1); + + expect(mockSTTProvider.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({ mimeType: "audio/wav" }) + ); + }); + + it("should pass optional transcription parameters to the service", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", TEST_AUDIO_BUFFER, { + filename: "test.mp3", + contentType: "audio/mpeg", + }) + .field("language", "fr") + .field("model", "whisper-large-v3") + .field("prompt", "Meeting transcript") + .field("temperature", "0.3") + .expect(201); + + expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text); + + expect(mockSTTProvider.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({ + mimeType: "audio/mpeg", + language: "fr", + model: "whisper-large-v3", + prompt: "Meeting transcript", + temperature: 0.3, + }) + ); + }); + + it("should reject request without an audio file", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .expect(400); + + expect(response.body).toHaveProperty("message"); + }); + }); + + // ========================================== + // Scenario 2: REST Synthesis + // ========================================== + 
describe("Scenario 2: REST Synthesis (POST /speech/synthesize)", () => {
+    it("should synthesize text and return audio binary response", async () => {
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "Hello, world!" })
+        .expect(201);
+
+      // Response should be binary audio
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+      expect(response.headers["content-disposition"]).toContain("attachment");
+      expect(response.headers["content-disposition"]).toContain("speech.mp3");
+      expect(response.body).toBeDefined();
+      expect(Buffer.isBuffer(response.body) || response.body instanceof Buffer).toBe(true);
+    });
+
+    it("should pass voice, speed, format, and tier options to the service", async () => {
+      (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+        audio: Buffer.from("wav-audio-data"),
+        format: "wav",
+        voice: "af_sky",
+        tier: "default",
+        durationSeconds: 1.5,
+      });
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({
+          text: "Test with options",
+          voice: "af_sky",
+          speed: 1.5,
+          format: "wav",
+        })
+        .expect(201);
+
+      expect(response.headers["content-type"]).toContain("audio/wav");
+      expect(response.headers["content-disposition"]).toContain("speech.wav");
+    });
+
+    it("should accept empty text (validation delegated to service)", async () => {
+      // The SynthesizeDto allows empty strings (no @IsNotEmpty decorator).
+      // The service/provider handles empty text semantics.
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "" })
+        .expect(201);
+
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+    });
+
+    it("should reject missing text field", async () => {
+      await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({})
+        .expect(400);
+    });
+  });
+
+  // ==========================================
+  // Scenario 3: Provider Fallback
+  // ==========================================
+  describe("Scenario 3: Provider Fallback", () => {
+    it("should fall back from premium to default when premium fails", async () => {
+      // Make premium provider fail
+      (premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
+        new Error("Premium provider unavailable")
+      );
+
+      // Default provider should succeed
+      (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+        audio: Buffer.from("fallback-audio"),
+        format: "mp3",
+        voice: "af_heart",
+        tier: "default",
+      });
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "Fallback test", tier: "premium" })
+        .expect(201);
+
+      // Premium was attempted first
+      expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
+      // Then default succeeded
+      expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+    });
+
+    it("should fall back through entire chain: premium -> default -> fallback", async () => {
+      // Make premium and default fail
+      (premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
+        new Error("Premium down")
+      );
+      (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
+        new Error("Default down")
+      );
+
+      // Fallback should succeed
+      (fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+        audio:
Buffer.from("fallback-piper-audio"),
+        format: "mp3",
+        voice: "piper-default",
+        tier: "fallback",
+      });
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "Full fallback chain test", tier: "premium" })
+        .expect(201);
+
+      expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
+      expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
+      expect(fallbackTTSProvider.synthesize).toHaveBeenCalled();
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+    });
+
+    it("should return 503 when all TTS providers fail", async () => {
+      (premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
+        new Error("Premium down")
+      );
+      (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
+        new Error("Default down")
+      );
+      (fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
+        new Error("Fallback down")
+      );
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "All providers down", tier: "premium" })
+        .expect(503);
+
+      expect(response.body).toHaveProperty("message");
+      expect(response.body.message).toContain("All TTS providers failed");
+    });
+  });
+
+  // ==========================================
+  // Scenario 4: WebSocket Streaming Transcription
+  // ==========================================
+  describe("Scenario 4: WebSocket Streaming Transcription", () => {
+    interface MockSocket {
+      id: string;
+      join: ReturnType<typeof vi.fn>;
+      leave: ReturnType<typeof vi.fn>;
+      emit: ReturnType<typeof vi.fn>;
+      disconnect: ReturnType<typeof vi.fn>;
+      data: { userId?: string; workspaceId?: string };
+      handshake: {
+        auth: Record<string, string>;
+        query: Record<string, string>;
+        headers: Record<string, string>;
+      };
+    }
+
+    function createTestSocket(overrides?: Partial<MockSocket>): MockSocket {
+      return {
+        id: "e2e-test-socket",
+        join: vi.fn(),
+        leave: vi.fn(),
+        emit: vi.fn(),
+        disconnect: vi.fn(),
+        data: {},
+        handshake: {
+          auth: { token:
"valid-token" },
+          query: {},
+          headers: {},
+        },
+        ...overrides,
+      };
+    }
+
+    it("should complete the full streaming transcription lifecycle", async () => {
+      const client = createTestSocket();
+      // Authenticate the client
+      await speechGateway.handleConnection(client as never);
+
+      expect(client.data.userId).toBe(MOCK_USER_ID);
+      expect(client.data.workspaceId).toBe(MOCK_WORKSPACE_ID);
+      expect(client.disconnect).not.toHaveBeenCalled();
+
+      // Start transcription session
+      speechGateway.handleStartTranscription(client as never, { language: "en" });
+
+      expect(client.emit).toHaveBeenCalledWith(
+        "transcription-started",
+        expect.objectContaining({ sessionId: "e2e-test-socket" })
+      );
+
+      // Send audio chunks
+      const chunk1 = Buffer.from("audio-data-chunk-1");
+      const chunk2 = Buffer.from("audio-data-chunk-2");
+      const chunk3 = Buffer.from("audio-data-chunk-3");
+
+      speechGateway.handleAudioChunk(client as never, chunk1);
+      speechGateway.handleAudioChunk(client as never, chunk2);
+      speechGateway.handleAudioChunk(client as never, chunk3);
+
+      // No errors should have been emitted for chunks
+      const errorCalls = client.emit.mock.calls.filter(
+        (call: unknown[]) => call[0] === "transcription-error"
+      );
+      expect(errorCalls).toHaveLength(0);
+
+      vi.clearAllMocks();
+      (mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
+        MOCK_TRANSCRIPTION_RESULT
+      );
+
+      // Stop transcription - should trigger the full transcription pipeline
+      await speechGateway.handleStopTranscription(client as never);
+
+      // Verify transcription was called with concatenated audio
+      expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
+        expect.any(Buffer),
+        expect.objectContaining({ language: "en" })
+      );
+
+      // Verify the final result was emitted
+      expect(client.emit).toHaveBeenCalledWith(
+        "transcription-final",
+        expect.objectContaining({
+          text: MOCK_TRANSCRIPTION_RESULT.text,
+          language: "en",
+          durationSeconds: 3.2,
+          confidence: 0.97,
+        })
+      );
+    });
+
+    it("should clean up
session on disconnect", async () => { + const client = createTestSocket({ id: "disconnect-test" }); + await speechGateway.handleConnection(client as never); + + speechGateway.handleStartTranscription(client as never, {}); + speechGateway.handleAudioChunk(client as never, Buffer.from("data")); + + // Disconnect + speechGateway.handleDisconnect(client as never); + + // Trying to send more chunks should fail (session cleaned up) + vi.clearAllMocks(); + speechGateway.handleAudioChunk(client as never, Buffer.from("more-data")); + + expect(client.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ + message: expect.stringContaining("No active transcription session"), + }) + ); + }); + + it("should reject unauthenticated WebSocket clients", async () => { + const client = createTestSocket({ + id: "unauth-ws-client", + handshake: { auth: {}, query: {}, headers: {} }, + }); + + await speechGateway.handleConnection(client as never); + + expect(client.disconnect).toHaveBeenCalled(); + expect(client.data.userId).toBeUndefined(); + }); + }); + + // ========================================== + // Scenario 5: Audio Validation (Invalid MIME Type) + // ========================================== + describe("Scenario 5: Audio Validation", () => { + it("should reject files with unsupported MIME types", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", Buffer.from("not-audio"), { + filename: "document.pdf", + contentType: "application/pdf", + }) + .expect(400); + + expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("Unsupported audio format"); + expect(response.body.message).toContain("application/pdf"); + }); + + it("should reject files with text/plain MIME type", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer 
test-token") + .attach("file", Buffer.from("plain text content"), { + filename: "notes.txt", + contentType: "text/plain", + }) + .expect(400); + + expect(response.body.message).toContain("Unsupported audio format"); + }); + + it("should reject video MIME types", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", Buffer.from("video-data"), { + filename: "video.mp4", + contentType: "video/mp4", + }) + .expect(400); + + expect(response.body.message).toContain("Unsupported audio format"); + }); + + it("should accept valid audio MIME types", async () => { + const validMimeTypes = [ + { mime: "audio/wav", ext: "wav" }, + { mime: "audio/mpeg", ext: "mp3" }, + { mime: "audio/webm", ext: "webm" }, + { mime: "audio/ogg", ext: "ogg" }, + { mime: "audio/flac", ext: "flac" }, + ]; + + for (const { mime, ext } of validMimeTypes) { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", TEST_AUDIO_BUFFER, { + filename: `test.${ext}`, + contentType: mime, + }) + .expect(201); + + expect(response.body).toHaveProperty("data"); + expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text); + } + }); + }); + + // ========================================== + // Scenario 6: File Size Limits + // ========================================== + describe("Scenario 6: File Size Limits", () => { + it("should reject files exceeding the maximum upload size (25 MB)", async () => { + // Create a buffer slightly over the 25 MB limit + const oversizedBuffer = Buffer.alloc(25_000_001, 0); + + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", oversizedBuffer, { + filename: "large-audio.wav", + contentType: "audio/wav", + }) + .expect(400); + + 
expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("exceeds maximum allowed size"); + }); + + it("should accept files within the size limit", async () => { + // Create a buffer at the exact limit + const maxBuffer = Buffer.alloc(1024, 0); + + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", maxBuffer, { + filename: "acceptable-audio.wav", + contentType: "audio/wav", + }) + .expect(201); + + expect(response.body).toHaveProperty("data"); + }); + }); + + // ========================================== + // Scenario 7: Authentication + // ========================================== + describe("Scenario 7: Authentication", () => { + it("should reject POST /speech/transcribe without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .attach("file", TEST_AUDIO_BUFFER, { + filename: "test.wav", + contentType: "audio/wav", + }) + .expect(401); + + expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject POST /speech/synthesize without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .send({ text: "Hello" }) + .expect(401); + + expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject GET /speech/voices without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices") + .expect(401); + + expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject GET /speech/health without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/health") + .expect(401); + + 
expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject requests with an invalid token", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices") + .set("Authorization", "Bearer invalid-token-xyz") + .expect(401); + + expect(response.body.message).toContain("No authentication token provided"); + }); + }); + + // ========================================== + // Scenario 8: Voice Listing + // ========================================== + describe("Scenario 8: Voice Listing (GET /speech/voices)", () => { + it("should return all voices when no tier filter is provided", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices") + .set("Authorization", "Bearer test-token") + .expect(200); + + expect(response.body).toHaveProperty("data"); + expect(Array.isArray(response.body.data)).toBe(true); + + // Should have voices from all providers that returned voices + const voices = response.body.data as VoiceInfo[]; + expect(voices.length).toBeGreaterThan(0); + + // Verify voice structure + for (const voice of voices) { + expect(voice).toHaveProperty("id"); + expect(voice).toHaveProperty("name"); + expect(voice).toHaveProperty("tier"); + } + }); + + it("should filter voices by tier when tier query param is provided", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices?tier=default") + .set("Authorization", "Bearer test-token") + .expect(200); + + const voices = response.body.data as VoiceInfo[]; + expect(voices.length).toBeGreaterThan(0); + + for (const voice of voices) { + expect(voice.tier).toBe("default"); + } + + expect(defaultTTSProvider.listVoices).toHaveBeenCalled(); + }); + + it("should return empty array for tier with no voices", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices?tier=fallback") + .set("Authorization", "Bearer test-token") + 
.expect(200); + + expect(response.body.data).toEqual([]); + }); + + it("should include voice metadata (id, name, language, tier, isDefault)", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices?tier=default") + .set("Authorization", "Bearer test-token") + .expect(200); + + const voices = response.body.data as VoiceInfo[]; + const defaultVoice = voices.find((v) => v.isDefault === true); + + expect(defaultVoice).toBeDefined(); + expect(defaultVoice).toMatchObject({ + id: "af_heart", + name: "Heart", + language: "en", + tier: "default", + isDefault: true, + }); + }); + }); + + // ========================================== + // Scenario 9: Health Check + // ========================================== + describe("Scenario 9: Health Check (GET /speech/health)", () => { + it("should return health status for both STT and TTS providers", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/health") + .set("Authorization", "Bearer test-token") + .expect(200); + + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("stt"); + expect(response.body.data).toHaveProperty("tts"); + + expect(response.body.data.stt).toHaveProperty("available"); + expect(response.body.data.tts).toHaveProperty("available"); + + // Both should be available since we have mock providers registered and config enabled + expect(response.body.data.stt.available).toBe(true); + expect(response.body.data.tts.available).toBe(true); + }); + + it("should return consistent health check format", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/health") + .set("Authorization", "Bearer test-token") + .expect(200); + + // Verify the response matches the expected shape + expect(response.body).toEqual({ + data: { + stt: { available: expect.any(Boolean) }, + tts: { available: expect.any(Boolean) }, + }, + }); + }); + }); +});