2026-02-17 01:04:47 +00:00
1 changed files with 933 additions and 0 deletions
--- a/apps/api/src/speech/speech.integration.spec.ts
+++ b/apps/api/src/speech/speech.integration.spec.ts
@@ -0,0 +1,933 @@
+/**
+ * Speech Services E2E Integration Tests
+ *
+ * Tests the full speech pipeline from API endpoints through to mocked external providers.
+ * Covers REST transcription, synthesis, provider fallback, WebSocket streaming,
+ * audio validation, file size limits, authentication, voice listing, and health checks.
+ *
+ * Uses NestJS testing module with supertest for HTTP testing and direct gateway
+ * invocation for WebSocket streaming tests.
+ *
+ * Issue #405
+ */
+
+import { describe, it, expect, beforeAll, beforeEach, afterAll, vi } from "vitest";
+import { Test } from "@nestjs/testing";
+import {
+  type INestApplication,
+  type CanActivate,
+  type ExecutionContext,
+  UnauthorizedException,
+  ValidationPipe,
+} from "@nestjs/common";
+import request from "supertest";
+import type { App } from "supertest/types";
+
+import { SpeechController } from "./speech.controller";
+import { SpeechService } from "./speech.service";
+import { SpeechGateway } from "./speech.gateway";
+import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
+import { speechConfig } from "./speech.config";
+import type { SpeechConfig } from "./speech.config";
+import type { ISTTProvider } from "./interfaces/stt-provider.interface";
+import type { ITTSProvider } from "./interfaces/tts-provider.interface";
+import type {
+  TranscriptionResult,
+  SynthesisResult,
+  VoiceInfo,
+  SpeechTier,
+} from "./interfaces/speech-types";
+import { AuthGuard } from "../auth/guards/auth.guard";
+import { WorkspaceGuard, PermissionGuard } from "../common/guards";
+import { AuthService } from "../auth/auth.service";
+import { PrismaService } from "../prisma/prisma.service";
+
+// ==========================================
+// Test Fixtures
+// ==========================================
+
+/**
+ * Placeholder upload payload: 1024 zero bytes.
+ * This is not a real audio file (it carries no WAV/RIFF header). The tests attach
+ * it with an explicit content type and the STT provider is a mock, so the bytes
+ * are never handed to a real transcription backend.
+ */
+const TEST_AUDIO_BUFFER = Buffer.alloc(1024, 0);
+
+// Fixed identifiers reused by the test guards and by the AuthService / Prisma stubs.
+const MOCK_WORKSPACE_ID = "550e8400-e29b-41d4-a716-446655440001";
+const MOCK_USER_ID = "550e8400-e29b-41d4-a716-446655440002";
+
+// User object that TestAuthGuard copies onto authenticated requests.
+const MOCK_USER = {
+  id: MOCK_USER_ID,
+  email: "test@example.com",
+  name: "Test User",
+  workspaceId: MOCK_WORKSPACE_ID,
+};
+
+// Canned result resolved by the mock STT provider's `transcribe`.
+// It contains a single segment spanning the whole 3.2 s duration.
+const MOCK_TRANSCRIPTION_RESULT: TranscriptionResult = {
+  text: "Hello, this is a test transcription.",
+  language: "en",
+  durationSeconds: 3.2,
+  confidence: 0.97,
+  segments: [
+    { text: "Hello, this is a test transcription.", start: 0, end: 3.2, confidence: 0.97 },
+  ],
+};
+
+/**
+ * Canned synthesis result. The TTS provider mocks resolve with a copy of this
+ * object in which `tier` is replaced by the provider's own tier.
+ */
+const MOCK_SYNTHESIS_RESULT: SynthesisResult = {
+  audio: Buffer.from("fake-synthesized-audio-data-mp3"),
+  format: "mp3",
+  voice: "af_heart",
+  // The SynthesisResult annotation already checks this literal; no type assertion needed.
+  tier: "default",
+  durationSeconds: 2.1,
+};
+
+// Voice catalogue shared by the TTS mocks: two "default" voices and one "premium" voice.
+// No entry has tier "fallback", so filtering by that tier yields an empty list.
+const MOCK_VOICES: VoiceInfo[] = [
+  { id: "af_heart", name: "Heart", language: "en", tier: "default", isDefault: true },
+  { id: "af_sky", name: "Sky", language: "en", tier: "default", isDefault: false },
+  {
+    id: "chatterbox-default",
+    name: "Chatterbox",
+    language: "en",
+    tier: "premium",
+    isDefault: true,
+  },
+];
+
+// Speech configuration injected under `speechConfig.KEY` in the testing module.
+// STT and all three TTS tiers are enabled; the URLs are placeholders because the
+// providers themselves are replaced by mocks.
+const MOCK_SPEECH_CONFIG: SpeechConfig = {
+  stt: {
+    enabled: true,
+    baseUrl: "http://speaches:8000/v1",
+    model: "test-model",
+    language: "en",
+  },
+  tts: {
+    default: { enabled: true, url: "http://kokoro:8880/v1", voice: "af_heart", format: "mp3" },
+    premium: { enabled: true, url: "http://chatterbox:8881/v1" },
+    fallback: { enabled: true, url: "http://openedai:8000/v1" },
+  },
+  limits: {
+    // 25_000_000 bytes; the file-size scenario uploads one byte more than this.
+    maxUploadSize: 25_000_000,
+    maxDurationSeconds: 600,
+    maxTextLength: 4096,
+  },
+};
+
+// ==========================================
+// Mock Providers
+// ==========================================
+
+/**
+ * Builds an STT provider stub named "mock-stt".
+ *
+ * @returns a provider whose `transcribe` resolves with MOCK_TRANSCRIPTION_RESULT
+ *          and whose `isHealthy` resolves with `true`.
+ */
+function createMockSTTProvider(): ISTTProvider {
+  const transcribe = vi.fn().mockResolvedValue(MOCK_TRANSCRIPTION_RESULT);
+  const isHealthy = vi.fn().mockResolvedValue(true);
+
+  return { name: "mock-stt", transcribe, isHealthy };
+}
+
+/**
+ * Builds a TTS provider stub for one tier.
+ *
+ * @param tier - tier reported by the stub and stamped onto its synthesis result
+ * @param name - provider name reported by the stub
+ * @returns a provider whose `synthesize` resolves with MOCK_SYNTHESIS_RESULT (with
+ *          `tier` overridden), whose `listVoices` resolves with the MOCK_VOICES
+ *          entries of that tier, and whose `isHealthy` resolves with `true`.
+ */
+function createMockTTSProvider(tier: SpeechTier, name: string): ITTSProvider {
+  const tierVoices = MOCK_VOICES.filter((voice) => voice.tier === tier);
+  const tierResult = { ...MOCK_SYNTHESIS_RESULT, tier };
+
+  const synthesize = vi.fn().mockResolvedValue(tierResult);
+  const listVoices = vi.fn().mockResolvedValue(tierVoices);
+  const isHealthy = vi.fn().mockResolvedValue(true);
+
+  return { name, tier, synthesize, listVoices, isHealthy };
+}
+
+// ==========================================
+// Test Guards
+// ==========================================
+
+/**
+ * Stand-in for the real AuthGuard.
+ *
+ * A request is authenticated when it carries either the header
+ * `Authorization: Bearer test-token` or a `better-auth.session_token` cookie equal
+ * to `test-token`; a copy of MOCK_USER is then attached as `req.user`.
+ * Every other request is rejected with UnauthorizedException.
+ */
+class TestAuthGuard implements CanActivate {
+  canActivate(context: ExecutionContext): boolean {
+    const httpReq = context.switchToHttp().getRequest<{
+      headers: Record<string, string | undefined>;
+      user?: typeof MOCK_USER;
+      cookies?: Record<string, string>;
+    }>();
+
+    const hasBearerToken = httpReq.headers.authorization === "Bearer test-token";
+    const hasSessionCookie = httpReq.cookies?.["better-auth.session_token"] === "test-token";
+
+    // Guard clause: anything without one of the two known credentials is refused.
+    if (!hasBearerToken && !hasSessionCookie) {
+      throw new UnauthorizedException("No authentication token provided");
+    }
+
+    httpReq.user = { ...MOCK_USER };
+    return true;
+  }
+}
+
+/**
+ * Stand-in for the real WorkspaceGuard.
+ *
+ * Attaches `req.workspace = { id }` where `id` is the `x-workspace-id` request
+ * header when present, otherwise MOCK_WORKSPACE_ID. Always allows the request.
+ */
+class TestWorkspaceGuard implements CanActivate {
+  canActivate(context: ExecutionContext): boolean {
+    const req = context.switchToHttp().getRequest<{
+      workspace?: { id: string };
+      headers: Record<string, string | undefined>;
+    }>();
+    // `??` with a string fallback already yields `string`, so no type assertion is required.
+    const workspaceId = req.headers["x-workspace-id"] ?? MOCK_WORKSPACE_ID;
+    req.workspace = { id: workspaceId };
+    return true;
+  }
+}
+
+/**
+ * Stand-in for the real PermissionGuard.
+ * Performs no checks: `canActivate` unconditionally returns `true`, so permission
+ * handling is effectively disabled for every route in this suite.
+ */
+class TestPermissionGuard implements CanActivate {
+  canActivate(): boolean {
+    return true;
+  }
+}
+
+// ==========================================
+// Tests
+// ==========================================
+
+describe("Speech Services E2E Integration", () => {
+  // Nest application under test; created in beforeAll and closed in afterAll.
+  let app: INestApplication;
+  // Provider mocks, created once in beforeAll and re-primed in beforeEach.
+  let mockSTTProvider: ISTTProvider;
+  let defaultTTSProvider: ITTSProvider;
+  let premiumTTSProvider: ITTSProvider;
+  let fallbackTTSProvider: ITTSProvider;
+  // Tier -> provider map injected under TTS_PROVIDERS.
+  let ttsProvidersMap: Map<SpeechTier, ITTSProvider>;
+
+  // WebSocket gateway test dependencies
+  let speechGateway: SpeechGateway;
+  // NOTE(review): assigned in beforeAll but not read by any test in this file.
+  let mockSpeechService: SpeechService;
+
+  /**
+   * Builds the Nest testing module once for the whole suite: real SpeechController
+   * and SpeechService, mocked STT/TTS providers, stubbed AuthService/PrismaService
+   * for the gateway, and the three route guards replaced by their test doubles.
+   */
+  beforeAll(async () => {
+    // One mock per external speech backend.
+    mockSTTProvider = createMockSTTProvider();
+    defaultTTSProvider = createMockTTSProvider("default", "mock-kokoro");
+    premiumTTSProvider = createMockTTSProvider("premium", "mock-chatterbox");
+    fallbackTTSProvider = createMockTTSProvider("fallback", "mock-piper");
+
+    ttsProvidersMap = new Map<SpeechTier, ITTSProvider>();
+    ttsProvidersMap.set("default", defaultTTSProvider);
+    ttsProvidersMap.set("premium", premiumTTSProvider);
+    ttsProvidersMap.set("fallback", fallbackTTSProvider);
+
+    // Stubs for the collaborators the gateway receives (needed for DI, not hit over HTTP).
+    const authServiceStub = {
+      verifySession: vi.fn().mockResolvedValue({
+        user: { id: MOCK_USER_ID, email: "test@example.com", name: "Test User" },
+        session: { id: "test-session" },
+      }),
+    };
+    const prismaStub = {
+      workspaceMember: {
+        findFirst: vi.fn().mockResolvedValue({
+          userId: MOCK_USER_ID,
+          workspaceId: MOCK_WORKSPACE_ID,
+          role: "MEMBER",
+        }),
+      },
+    };
+
+    // The gateway is constructed by hand from the injected collaborators.
+    const buildGateway = (
+      authService: AuthService,
+      prisma: PrismaService,
+      speechService: SpeechService,
+      config: SpeechConfig
+    ): SpeechGateway => new SpeechGateway(authService, prisma, speechService, config);
+
+    const moduleBuilder = Test.createTestingModule({
+      controllers: [SpeechController],
+      providers: [
+        SpeechService,
+        { provide: speechConfig.KEY, useValue: MOCK_SPEECH_CONFIG },
+        { provide: STT_PROVIDER, useValue: mockSTTProvider },
+        { provide: TTS_PROVIDERS, useValue: ttsProvidersMap },
+        {
+          provide: SpeechGateway,
+          useFactory: buildGateway,
+          inject: [AuthService, PrismaService, SpeechService, speechConfig.KEY],
+        },
+        { provide: AuthService, useValue: authServiceStub },
+        { provide: PrismaService, useValue: prismaStub },
+      ],
+    });
+
+    const testingModule = await moduleBuilder
+      .overrideGuard(AuthGuard)
+      .useClass(TestAuthGuard)
+      .overrideGuard(WorkspaceGuard)
+      .useClass(TestWorkspaceGuard)
+      .overrideGuard(PermissionGuard)
+      .useClass(TestPermissionGuard)
+      .compile();
+
+    app = testingModule.createNestApplication();
+    app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true }));
+    await app.init();
+
+    // Keep handles on the gateway and service for the WebSocket scenarios.
+    speechGateway = testingModule.get(SpeechGateway);
+    mockSpeechService = testingModule.get(SpeechService);
+  });
+
+  /**
+   * Clears recorded calls on every mock and restores the default resolved values,
+   * so a test that made a provider reject or return custom data cannot leak into
+   * the next one.
+   */
+  beforeEach(() => {
+    vi.clearAllMocks();
+
+    const asMock = (fn: unknown): ReturnType<typeof vi.fn> => fn as ReturnType<typeof vi.fn>;
+
+    asMock(mockSTTProvider.transcribe).mockResolvedValue(MOCK_TRANSCRIPTION_RESULT);
+
+    // Each tier's provider answers with the shared fixtures, stamped/filtered by its own tier.
+    // MOCK_VOICES has no "fallback" entries, so that tier lists an empty array.
+    for (const [tier, provider] of ttsProvidersMap) {
+      asMock(provider.synthesize).mockResolvedValue({ ...MOCK_SYNTHESIS_RESULT, tier });
+      asMock(provider.listVoices).mockResolvedValue(
+        MOCK_VOICES.filter((voice) => voice.tier === tier)
+      );
+    }
+  });
+
+  // Shut the Nest application down; `app` may be unset if beforeAll failed early.
+  afterAll(async () => {
+    if (!app) {
+      return;
+    }
+    await app.close();
+  });
+
+  // ==========================================
+  // Scenario 1: REST Transcription
+  // ==========================================
+  describe("Scenario 1: REST Transcription (POST /speech/transcribe)", () => {
+    // Happy path: an authenticated WAV upload returns the mocked transcription wrapped in `data`.
+    it("should transcribe an uploaded audio file and return the transcription result", async () => {
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "test.wav", contentType: "audio/wav" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", TEST_AUDIO_BUFFER, upload)
+        .expect(201);
+
+      const { text, language, durationSeconds, confidence } = MOCK_TRANSCRIPTION_RESULT;
+
+      expect(res.body).toHaveProperty("data");
+      expect(res.body.data).toMatchObject({ text, language, durationSeconds, confidence });
+      expect(res.body.data.segments).toBeDefined();
+      expect(res.body.data.segments).toHaveLength(1);
+
+      // The provider must receive a Buffer plus options carrying the declared MIME type.
+      expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
+        expect.any(Buffer),
+        expect.objectContaining({ mimeType: "audio/wav" })
+      );
+    });
+
+    // Multipart text fields must reach the provider options; `temperature` is sent as
+    // the string "0.3" and is expected to arrive as the number 0.3.
+    it("should pass optional transcription parameters to the service", async () => {
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "test.mp3", contentType: "audio/mpeg" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", TEST_AUDIO_BUFFER, upload)
+        .field("language", "fr")
+        .field("model", "whisper-large-v3")
+        .field("prompt", "Meeting transcript")
+        .field("temperature", "0.3")
+        .expect(201);
+
+      expect(res.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);
+
+      const expectedOptions = expect.objectContaining({
+        mimeType: "audio/mpeg",
+        language: "fr",
+        model: "whisper-large-v3",
+        prompt: "Meeting transcript",
+        temperature: 0.3,
+      });
+      expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(expect.any(Buffer), expectedOptions);
+    });
+
+    // An authenticated request with no multipart file part is a client error.
+    it("should reject request without an audio file", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .expect(400);
+
+      expect(res.body).toHaveProperty("message");
+    });
+  });
+
+  // ==========================================
+  // Scenario 2: REST Synthesis
+  // ==========================================
+  describe("Scenario 2: REST Synthesis (POST /speech/synthesize)", () => {
+    // Happy path: synthesis answers with an MP3 attachment rather than JSON.
+    it("should synthesize text and return audio binary response", async () => {
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "Hello, world!" })
+        .expect(201);
+
+      // Response should be binary audio
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+      expect(response.headers["content-disposition"]).toContain("attachment");
+      expect(response.headers["content-disposition"]).toContain("speech.mp3");
+      expect(response.body).toBeDefined();
+      // Buffer.isBuffer is itself an `instanceof Buffer` check, so one test suffices.
+      expect(Buffer.isBuffer(response.body)).toBe(true);
+    });
+
+    // The default-tier mock is switched to WAV so the response headers can be told
+    // apart from the MP3 default used elsewhere in the suite.
+    it("should pass voice, speed, format, and tier options to the service", async () => {
+      (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
+        audio: Buffer.from("wav-audio-data"),
+        format: "wav",
+        voice: "af_sky",
+        tier: "default",
+        durationSeconds: 1.5,
+      });
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({
+          text: "Test with options",
+          voice: "af_sky",
+          speed: 1.5,
+          format: "wav",
+        })
+        .expect(201);
+
+      expect(response.headers["content-type"]).toContain("audio/wav");
+      expect(response.headers["content-disposition"]).toContain("speech.wav");
+
+      // The headers alone do not show that anything was forwarded; make sure the
+      // request actually reached the default-tier provider.
+      expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
+      // NOTE(review): no `tier` is sent and the forwarded arguments are not matched here.
+      // Add a toHaveBeenCalledWith(...) once the ITTSProvider.synthesize signature is confirmed.
+    });
+
+    // Pins current behaviour: an empty `text` string passes request validation
+    // (the original author notes the DTO has no @IsNotEmpty) and still yields audio.
+    it("should accept empty text (validation delegated to service)", async () => {
+      const server = app.getHttpServer() as App;
+      const payload = { text: "" };
+
+      const res = await request(server)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send(payload)
+        .expect(201);
+
+      expect(res.headers["content-type"]).toContain("audio/mpeg");
+    });
+
+    // A body without the `text` property must be refused with 400.
+    it("should reject missing text field", async () => {
+      const server = app.getHttpServer() as App;
+      const emptyPayload = {};
+
+      await request(server)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send(emptyPayload)
+        .expect(400);
+    });
+  });
+
+  // ==========================================
+  // Scenario 3: Provider Fallback
+  // ==========================================
+  describe("Scenario 3: Provider Fallback", () => {
+    // Premium rejects, default resolves: the request still succeeds, premium is tried
+    // before default, and the fallback tier is never reached.
+    it("should fall back from premium to default when premium fails", async () => {
+      const premiumSynthesize = premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>;
+      const defaultSynthesize = defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>;
+
+      // Make premium provider fail
+      premiumSynthesize.mockRejectedValue(new Error("Premium provider unavailable"));
+
+      // Default provider should succeed
+      defaultSynthesize.mockResolvedValue({
+        audio: Buffer.from("fallback-audio"),
+        format: "mp3",
+        voice: "af_heart",
+        tier: "default",
+      });
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "Fallback test", tier: "premium" })
+        .expect(201);
+
+      expect(premiumSynthesize).toHaveBeenCalled();
+      expect(defaultSynthesize).toHaveBeenCalled();
+      // "Attempted first" is verified via the global invocation order, not just call presence.
+      expect(Math.min(...premiumSynthesize.mock.invocationCallOrder)).toBeLessThan(
+        Math.min(...defaultSynthesize.mock.invocationCallOrder)
+      );
+      // Default succeeded, so the chain must stop before the fallback tier.
+      expect(fallbackTTSProvider.synthesize).not.toHaveBeenCalled();
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+    });
+
+    // Premium and default both reject; the fallback tier resolves. All three must be
+    // tried, in the order premium -> default -> fallback.
+    it("should fall back through entire chain: premium -> default -> fallback", async () => {
+      const premiumSynthesize = premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>;
+      const defaultSynthesize = defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>;
+      const fallbackSynthesize = fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>;
+
+      // Make premium and default fail
+      premiumSynthesize.mockRejectedValue(new Error("Premium down"));
+      defaultSynthesize.mockRejectedValue(new Error("Default down"));
+
+      // Fallback should succeed
+      fallbackSynthesize.mockResolvedValue({
+        audio: Buffer.from("fallback-piper-audio"),
+        format: "mp3",
+        voice: "piper-default",
+        tier: "fallback",
+      });
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "Full fallback chain test", tier: "premium" })
+        .expect(201);
+
+      expect(premiumSynthesize).toHaveBeenCalled();
+      expect(defaultSynthesize).toHaveBeenCalled();
+      expect(fallbackSynthesize).toHaveBeenCalled();
+
+      // Verify the order named in the test title using the global invocation order.
+      const firstPremium = Math.min(...premiumSynthesize.mock.invocationCallOrder);
+      const firstDefault = Math.min(...defaultSynthesize.mock.invocationCallOrder);
+      const firstFallback = Math.min(...fallbackSynthesize.mock.invocationCallOrder);
+      expect(firstPremium).toBeLessThan(firstDefault);
+      expect(firstDefault).toBeLessThan(firstFallback);
+
+      expect(response.headers["content-type"]).toContain("audio/mpeg");
+    });
+
+    // Every tier rejects: the endpoint must answer 503 with an explanatory message.
+    it("should return 503 when all TTS providers fail", async () => {
+      const failures: Array<[ITTSProvider, string]> = [
+        [premiumTTSProvider, "Premium down"],
+        [defaultTTSProvider, "Default down"],
+        [fallbackTTSProvider, "Fallback down"],
+      ];
+      for (const [provider, reason] of failures) {
+        (provider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(new Error(reason));
+      }
+
+      const server = app.getHttpServer() as App;
+      const res = await request(server)
+        .post("/speech/synthesize")
+        .set("Authorization", "Bearer test-token")
+        .send({ text: "All providers down", tier: "premium" })
+        .expect(503);
+
+      expect(res.body).toHaveProperty("message");
+      expect(res.body.message).toContain("All TTS providers failed");
+    });
+  });
+
+  // ==========================================
+  // Scenario 4: WebSocket Streaming Transcription
+  // ==========================================
+  describe("Scenario 4: WebSocket Streaming Transcription", () => {
+    /**
+     * Minimal socket double handed to the gateway handlers (cast with `as never`).
+     * Only the members these tests touch are modelled; every method is a vi.fn()
+     * so calls can be asserted.
+     */
+    interface MockSocket {
+      // Socket id; the lifecycle test expects it to be echoed back as `sessionId`.
+      id: string;
+      join: ReturnType<typeof vi.fn>;
+      leave: ReturnType<typeof vi.fn>;
+      // Records events sent to the client (e.g. "transcription-final").
+      emit: ReturnType<typeof vi.fn>;
+      disconnect: ReturnType<typeof vi.fn>;
+      // Per-connection state; the tests read userId/workspaceId after handleConnection.
+      data: { userId?: string; workspaceId?: string };
+      handshake: {
+        auth: Record<string, unknown>;
+        query: Record<string, unknown>;
+        headers: Record<string, unknown>;
+      };
+    }
+
+    /**
+     * Creates a socket double with fresh vi.fn() methods and a handshake that
+     * carries `auth.token = "valid-token"`.
+     *
+     * @param overrides - top-level properties that replace the defaults (shallow merge)
+     */
+    function createTestSocket(overrides?: Partial<MockSocket>): MockSocket {
+      const defaults: MockSocket = {
+        id: "e2e-test-socket",
+        join: vi.fn(),
+        leave: vi.fn(),
+        emit: vi.fn(),
+        disconnect: vi.fn(),
+        data: {},
+        handshake: {
+          auth: { token: "valid-token" },
+          query: {},
+          headers: {},
+        },
+      };
+
+      return { ...defaults, ...overrides };
+    }
+
+    // Drives connect -> start -> three audio chunks -> stop directly on the gateway
+    // and checks the events emitted to the client at each stage.
+    it("should complete the full streaming transcription lifecycle", async () => {
+      const socket = createTestSocket();
+
+      // 1. Connect: the gateway authenticates and records user/workspace on the socket.
+      await speechGateway.handleConnection(socket as never);
+
+      expect(socket.data.userId).toBe(MOCK_USER_ID);
+      expect(socket.data.workspaceId).toBe(MOCK_WORKSPACE_ID);
+      expect(socket.disconnect).not.toHaveBeenCalled();
+
+      // 2. Start: the session id reported back is the socket id.
+      speechGateway.handleStartTranscription(socket as never, { language: "en" });
+
+      expect(socket.emit).toHaveBeenCalledWith(
+        "transcription-started",
+        expect.objectContaining({ sessionId: "e2e-test-socket" })
+      );
+
+      // 3. Stream three chunks; none of them may produce an error event.
+      const chunks = ["audio-data-chunk-1", "audio-data-chunk-2", "audio-data-chunk-3"].map(
+        (payload) => Buffer.from(payload)
+      );
+      for (const chunk of chunks) {
+        speechGateway.handleAudioChunk(socket as never, chunk);
+      }
+
+      const emittedErrors = socket.emit.mock.calls.filter(
+        (args: unknown[]) => args[0] === "transcription-error"
+      );
+      expect(emittedErrors).toHaveLength(0);
+
+      // Forget the calls recorded so far, then re-prime the STT mock for the final step.
+      vi.clearAllMocks();
+      (mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
+        MOCK_TRANSCRIPTION_RESULT
+      );
+
+      // 4. Stop: the buffered audio goes to the provider and the final result is emitted.
+      await speechGateway.handleStopTranscription(socket as never);
+
+      expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
+        expect.any(Buffer),
+        expect.objectContaining({ language: "en" })
+      );
+
+      const expectedFinal = expect.objectContaining({
+        text: MOCK_TRANSCRIPTION_RESULT.text,
+        language: "en",
+        durationSeconds: 3.2,
+        confidence: 0.97,
+      });
+      expect(socket.emit).toHaveBeenCalledWith("transcription-final", expectedFinal);
+    });
+
+    // After handleDisconnect the session is gone, so a further chunk must be
+    // answered with a "No active transcription session" error event.
+    it("should clean up session on disconnect", async () => {
+      const socket = createTestSocket({ id: "disconnect-test" });
+      const gatewayClient = socket as never;
+
+      await speechGateway.handleConnection(gatewayClient);
+      speechGateway.handleStartTranscription(gatewayClient, {});
+      speechGateway.handleAudioChunk(gatewayClient, Buffer.from("data"));
+
+      speechGateway.handleDisconnect(gatewayClient);
+
+      // Drop earlier emit calls so only the post-disconnect chunk is observed.
+      vi.clearAllMocks();
+      speechGateway.handleAudioChunk(gatewayClient, Buffer.from("more-data"));
+
+      const noSessionError = expect.objectContaining({
+        message: expect.stringContaining("No active transcription session"),
+      });
+      expect(socket.emit).toHaveBeenCalledWith("transcription-error", noSessionError);
+    });
+
+    // A handshake with no token anywhere (auth, query, headers all empty) is
+    // disconnected and never gets a userId.
+    it("should reject unauthenticated WebSocket clients", async () => {
+      const emptyHandshake = { auth: {}, query: {}, headers: {} };
+      const socket = createTestSocket({ id: "unauth-ws-client", handshake: emptyHandshake });
+
+      await speechGateway.handleConnection(socket as never);
+
+      expect(socket.disconnect).toHaveBeenCalled();
+      expect(socket.data.userId).toBeUndefined();
+    });
+  });
+
+  // ==========================================
+  // Scenario 5: Audio Validation (Invalid MIME Type)
+  // ==========================================
+  describe("Scenario 5: Audio Validation", () => {
+    // A PDF upload is refused with 400 and the message names the offending MIME type.
+    it("should reject files with unsupported MIME types", async () => {
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "document.pdf", contentType: "application/pdf" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", Buffer.from("not-audio"), upload)
+        .expect(400);
+
+      const { body } = res;
+      expect(body).toHaveProperty("message");
+      expect(body.message).toContain("Unsupported audio format");
+      expect(body.message).toContain("application/pdf");
+    });
+
+    // Plain-text uploads are refused with the same "Unsupported audio format" error.
+    it("should reject files with text/plain MIME type", async () => {
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "notes.txt", contentType: "text/plain" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", Buffer.from("plain text content"), upload)
+        .expect(400);
+
+      expect(res.body.message).toContain("Unsupported audio format");
+    });
+
+    // Video containers are not accepted as audio input.
+    it("should reject video MIME types", async () => {
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "video.mp4", contentType: "video/mp4" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", Buffer.from("video-data"), upload)
+        .expect(400);
+
+      expect(res.body.message).toContain("Unsupported audio format");
+    });
+
+    // Uploads the same placeholder payload once per supported MIME type, sequentially,
+    // and expects each to be transcribed.
+    it("should accept valid audio MIME types", async () => {
+      const extensionByMime: Record<string, string> = {
+        "audio/wav": "wav",
+        "audio/mpeg": "mp3",
+        "audio/webm": "webm",
+        "audio/ogg": "ogg",
+        "audio/flac": "flac",
+      };
+      const server = app.getHttpServer() as App;
+
+      for (const [mime, ext] of Object.entries(extensionByMime)) {
+        const upload = { filename: `test.${ext}`, contentType: mime };
+
+        const res = await request(server)
+          .post("/speech/transcribe")
+          .set("Authorization", "Bearer test-token")
+          .attach("file", TEST_AUDIO_BUFFER, upload)
+          .expect(201);
+
+        expect(res.body).toHaveProperty("data");
+        expect(res.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);
+      }
+    });
+  });
+
+  // ==========================================
+  // Scenario 6: File Size Limits
+  // ==========================================
+  describe("Scenario 6: File Size Limits", () => {
+    // Uploads one byte more than the configured limit and expects a 400.
+    it("should reject files exceeding the maximum upload size (25 MB)", async () => {
+      // Derive the size from the injected config so the test and the limit cannot drift apart.
+      const oversizedBuffer = Buffer.alloc(MOCK_SPEECH_CONFIG.limits.maxUploadSize + 1, 0);
+
+      const response = await request(app.getHttpServer() as App)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", oversizedBuffer, {
+          filename: "large-audio.wav",
+          contentType: "audio/wav",
+        })
+        .expect(400);
+
+      expect(response.body).toHaveProperty("message");
+      expect(response.body.message).toContain("exceeds maximum allowed size");
+    });
+
+    // A small (1 KiB) upload, far below the limit, is accepted.
+    // NOTE(review): this does not exercise the boundary itself (a file of exactly
+    // maxUploadSize bytes); add that case if the inclusive limit matters.
+    it("should accept files within the size limit", async () => {
+      const smallBuffer = Buffer.alloc(1024, 0);
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "acceptable-audio.wav", contentType: "audio/wav" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .set("Authorization", "Bearer test-token")
+        .attach("file", smallBuffer, upload)
+        .expect(201);
+
+      expect(res.body).toHaveProperty("data");
+    });
+  });
+
+  // ==========================================
+  // Scenario 7: Authentication
+  // ==========================================
+  describe("Scenario 7: Authentication", () => {
+    // No Authorization header and no session cookie: the test auth guard answers 401.
+    it("should reject POST /speech/transcribe without authentication", async () => {
+      const server = app.getHttpServer() as App;
+      const upload = { filename: "test.wav", contentType: "audio/wav" };
+
+      const res = await request(server)
+        .post("/speech/transcribe")
+        .attach("file", TEST_AUDIO_BUFFER, upload)
+        .expect(401);
+
+      expect(res.body).toHaveProperty("message");
+      expect(res.body.message).toContain("No authentication token provided");
+    });
+
+    // The synthesis endpoint is guarded as well.
+    it("should reject POST /speech/synthesize without authentication", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .post("/speech/synthesize")
+        .send({ text: "Hello" })
+        .expect(401);
+
+      expect(res.body.message).toContain("No authentication token provided");
+    });
+
+    // The voice listing is guarded as well.
+    it("should reject GET /speech/voices without authentication", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server).get("/speech/voices").expect(401);
+
+      expect(res.body.message).toContain("No authentication token provided");
+    });
+
+    // The health endpoint is guarded as well.
+    it("should reject GET /speech/health without authentication", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server).get("/speech/health").expect(401);
+
+      expect(res.body.message).toContain("No authentication token provided");
+    });
+
+    // A bearer token other than "test-token" is treated exactly like a missing one:
+    // TestAuthGuard has a single rejection path, hence the same message.
+    it("should reject requests with an invalid token", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/voices")
+        .set("Authorization", "Bearer invalid-token-xyz")
+        .expect(401);
+
+      expect(res.body.message).toContain("No authentication token provided");
+    });
+  });
+
+  // ==========================================
+  // Scenario 8: Voice Listing
+  // ==========================================
+  describe("Scenario 8: Voice Listing (GET /speech/voices)", () => {
+    // Without a tier filter the endpoint returns a non-empty array whose entries
+    // each expose id, name and tier.
+    it("should return all voices when no tier filter is provided", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/voices")
+        .set("Authorization", "Bearer test-token")
+        .expect(200);
+
+      expect(res.body).toHaveProperty("data");
+      expect(Array.isArray(res.body.data)).toBe(true);
+
+      const listed = res.body.data as VoiceInfo[];
+      expect(listed.length).toBeGreaterThan(0);
+
+      const requiredKeys = ["id", "name", "tier"];
+      for (const entry of listed) {
+        for (const key of requiredKeys) {
+          expect(entry).toHaveProperty(key);
+        }
+      }
+    });
+
+    // `?tier=default` must return only default-tier voices and consult the default provider.
+    it("should filter voices by tier when tier query param is provided", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/voices?tier=default")
+        .set("Authorization", "Bearer test-token")
+        .expect(200);
+
+      const listed = res.body.data as VoiceInfo[];
+      expect(listed.length).toBeGreaterThan(0);
+
+      listed.forEach((entry) => {
+        expect(entry.tier).toBe("default");
+      });
+
+      expect(defaultTTSProvider.listVoices).toHaveBeenCalled();
+    });
+
+    // The fallback provider's listVoices mock resolves with [], so the endpoint
+    // must return an empty list rather than an error.
+    it("should return empty array for tier with no voices", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/voices?tier=fallback")
+        .set("Authorization", "Bearer test-token")
+        .expect(200);
+
+      expect(res.body.data).toEqual([]);
+    });
+
+    // The default voice of the default tier must carry the full metadata from MOCK_VOICES.
+    it("should include voice metadata (id, name, language, tier, isDefault)", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/voices?tier=default")
+        .set("Authorization", "Bearer test-token")
+        .expect(200);
+
+      const listed = res.body.data as VoiceInfo[];
+      const flaggedDefault = listed.find((entry) => entry.isDefault === true);
+
+      const expectedVoice = {
+        id: "af_heart",
+        name: "Heart",
+        language: "en",
+        tier: "default",
+        isDefault: true,
+      };
+      expect(flaggedDefault).toBeDefined();
+      expect(flaggedDefault).toMatchObject(expectedVoice);
+    });
+  });
+
+  // ==========================================
+  // Scenario 9: Health Check
+  // ==========================================
+  describe("Scenario 9: Health Check (GET /speech/health)", () => {
+    // With mock providers registered and every tier enabled in the config, both
+    // STT and TTS must report as available.
+    it("should return health status for both STT and TTS providers", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/health")
+        .set("Authorization", "Bearer test-token")
+        .expect(200);
+
+      expect(res.body).toHaveProperty("data");
+      const health = res.body.data;
+
+      expect(health).toHaveProperty("stt");
+      expect(health).toHaveProperty("tts");
+
+      expect(health.stt).toHaveProperty("available");
+      expect(health.tts).toHaveProperty("available");
+
+      expect(health.stt.available).toBe(true);
+      expect(health.tts.available).toBe(true);
+    });
+
+    // Pins the exact response shape: `data` holds only `stt` and `tts`, each with a
+    // single boolean `available` flag.
+    it("should return consistent health check format", async () => {
+      const server = app.getHttpServer() as App;
+
+      const res = await request(server)
+        .get("/speech/health")
+        .set("Authorization", "Bearer test-token")
+        .expect(200);
+
+      const expectedShape = {
+        data: {
+          stt: { available: expect.any(Boolean) },
+          tts: { available: expect.any(Boolean) },
+        },
+      };
+      expect(res.body).toEqual(expectedShape);
+    });
+  });
+});