2026-02-17 01:04:47 +00:00
3 changed files with 682 additions and 10 deletions
--- a/apps/api/src/speech/providers/speaches-stt.provider.spec.ts
+++ b/apps/api/src/speech/providers/speaches-stt.provider.spec.ts
@@ -0,0 +1,468 @@
 /**
 * SpeachesSttProvider Tests
 *
 * TDD tests for the Speaches/faster-whisper STT provider.
 * Tests cover transcription, error handling, health checks, and config injection.
 *
 * Issue #390
 */
 import { describe, it, expect, beforeEach, vi } from "vitest";
 import { SpeachesSttProvider } from "./speaches-stt.provider";
 import type { SpeechConfig } from "../speech.config";
 import type { TranscribeOptions } from "../interfaces/speech-types";
 // ==========================================
 // Mock OpenAI SDK
 // ==========================================
 // Shared mock handles. They are created inside vi.hoisted() so that the
 // vi.mock("openai") factory in this file can reference them.
 const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  // Stand-in for the client's audio.transcriptions.create method
  const mockCreate = vi.fn();
  // Stand-in for the client's models.list method
  const mockModelsList = vi.fn();
  // Stand-in for the SDK's toFile helper: wraps the buffer in a File carrying the given name
  const mockToFile = vi.fn().mockImplementation(async (buffer: Buffer, name: string) => {
    return new File([buffer], name);
  });
  // Collects the config object passed to every mocked client constructor call
  const mockOpenAIConstructorCalls: Array<Record<string, unknown>> = [];
  return { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls };
 });
 // Replace the "openai" module with a minimal stand-in: a default-exported client
 // class plus the named toFile helper, all backed by the hoisted mock functions.
 vi.mock("openai", () => {
  class MockOpenAI {
    // Only the transcription endpoint of the audio API is stubbed
    audio = {
      transcriptions: {
        create: mockCreate,
      },
    };
    // Only models.list is stubbed
    models = {
      list: mockModelsList,
    };
    constructor(config: Record<string, unknown>) {
      // Remember the constructor argument so tests can assert on it
      mockOpenAIConstructorCalls.push(config);
    }
  }
  return {
    default: MockOpenAI,
    toFile: mockToFile,
  };
 });
 // ==========================================
 // Test helpers
 // ==========================================
 /**
 * Build a complete SpeechConfig for tests.
 *
 * STT is enabled with fixed defaults, every TTS tier is disabled, and the limits
 * are fixed values. Only the `stt` section can be customised.
 *
 * @param overrides - Optional `stt` fields that replace the defaults
 * @returns A fresh config object
 */
 function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  // Defaults first, so any override given by the caller wins.
  const stt: SpeechConfig["stt"] = {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "Systran/faster-whisper-large-v3-turbo",
    language: "en",
    ...overrides,
  };
  const tts: SpeechConfig["tts"] = {
    default: { enabled: false, url: "", voice: "", format: "" },
    premium: { enabled: false, url: "" },
    fallback: { enabled: false, url: "" },
  };
  const limits: SpeechConfig["limits"] = {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  };
  return { stt, tts, limits };
 }
 /**
 * Build a fake `verbose_json` transcription payload containing one segment.
 *
 * @param overrides - Optional top-level fields that replace the defaults; a key
 *   explicitly set to `undefined` stays present with an `undefined` value
 * @returns A fresh response object on every call
 */
 function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  const text = "Hello, world!";
  const duration = 3.5;
  // Single segment spanning the whole clip, with the per-segment fields of a verbose response.
  const onlySegment = {
    id: 0,
    text,
    start: 0.0,
    end: duration,
    avg_logprob: -0.25,
    compression_ratio: 1.2,
    no_speech_prob: 0.01,
    seek: 0,
    temperature: 0.0,
    tokens: [1, 2, 3],
  };
  const defaults: Record<string, unknown> = {
    text,
    language: "en",
    duration,
    segments: [onlySegment],
  };
  return { ...defaults, ...overrides };
 }
 describe("SpeachesSttProvider", () => {
  let provider: SpeachesSttProvider;
  let config: SpeechConfig;
  beforeEach(() => {
    // Start every test from a clean slate: no recorded mock calls and no
    // remembered client constructor arguments.
    vi.clearAllMocks();
    mockOpenAIConstructorCalls.splice(0);
    // Fresh default config and provider; constructing the provider creates one mocked client.
    config = createTestConfig();
    provider = new SpeachesSttProvider(config);
  });
  // ==========================================
  // Provider identity
  // ==========================================
  describe("name", () => {
    it("should have the name 'speaches'", () => {
      // The provider identifier is read straight off the instance.
      const { name } = provider;
      expect(name).toBe("speaches");
    });
  });
  // ==========================================
  // transcribe
  // ==========================================
  describe("transcribe", () => {
    // Shared driver: queue `response` as the next create() result, then
    // transcribe a fresh fake audio buffer with the given options.
    const run = async (response: Record<string, unknown>, options?: TranscribeOptions) => {
      mockCreate.mockResolvedValueOnce(response);
      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio, options);
      return { audio, result };
    };
    // Params object passed to the first create() call of the current test.
    const sentParams = () => mockCreate.mock.calls[0][0];
    it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
      await run(createMockVerboseResponse());
      expect(mockCreate).toHaveBeenCalledOnce();
      const params = sentParams();
      expect(params.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(params.language).toBe("en");
      expect(params.response_format).toBe("verbose_json");
    });
    it("should convert Buffer to File using toFile", async () => {
      const { audio } = await run(createMockVerboseResponse());
      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
        type: "audio/wav",
      });
    });
    it("should return TranscriptionResult with text and language", async () => {
      const { result } = await run(createMockVerboseResponse());
      expect(result.text).toBe("Hello, world!");
      expect(result.language).toBe("en");
    });
    it("should return durationSeconds from verbose response", async () => {
      const { result } = await run(createMockVerboseResponse({ duration: 5.25 }));
      expect(result.durationSeconds).toBe(5.25);
    });
    it("should map segments from verbose response", async () => {
      // Two consecutive segments; only text/start/end should survive the mapping.
      const firstSegment = {
        id: 0,
        text: "Hello,",
        start: 0.0,
        end: 1.5,
        avg_logprob: -0.2,
        compression_ratio: 1.1,
        no_speech_prob: 0.01,
        seek: 0,
        temperature: 0.0,
        tokens: [1, 2],
      };
      const secondSegment = {
        id: 1,
        text: " world!",
        start: 1.5,
        end: 3.5,
        avg_logprob: -0.3,
        compression_ratio: 1.3,
        no_speech_prob: 0.02,
        seek: 0,
        temperature: 0.0,
        tokens: [3, 4],
      };
      const { result } = await run(
        createMockVerboseResponse({ segments: [firstSegment, secondSegment] })
      );
      expect(result.segments).toHaveLength(2);
      expect(result.segments?.[0]).toEqual({
        text: "Hello,",
        start: 0.0,
        end: 1.5,
      });
      expect(result.segments?.[1]).toEqual({
        text: " world!",
        start: 1.5,
        end: 3.5,
      });
    });
    it("should handle response without segments gracefully", async () => {
      const { result } = await run(createMockVerboseResponse({ segments: undefined }));
      expect(result.text).toBe("Hello, world!");
      expect(result.segments).toBeUndefined();
    });
    it("should handle response without duration gracefully", async () => {
      const { result } = await run(createMockVerboseResponse({ duration: undefined }));
      expect(result.text).toBe("Hello, world!");
      expect(result.durationSeconds).toBeUndefined();
    });
    // ------------------------------------------
    // Per-call options take precedence over config defaults
    // ------------------------------------------
    describe("options override", () => {
      it("should use custom model from options when provided", async () => {
        const options: TranscribeOptions = { model: "custom-whisper-model" };
        await run(createMockVerboseResponse(), options);
        expect(sentParams().model).toBe("custom-whisper-model");
      });
      it("should use custom language from options when provided", async () => {
        const options: TranscribeOptions = { language: "fr" };
        await run(createMockVerboseResponse({ language: "fr" }), options);
        expect(sentParams().language).toBe("fr");
      });
      it("should pass through prompt option", async () => {
        const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };
        await run(createMockVerboseResponse(), options);
        expect(sentParams().prompt).toBe("This is a meeting about project planning.");
      });
      it("should pass through temperature option", async () => {
        const options: TranscribeOptions = { temperature: 0.3 };
        await run(createMockVerboseResponse(), options);
        expect(sentParams().temperature).toBe(0.3);
      });
      it("should use custom mimeType for file conversion when provided", async () => {
        const options: TranscribeOptions = { mimeType: "audio/mp3" };
        const { audio } = await run(createMockVerboseResponse(), options);
        expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
          type: "audio/mp3",
        });
      });
    });
    // ------------------------------------------
    // Text-only responses
    // ------------------------------------------
    describe("simple response fallback", () => {
      it("should handle simple Transcription response (text only, no verbose fields)", async () => {
        // The server may answer with just { text } and none of the verbose fields.
        const { result } = await run({ text: "Simple transcription result." });
        expect(result.text).toBe("Simple transcription result.");
        // No language in the response, so the configured language is reported instead.
        expect(result.language).toBe("en");
        expect(result.durationSeconds).toBeUndefined();
        expect(result.segments).toBeUndefined();
      });
    });
  });
  // ==========================================
  // Error handling
  // ==========================================
  describe("error handling", () => {
    // Make the next create() call reject with `thrown`, then assert that
    // transcribe() rejects with an error containing `expectedMessage`.
    const expectTranscribeFailure = async (
      thrown: unknown,
      expectedMessage: string
    ): Promise<void> => {
      mockCreate.mockRejectedValueOnce(thrown);
      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(expectedMessage);
    };
    it("should throw a descriptive error on connection refused", async () => {
      await expectTranscribeFailure(
        new Error("connect ECONNREFUSED 127.0.0.1:8000"),
        "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000"
      );
    });
    it("should throw a descriptive error on timeout", async () => {
      await expectTranscribeFailure(
        new Error("Request timed out"),
        "STT transcription failed: Request timed out"
      );
    });
    it("should throw a descriptive error on API error", async () => {
      await expectTranscribeFailure(
        new Error("Invalid model: nonexistent-model"),
        "STT transcription failed: Invalid model: nonexistent-model"
      );
    });
    it("should handle non-Error thrown values", async () => {
      // A bare string rejection is stringified into the wrapped message.
      await expectTranscribeFailure(
        "unexpected string error",
        "STT transcription failed: unexpected string error"
      );
    });
  });
  // ==========================================
  // isHealthy
  // ==========================================
  describe("isHealthy", () => {
    it("should return true when the server is reachable", async () => {
      // A successful models.list() call is what counts as healthy.
      mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });
      await expect(provider.isHealthy()).resolves.toBe(true);
    });
    it("should return false when the server is unreachable", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));
      await expect(provider.isHealthy()).resolves.toBe(false);
    });
    it("should not throw on health check failure", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("Network error"));
      // Awaiting directly would fail the test if isHealthy() rejected.
      const healthy = await provider.isHealthy();
      expect(healthy).toBe(false);
    });
    it("should return false on unexpected error types", async () => {
      mockModelsList.mockRejectedValueOnce("string error");
      await expect(provider.isHealthy()).resolves.toBe(false);
    });
  });
  // ==========================================
  // Config injection
  // ==========================================
  describe("config injection", () => {
    // Params object passed to the first create() call of the current test.
    const sentParams = () => mockCreate.mock.calls[0][0];
    it("should create OpenAI client with baseURL from config", () => {
      // beforeEach already constructed exactly one provider, hence one client.
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      const [clientOptions] = mockOpenAIConstructorCalls;
      expect(clientOptions).toEqual(
        expect.objectContaining({
          baseURL: "http://speaches:8000/v1",
        })
      );
    });
    it("should use custom baseURL from config", () => {
      // Drop the call recorded by beforeEach so only the custom provider is counted.
      mockOpenAIConstructorCalls.length = 0;
      new SpeachesSttProvider(
        createTestConfig({
          baseUrl: "http://custom-speaches:9000/v1",
        })
      );
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      const [clientOptions] = mockOpenAIConstructorCalls;
      expect(clientOptions).toEqual(
        expect.objectContaining({
          baseURL: "http://custom-speaches:9000/v1",
        })
      );
    });
    it("should use default model from config for transcription", async () => {
      const customProvider = new SpeachesSttProvider(
        createTestConfig({
          model: "Systran/faster-whisper-small",
        })
      );
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse());
      await customProvider.transcribe(Buffer.from("fake-audio-data"));
      expect(sentParams().model).toBe("Systran/faster-whisper-small");
    });
    it("should use default language from config for transcription", async () => {
      const customProvider = new SpeachesSttProvider(createTestConfig({ language: "de" }));
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ language: "de" }));
      await customProvider.transcribe(Buffer.from("fake-audio-data"));
      expect(sentParams().language).toBe("de");
    });
    it("should set a dummy API key for local Speaches server", () => {
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      const [clientOptions] = mockOpenAIConstructorCalls;
      expect(clientOptions).toEqual(
        expect.objectContaining({
          apiKey: "not-needed",
        })
      );
    });
  });
 });
--- a/apps/api/src/speech/providers/speaches-stt.provider.ts
+++ b/apps/api/src/speech/providers/speaches-stt.provider.ts
@@ -0,0 +1,180 @@
 /**
 * SpeachesSttProvider
 *
 * Speech-to-text provider using Speaches (faster-whisper backend).
 * Connects to the Speaches server via its OpenAI-compatible
 * `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
 *
 * Issue #390
 */
 import { Injectable, Inject, Logger } from "@nestjs/common";
 import OpenAI from "openai";
 import { toFile } from "openai";
 import { speechConfig, type SpeechConfig } from "../speech.config";
 import type { ISTTProvider } from "../interfaces/stt-provider.interface";
 import type {
  TranscribeOptions,
  TranscriptionResult,
  TranscriptionSegment,
 } from "../interfaces/speech-types";
 /**
 * Known audio MIME types (lower-case, without parameters) and the file
 * extension used for each when naming the uploaded file.
 */
 const EXTENSION_BY_MIME_TYPE: ReadonlyMap<string, string> = new Map([
  ["audio/wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/mp3", "mp3"],
  ["audio/mpeg", "mp3"],
  ["audio/mp4", "mp4"],
  ["audio/m4a", "m4a"],
  ["audio/ogg", "ogg"],
  ["audio/flac", "flac"],
  ["audio/webm", "webm"],
  ["audio/mpga", "mpga"],
 ]);
 /**
 * Derive file extension from a MIME type for use in the uploaded file name.
 *
 * Matching is case-insensitive and ignores MIME parameters and surrounding
 * whitespace, so a value such as `audio/webm;codecs=opus` resolves to `webm`
 * rather than falling back to `wav`.
 *
 * @param mimeType - MIME type of the audio payload, optionally with parameters
 * @returns The matching extension without a leading dot, or `"wav"` when the type is not recognised
 */
 function extensionFromMimeType(mimeType: string): string {
  // Keep only the "type/subtype" part; anything after the first ";" is a parameter.
  const [essence = ""] = mimeType.split(";");
  // A Map lookup cannot hit inherited keys such as "constructor", unlike a plain object.
  return EXTENSION_BY_MIME_TYPE.get(essence.trim().toLowerCase()) ?? "wav";
 }
 /**
 * STT provider backed by a Speaches (faster-whisper) server.
 *
 * Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint,
 * so we re-use the official OpenAI SDK with a custom `baseURL`.
 *
 * @example
 * ```typescript
 * const provider = new SpeachesSttProvider(speechConfig);
 * const result = await provider.transcribe(audioBuffer, { language: "en" });
 * console.log(result.text);
 * ```
 */
@Injectable()
 export class SpeachesSttProvider implements ISTTProvider {
  readonly name = "speaches";
  private readonly logger = new Logger(SpeachesSttProvider.name);
  private readonly client: OpenAI;
  private readonly config: SpeechConfig;
  /**
   * Create the provider and its OpenAI SDK client from the injected speech config.
   *
   * @param config - Speech configuration; `config.stt.baseUrl` is used as the client's `baseURL`
   */
  constructor(
    @Inject(speechConfig.KEY)
    config: SpeechConfig
  ) {
    this.config = config;
    const { baseUrl, model } = config.stt;
    // Speaches does not require an API key, so a placeholder value is supplied.
    this.client = new OpenAI({ baseURL: baseUrl, apiKey: "not-needed" });
    this.logger.log(`Speaches STT provider initialized (endpoint: ${baseUrl}, model: ${model})`);
  }
  /**
   * Transcribe an audio buffer to text.
   *
   * The buffer is wrapped in a file named after its MIME type and sent to the
   * transcription endpoint with `response_format: "verbose_json"`, so the
   * response can carry duration and segment data.
   *
   * @param audio - Raw audio data
   * @param options - Optional per-call overrides (model, language, mimeType, prompt, temperature)
   * @returns The transcription text and language, plus duration and segments when present
   * @throws {Error} With message `STT transcription failed: <reason>` if file conversion or the request fails
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    const sttDefaults = this.config.stt;
    const model = options?.model ?? sttDefaults.model;
    const language = options?.language ?? sttDefaults.language;
    const mimeType = options?.mimeType ?? "audio/wav";
    const fileName = `audio.${extensionFromMimeType(mimeType)}`;
    try {
      const file = await toFile(audio, fileName, { type: mimeType });
      // prompt and temperature are only included in the request when the caller set them.
      const promptPart = options?.prompt !== undefined ? { prompt: options.prompt } : {};
      const temperaturePart =
        options?.temperature !== undefined ? { temperature: options.temperature } : {};
      const response = await this.client.audio.transcriptions.create({
        file,
        model,
        language,
        response_format: "verbose_json",
        ...promptPart,
        ...temperaturePart,
      });
      return this.mapResponse(response, language);
    } catch (error: unknown) {
      const reason = this.describeError(error);
      this.logger.error(`Transcription failed: ${reason}`);
      throw new Error(`STT transcription failed: ${reason}`);
    }
  }
  /**
   * Report whether the Speaches server can be reached.
   *
   * Issues a models-list request; any failure is logged as a warning and
   * reported as unhealthy instead of being thrown.
   *
   * @returns true if the request succeeds, false otherwise
   */
  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
    } catch (error: unknown) {
      this.logger.warn(`Speaches health check failed: ${this.describeError(error)}`);
      return false;
    }
    return true;
  }
  /**
   * Turn any thrown value into a message string: an Error's `message`,
   * otherwise the value's string conversion.
   */
  private describeError(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }
  /**
   * Convert an SDK transcription response into a TranscriptionResult.
   *
   * Works for verbose responses (language, duration, segments) as well as
   * text-only responses; fields the response lacks are left off the result.
   *
   * @param response - Response returned by the transcription request
   * @param fallbackLanguage - Language reported when the response carries none
   */
  private mapResponse(
    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
    fallbackLanguage: string
  ): TranscriptionResult {
    const { text, language, duration, segments } = response as {
      text: string;
      language?: string;
      duration?: number;
      segments?: {
        text: string;
        start: number;
        end: number;
      }[];
    };
    const result: TranscriptionResult = {
      text,
      language: language ?? fallbackLanguage,
    };
    if (duration !== undefined) {
      result.durationSeconds = duration;
    }
    // Array.isArray is already false for undefined, so no separate check is needed.
    if (Array.isArray(segments)) {
      // Keep only the text and timing of each segment.
      result.segments = segments.map(
        (piece): TranscriptionSegment => ({
          text: piece.text,
          start: piece.start,
          end: piece.end,
        })
      );
    }
    return result;
  }
 }
--- a/apps/api/src/speech/speech.module.ts
+++ b/apps/api/src/speech/speech.module.ts
@@ -4,36 +4,60 @@
 * NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
 * Provides a provider abstraction layer with graceful fallback for TTS tiers.
 *
 * TTS providers are created dynamically based on configuration:
 * - default: Kokoro-FastAPI (CPU, always available)
 * - premium: Chatterbox (GPU, voice cloning)
 * - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
 *
 * Imports:
 * - ConfigModule.forFeature(speechConfig) for speech configuration
 *
 * Providers:
 * - SpeechService: High-level speech operations with provider selection
- * - TTS_PROVIDERS: Empty Map<SpeechTier, ITTSProvider> (populated by provider modules)
+ * - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
 *
 * Exports:
 * - SpeechService for use by other modules (e.g., controllers, brain)
 *
- * Issue #389
+ * Issue #389, #390, #391
 */
 import { Module, type OnModuleInit, Logger } from "@nestjs/common";
-import { ConfigModule } from "@nestjs/config";
+import { ConfigModule, ConfigService } from "@nestjs/config";
-import { speechConfig, validateSpeechConfig } from "./speech.config";
+import {
  speechConfig,
  validateSpeechConfig,
  isSttEnabled,
  type SpeechConfig,
 } from "./speech.config";
 import { SpeechService } from "./speech.service";
-import { TTS_PROVIDERS } from "./speech.constants";
+import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
-import type { SpeechTier } from "./interfaces/speech-types";
+import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
-import type { ITTSProvider } from "./interfaces/tts-provider.interface";
+import { createTTSProviders } from "./providers/tts-provider.factory";
@Module({
  imports: [ConfigModule.forFeature(speechConfig)],
  providers: [
    SpeechService,
-    // Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
+    // STT provider: conditionally register SpeachesSttProvider when STT is enabled
-    // will register their providers in subsequent tasks.
+    ...(isSttEnabled()
      ? [
          {
            provide: STT_PROVIDER,
            useClass: SpeachesSttProvider,
          },
        ]
      : []),
    {
      provide: TTS_PROVIDERS,
-      useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
+      useFactory: (configService: ConfigService) => {
        const config = configService.get<SpeechConfig>("speech");
        if (!config) {
          return new Map();
        }
        return createTTSProviders(config);
      },
      inject: [ConfigService],
    },
  ],
  exports: [SpeechService],