2026-02-17 01:04:47 +00:00
3 changed files with 682 additions and 10 deletions
--- a/apps/api/src/speech/providers/speaches-stt.provider.spec.ts
+++ b/apps/api/src/speech/providers/speaches-stt.provider.spec.ts
@@ -0,0 +1,468 @@
+/**
+ * SpeachesSttProvider Tests
+ *
+ * TDD tests for the Speaches/faster-whisper STT provider.
+ * Tests cover transcription, error handling, health checks, and config injection.
+ *
+ * Issue #390
+ */
+
+import { describe, it, expect, beforeEach, vi } from "vitest";
+import { SpeachesSttProvider } from "./speaches-stt.provider";
+import type { SpeechConfig } from "../speech.config";
+import type { TranscribeOptions } from "../interfaces/speech-types";
+
+// ==========================================
+// Mock OpenAI SDK (built in vi.hoisted so the mocks exist when the hoisted vi.mock factory runs)
+// ==========================================
+
+const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
+  const mockCreate = vi.fn(); // stands in for client.audio.transcriptions.create
+  const mockModelsList = vi.fn(); // stands in for client.models.list
+  const mockToFile = vi.fn().mockImplementation(async (buffer: Buffer, name: string) => {
+    return new File([buffer], name);
+  });
+  const mockOpenAIConstructorCalls: Array<Record<string, unknown>> = []; // constructor configs
+  return { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls };
+});
+
+vi.mock("openai", () => {
+  class MockOpenAI {
+    audio = {
+      transcriptions: {
+        create: mockCreate,
+      },
+    };
+    models = {
+      list: mockModelsList,
+    };
+    constructor(config: Record<string, unknown>) {
+      mockOpenAIConstructorCalls.push(config); // recorded for the config-injection tests
+    }
+  }
+  return {
+    default: MockOpenAI, // replaces `import OpenAI from "openai"`
+    toFile: mockToFile, // replaces `import { toFile } from "openai"`
+  };
+});
+
+// ==========================================
+// Test helpers
+// ==========================================
+
+function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
+  // Baseline STT settings; `overrides` is spread last so a test can replace any field.
+  const stt: SpeechConfig["stt"] = {
+    enabled: true,
+    baseUrl: "http://speaches:8000/v1",
+    model: "Systran/faster-whisper-large-v3-turbo",
+    language: "en",
+    ...overrides,
+  };
+  const tts: SpeechConfig["tts"] = {
+    default: { enabled: false, url: "", voice: "", format: "" },
+    premium: { enabled: false, url: "" },
+    fallback: { enabled: false, url: "" },
+  };
+  const limits: SpeechConfig["limits"] = {
+    maxUploadSize: 25_000_000,
+    maxDurationSeconds: 600,
+    maxTextLength: 4096,
+  };
+  return { stt, tts, limits };
+}
+
+function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
+  // Single segment spanning the whole clip, shaped like a verbose_json segment entry.
+  const onlySegment = {
+    id: 0,
+    text: "Hello, world!",
+    start: 0.0,
+    end: 3.5,
+    avg_logprob: -0.25,
+    compression_ratio: 1.2,
+    no_speech_prob: 0.01,
+    seek: 0,
+    temperature: 0.0,
+    tokens: [1, 2, 3],
+  };
+  const defaults = {
+    text: "Hello, world!",
+    language: "en",
+    duration: 3.5,
+    segments: [onlySegment],
+  };
+  return { ...defaults, ...overrides };
+}
+
+describe("SpeachesSttProvider", () => {
+  let provider: SpeachesSttProvider;
+  let config: SpeechConfig;
+
+  beforeEach(() => {
+    vi.clearAllMocks(); // wipe calls recorded by the previous test
+    mockOpenAIConstructorCalls.length = 0; // plain array, so it has to be emptied by hand
+    config = createTestConfig();
+    provider = new SpeachesSttProvider(config); // pushes one entry onto mockOpenAIConstructorCalls
+  });
+
+  // ==========================================
+  // Provider identity
+  // ==========================================
+  describe("name", () => {
+    it("should have the name 'speaches'", () => {
+      expect(provider.name).toBe("speaches");
+    });
+  });
+
+  // ==========================================
+  // transcribe
+  // ==========================================
+  describe("transcribe", () => {
+    it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
+      const mockResponse = createMockVerboseResponse();
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      await provider.transcribe(audio);
+
+      expect(mockCreate).toHaveBeenCalledOnce();
+      const callArgs = mockCreate.mock.calls[0][0]; // request body of the first create() call
+      expect(callArgs.model).toBe("Systran/faster-whisper-large-v3-turbo");
+      expect(callArgs.language).toBe("en");
+      expect(callArgs.response_format).toBe("verbose_json");
+    });
+
+    it("should convert Buffer to File using toFile", async () => {
+      const mockResponse = createMockVerboseResponse();
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      await provider.transcribe(audio);
+
+      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", { // defaults without a mimeType
+        type: "audio/wav",
+      });
+    });
+
+    it("should return TranscriptionResult with text and language", async () => {
+      const mockResponse = createMockVerboseResponse();
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      const result = await provider.transcribe(audio);
+
+      expect(result.text).toBe("Hello, world!");
+      expect(result.language).toBe("en");
+    });
+
+    it("should return durationSeconds from verbose response", async () => {
+      const mockResponse = createMockVerboseResponse({ duration: 5.25 });
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      const result = await provider.transcribe(audio);
+
+      expect(result.durationSeconds).toBe(5.25);
+    });
+
+    it("should map segments from verbose response", async () => {
+      const mockResponse = createMockVerboseResponse({
+        segments: [
+          {
+            id: 0,
+            text: "Hello,",
+            start: 0.0,
+            end: 1.5,
+            avg_logprob: -0.2,
+            compression_ratio: 1.1,
+            no_speech_prob: 0.01,
+            seek: 0,
+            temperature: 0.0,
+            tokens: [1, 2],
+          },
+          {
+            id: 1,
+            text: " world!",
+            start: 1.5,
+            end: 3.5,
+            avg_logprob: -0.3,
+            compression_ratio: 1.3,
+            no_speech_prob: 0.02,
+            seek: 0,
+            temperature: 0.0,
+            tokens: [3, 4],
+          },
+        ],
+      });
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      const result = await provider.transcribe(audio);
+
+      expect(result.segments).toHaveLength(2);
+      expect(result.segments?.[0]).toEqual({
+        text: "Hello,",
+        start: 0.0,
+        end: 1.5,
+      });
+      expect(result.segments?.[1]).toEqual({
+        text: " world!",
+        start: 1.5,
+        end: 3.5,
+      });
+    });
+
+    it("should handle response without segments gracefully", async () => {
+      const mockResponse = createMockVerboseResponse({ segments: undefined }); // key kept, no value
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      const result = await provider.transcribe(audio);
+
+      expect(result.text).toBe("Hello, world!");
+      expect(result.segments).toBeUndefined();
+    });
+
+    it("should handle response without duration gracefully", async () => {
+      const mockResponse = createMockVerboseResponse({ duration: undefined }); // key kept, no value
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      const result = await provider.transcribe(audio);
+
+      expect(result.text).toBe("Hello, world!");
+      expect(result.durationSeconds).toBeUndefined();
+    });
+
+    // ------------------------------------------
+    // Options override
+    // ------------------------------------------
+    describe("options override", () => {
+      it("should use custom model from options when provided", async () => {
+        const mockResponse = createMockVerboseResponse();
+        mockCreate.mockResolvedValueOnce(mockResponse);
+
+        const audio = Buffer.from("fake-audio-data");
+        const options: TranscribeOptions = { model: "custom-whisper-model" };
+        await provider.transcribe(audio, options);
+
+        const callArgs = mockCreate.mock.calls[0][0];
+        expect(callArgs.model).toBe("custom-whisper-model");
+      });
+
+      it("should use custom language from options when provided", async () => {
+        const mockResponse = createMockVerboseResponse({ language: "fr" });
+        mockCreate.mockResolvedValueOnce(mockResponse);
+
+        const audio = Buffer.from("fake-audio-data");
+        const options: TranscribeOptions = { language: "fr" };
+        await provider.transcribe(audio, options);
+
+        const callArgs = mockCreate.mock.calls[0][0];
+        expect(callArgs.language).toBe("fr");
+      });
+
+      it("should pass through prompt option", async () => {
+        const mockResponse = createMockVerboseResponse();
+        mockCreate.mockResolvedValueOnce(mockResponse);
+
+        const audio = Buffer.from("fake-audio-data");
+        const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };
+        await provider.transcribe(audio, options);
+
+        const callArgs = mockCreate.mock.calls[0][0];
+        expect(callArgs.prompt).toBe("This is a meeting about project planning.");
+      });
+
+      it("should pass through temperature option", async () => {
+        const mockResponse = createMockVerboseResponse();
+        mockCreate.mockResolvedValueOnce(mockResponse);
+
+        const audio = Buffer.from("fake-audio-data");
+        const options: TranscribeOptions = { temperature: 0.3 };
+        await provider.transcribe(audio, options);
+
+        const callArgs = mockCreate.mock.calls[0][0];
+        expect(callArgs.temperature).toBe(0.3);
+      });
+
+      it("should use custom mimeType for file conversion when provided", async () => {
+        const mockResponse = createMockVerboseResponse();
+        mockCreate.mockResolvedValueOnce(mockResponse);
+
+        const audio = Buffer.from("fake-audio-data");
+        const options: TranscribeOptions = { mimeType: "audio/mp3" };
+        await provider.transcribe(audio, options);
+
+        expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", { // extension follows mimeType
+          type: "audio/mp3",
+        });
+      });
+    });
+
+    // ------------------------------------------
+    // Simple response fallback
+    // ------------------------------------------
+    describe("simple response fallback", () => {
+      it("should handle simple Transcription response (text only, no verbose fields)", async () => {
+        // Some configurations may return just { text: "..." } without verbose fields
+        const simpleResponse = { text: "Simple transcription result." };
+        mockCreate.mockResolvedValueOnce(simpleResponse);
+
+        const audio = Buffer.from("fake-audio-data");
+        const result = await provider.transcribe(audio);
+
+        expect(result.text).toBe("Simple transcription result.");
+        expect(result.language).toBe("en"); // Falls back to config language
+        expect(result.durationSeconds).toBeUndefined();
+        expect(result.segments).toBeUndefined();
+      });
+    });
+  });
+
+  // ==========================================
+  // Error handling (every failure is rethrown as "STT transcription failed: <message>")
+  // ==========================================
+  describe("error handling", () => {
+    it("should throw a descriptive error on connection refused", async () => {
+      const connectionError = new Error("connect ECONNREFUSED 127.0.0.1:8000");
+      mockCreate.mockRejectedValueOnce(connectionError);
+
+      const audio = Buffer.from("fake-audio-data");
+      await expect(provider.transcribe(audio)).rejects.toThrow(
+        "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000"
+      );
+    });
+
+    it("should throw a descriptive error on timeout", async () => {
+      const timeoutError = new Error("Request timed out");
+      mockCreate.mockRejectedValueOnce(timeoutError);
+
+      const audio = Buffer.from("fake-audio-data");
+      await expect(provider.transcribe(audio)).rejects.toThrow(
+        "STT transcription failed: Request timed out"
+      );
+    });
+
+    it("should throw a descriptive error on API error", async () => {
+      const apiError = new Error("Invalid model: nonexistent-model");
+      mockCreate.mockRejectedValueOnce(apiError);
+
+      const audio = Buffer.from("fake-audio-data");
+      await expect(provider.transcribe(audio)).rejects.toThrow(
+        "STT transcription failed: Invalid model: nonexistent-model"
+      );
+    });
+
+    it("should handle non-Error thrown values", async () => {
+      mockCreate.mockRejectedValueOnce("unexpected string error"); // a bare string, not an Error
+
+      const audio = Buffer.from("fake-audio-data");
+      await expect(provider.transcribe(audio)).rejects.toThrow(
+        "STT transcription failed: unexpected string error"
+      );
+    });
+  });
+
+  // ==========================================
+  // isHealthy (true when models.list resolves; false, never a throw, when it rejects)
+  // ==========================================
+  describe("isHealthy", () => {
+    it("should return true when the server is reachable", async () => {
+      mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });
+
+      const healthy = await provider.isHealthy();
+      expect(healthy).toBe(true);
+    });
+
+    it("should return false when the server is unreachable", async () => {
+      mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));
+
+      const healthy = await provider.isHealthy();
+      expect(healthy).toBe(false);
+    });
+
+    it("should not throw on health check failure", async () => {
+      mockModelsList.mockRejectedValueOnce(new Error("Network error"));
+
+      await expect(provider.isHealthy()).resolves.toBe(false);
+    });
+
+    it("should return false on unexpected error types", async () => {
+      mockModelsList.mockRejectedValueOnce("string error"); // rejection that is not an Error
+
+      const healthy = await provider.isHealthy();
+      expect(healthy).toBe(false);
+    });
+  });
+
+  // ==========================================
+  // Config injection
+  // ==========================================
+  describe("config injection", () => {
+    it("should create OpenAI client with baseURL from config", () => {
+      // beforeEach emptied the call log and then constructed exactly one provider
+      expect(mockOpenAIConstructorCalls).toHaveLength(1);
+      expect(mockOpenAIConstructorCalls[0]).toEqual(
+        expect.objectContaining({
+          baseURL: "http://speaches:8000/v1",
+        })
+      );
+    });
+
+    it("should use custom baseURL from config", () => {
+      mockOpenAIConstructorCalls.length = 0; // drop the entry recorded by beforeEach
+      const customConfig = createTestConfig({
+        baseUrl: "http://custom-speaches:9000/v1",
+      });
+      new SpeachesSttProvider(customConfig);
+
+      expect(mockOpenAIConstructorCalls).toHaveLength(1);
+      expect(mockOpenAIConstructorCalls[0]).toEqual(
+        expect.objectContaining({
+          baseURL: "http://custom-speaches:9000/v1",
+        })
+      );
+    });
+
+    it("should use default model from config for transcription", async () => {
+      const customConfig = createTestConfig({
+        model: "Systran/faster-whisper-small",
+      });
+      const customProvider = new SpeachesSttProvider(customConfig);
+
+      const mockResponse = createMockVerboseResponse();
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      await customProvider.transcribe(audio);
+
+      const callArgs = mockCreate.mock.calls[0][0];
+      expect(callArgs.model).toBe("Systran/faster-whisper-small");
+    });
+
+    it("should use default language from config for transcription", async () => {
+      const customConfig = createTestConfig({ language: "de" });
+      const customProvider = new SpeachesSttProvider(customConfig);
+
+      const mockResponse = createMockVerboseResponse({ language: "de" });
+      mockCreate.mockResolvedValueOnce(mockResponse);
+
+      const audio = Buffer.from("fake-audio-data");
+      await customProvider.transcribe(audio);
+
+      const callArgs = mockCreate.mock.calls[0][0];
+      expect(callArgs.language).toBe("de");
+    });
+
+    it("should set a dummy API key for local Speaches server", () => {
+      expect(mockOpenAIConstructorCalls).toHaveLength(1); // the provider built in beforeEach
+      expect(mockOpenAIConstructorCalls[0]).toEqual(
+        expect.objectContaining({
+          apiKey: "not-needed",
+        })
+      );
+    });
+  });
+});
--- a/apps/api/src/speech/providers/speaches-stt.provider.ts
+++ b/apps/api/src/speech/providers/speaches-stt.provider.ts
@@ -0,0 +1,180 @@
+/**
+ * SpeachesSttProvider
+ *
+ * Speech-to-text provider using Speaches (faster-whisper backend).
+ * Connects to the Speaches server via its OpenAI-compatible
+ * `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
+ *
+ * Issue #390
+ */
+
+import { Injectable, Inject, Logger } from "@nestjs/common";
+import OpenAI from "openai";
+import { toFile } from "openai";
+import { speechConfig, type SpeechConfig } from "../speech.config";
+import type { ISTTProvider } from "../interfaces/stt-provider.interface";
+import type {
+  TranscribeOptions,
+  TranscriptionResult,
+  TranscriptionSegment,
+} from "../interfaces/speech-types";
+
+/**
+ * Derive the upload file extension from a MIME type; case and ";params" are ignored, default "wav".
+ */
+function extensionFromMimeType(mimeType: string): string {
+  const mapping = new Map<string, string>([
+    ["audio/wav", "wav"],
+    ["audio/wave", "wav"],
+    ["audio/x-wav", "wav"],
+    ["audio/mp3", "mp3"],
+    ["audio/mpeg", "mp3"],
+    ["audio/mp4", "mp4"],
+    ["audio/m4a", "m4a"],
+    ["audio/ogg", "ogg"],
+    ["audio/flac", "flac"],
+    ["audio/webm", "webm"],
+    ["audio/mpga", "mpga"],
+  ]);
+  return mapping.get(mimeType.replace(/;[\s\S]*$/, "").trim().toLowerCase()) ?? "wav";
+}
+
+/**
+ * STT provider backed by a Speaches (faster-whisper) server.
+ *
+ * Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint,
+ * so we re-use the official OpenAI SDK with a custom `baseURL`.
+ *
+ * @example
+ * ```typescript
+ * const provider = new SpeachesSttProvider(speechConfig);
+ * const result = await provider.transcribe(audioBuffer, { language: "en" });
+ * console.log(result.text);
+ * ```
+ */
+@Injectable()
+export class SpeachesSttProvider implements ISTTProvider {
+  readonly name = "speaches";
+
+  private readonly logger = new Logger(SpeachesSttProvider.name);
+  private readonly client: OpenAI;
+  private readonly config: SpeechConfig;
+
+  constructor(
+    @Inject(speechConfig.KEY)
+    config: SpeechConfig
+  ) {
+    this.config = config;
+
+    this.client = new OpenAI({
+      baseURL: config.stt.baseUrl,
+      apiKey: "not-needed", // Speaches does not require an API key
+    });
+
+    this.logger.log(
+      `Speaches STT provider initialized (endpoint: ${config.stt.baseUrl}, model: ${config.stt.model})`
+    );
+  }
+
+  /**
+   * Transcribe audio data to text using the Speaches server.
+   *
+   * Sends the audio buffer to the `/v1/audio/transcriptions` endpoint
+   * with `response_format=verbose_json` to get segments and duration data.
+   *
+   * @param audio - Raw audio data as a Buffer
+   * @param options - Optional overrides: model, language, prompt, temperature, mimeType
+   * @returns Transcription result with text, language, duration, and optional segments
+   * @throws {Error} If transcription fails (connection error, API error, etc.)
+   */
+  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
+    const model = options?.model ?? this.config.stt.model;
+    const language = options?.language ?? this.config.stt.language;
+    const mimeType = options?.mimeType ?? "audio/wav";
+    const extension = extensionFromMimeType(mimeType);
+
+    try {
+      const file = await toFile(audio, `audio.${extension}`, {
+        type: mimeType,
+      });
+
+      const response = await this.client.audio.transcriptions.create({
+        file,
+        model,
+        language,
+        response_format: "verbose_json",
+        ...(options?.prompt !== undefined ? { prompt: options.prompt } : {}),
+        ...(options?.temperature !== undefined ? { temperature: options.temperature } : {}),
+      });
+
+      return this.mapResponse(response, language);
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      this.logger.error(`Transcription failed: ${message}`);
+      throw new Error(`STT transcription failed: ${message}`);
+    }
+  }
+
+  /**
+   * Check if the Speaches server is healthy and reachable.
+   *
+   * Lists models with a 5 s timeout and no retries, so a hung server cannot
+   * stall the probe for the SDK's default timeout. Never throws.
+   *
+   * @returns true if the model listing succeeds, false on any error
+   */
+  async isHealthy(): Promise<boolean> {
+    try {
+      await this.client.models.list({ timeout: 5_000, maxRetries: 0 });
+      return true;
+    } catch (error: unknown) {
+      const message = error instanceof Error ? error.message : String(error);
+      this.logger.warn(`Speaches health check failed: ${message}`);
+      return false;
+    }
+  }
+
+  /**
+   * Map the OpenAI SDK transcription response to our TranscriptionResult type.
+   *
+   * Handles verbose responses (duration, segments) and text-only responses;
+   * a non-numeric duration or non-array segments value is left out of the result.
+   */
+  private mapResponse(
+    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
+    fallbackLanguage: string
+  ): TranscriptionResult {
+    const text = (response as { text: string }).text;
+    const verboseResponse = response as {
+      text: string;
+      language?: string;
+      duration?: number;
+      segments?: {
+        text: string;
+        start: number;
+        end: number;
+      }[];
+    };
+
+    const result: TranscriptionResult = {
+      text,
+      language: verboseResponse.language ?? fallbackLanguage,
+    };
+
+    if (typeof verboseResponse.duration === "number") {
+      result.durationSeconds = verboseResponse.duration;
+    }
+
+    if (Array.isArray(verboseResponse.segments)) {
+      result.segments = verboseResponse.segments.map(
+        (segment): TranscriptionSegment => ({
+          text: segment.text,
+          start: segment.start,
+          end: segment.end,
+        })
+      );
+    }
+
+    return result;
+  }
+}
--- a/apps/api/src/speech/speech.module.ts
+++ b/apps/api/src/speech/speech.module.ts
@@ -4,36 +4,60 @@
 * NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
 * Provides a provider abstraction layer with graceful fallback for TTS tiers.
 *
+ * TTS providers are created dynamically based on configuration:
+ * - default: Kokoro-FastAPI (CPU, always available)
+ * - premium: Chatterbox (GPU, voice cloning)
+ * - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
+ *
 * Imports:
 * - ConfigModule.forFeature(speechConfig) for speech configuration
 *
 * Providers:
 * - SpeechService: High-level speech operations with provider selection
- * - TTS_PROVIDERS: Empty Map<SpeechTier, ITTSProvider> (populated by provider modules)
+ * - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
 *
 * Exports:
 * - SpeechService for use by other modules (e.g., controllers, brain)
 *
- * Issue #389
+ * Issue #389, #390, #391
 */

 import { Module, type OnModuleInit, Logger } from "@nestjs/common";
-import { ConfigModule } from "@nestjs/config";
-import { speechConfig, validateSpeechConfig } from "./speech.config";
+import { ConfigModule, ConfigService } from "@nestjs/config";
+import {
+  speechConfig,
+  validateSpeechConfig,
+  isSttEnabled,
+  type SpeechConfig,
+} from "./speech.config";
 import { SpeechService } from "./speech.service";
-import { TTS_PROVIDERS } from "./speech.constants";
-import type { SpeechTier } from "./interfaces/speech-types";
-import type { ITTSProvider } from "./interfaces/tts-provider.interface";
+import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
+import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
+import { createTTSProviders } from "./providers/tts-provider.factory";

@Module({
  imports: [ConfigModule.forFeature(speechConfig)],
  providers: [
    SpeechService,
-    // Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
-    // will register their providers in subsequent tasks.
+    // STT provider: conditionally register SpeachesSttProvider when STT is enabled
+    ...(isSttEnabled()
+      ? [
+          {
+            provide: STT_PROVIDER,
+            useClass: SpeachesSttProvider,
+          },
+        ]
+      : []),
    {
      provide: TTS_PROVIDERS,
-      useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
+      useFactory: (configService: ConfigService) => {
+        const config = configService.get<SpeechConfig>("speech");
+        if (!config) {
+          return new Map();
+        }
+        return createTTSProviders(config);
+      },
+      inject: [ConfigService],
    },
  ],
  exports: [SpeechService],