feat: M13-SpeechServices — TTS & STT integration #409

Merged
jason.woltje merged 20 commits from feature/m13-speech-services into develop 2026-02-15 18:37:54 +00:00
3 changed files with 682 additions and 10 deletions
Showing only changes of commit 3ae9e53bcc - Show all commits

View File

@@ -0,0 +1,468 @@
/**
* SpeachesSttProvider Tests
*
* TDD tests for the Speaches/faster-whisper STT provider.
* Tests cover transcription, error handling, health checks, and config injection.
*
* Issue #390
*/
import { describe, it, expect, beforeEach, vi } from "vitest";
import { SpeachesSttProvider } from "./speaches-stt.provider";
import type { SpeechConfig } from "../speech.config";
import type { TranscribeOptions } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// The mock handles are built inside vi.hoisted so that they can be referenced
// from the vi.mock("openai") factory declared further down in this file.
const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  // Every config object handed to the mocked OpenAI constructor is recorded here.
  const constructorCalls: Array<Record<string, unknown>> = [];

  // Replacement for the SDK's toFile helper: wraps the buffer in a File with the given name.
  const toFileStub = vi
    .fn()
    .mockImplementation(async (buffer: Buffer, name: string) => new File([buffer], name));

  return {
    mockCreate: vi.fn(),
    mockModelsList: vi.fn(),
    mockToFile: toFileStub,
    mockOpenAIConstructorCalls: constructorCalls,
  };
});
// Replace the "openai" module with a minimal stand-in exposing only the
// members the provider touches: audio.transcriptions.create, models.list
// and the toFile helper.
vi.mock("openai", () => {
  class MockOpenAI {
    audio: { transcriptions: { create: typeof mockCreate } };
    models: { list: typeof mockModelsList };

    constructor(config: Record<string, unknown>) {
      this.audio = { transcriptions: { create: mockCreate } };
      this.models = { list: mockModelsList };
      // Remember the constructor argument so tests can assert on it.
      mockOpenAIConstructorCalls.push(config);
    }
  }

  return { default: MockOpenAI, toFile: mockToFile };
});
// ==========================================
// Test helpers
// ==========================================
/**
 * Build a SpeechConfig for tests.
 *
 * STT is enabled with default values that `overrides` may replace field by
 * field; every TTS tier is disabled and the limits are fixed.
 *
 * @param overrides - Optional STT fields that take precedence over the defaults
 * @returns A complete SpeechConfig object
 */
function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  const stt: SpeechConfig["stt"] = {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "Systran/faster-whisper-large-v3-turbo",
    language: "en",
    ...overrides,
  };

  const tts: SpeechConfig["tts"] = {
    default: { enabled: false, url: "", voice: "", format: "" },
    premium: { enabled: false, url: "" },
    fallback: { enabled: false, url: "" },
  };

  const limits: SpeechConfig["limits"] = {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  };

  return { stt, tts, limits };
}
/**
 * Build a verbose-style transcription response object for tests.
 *
 * The default payload holds the text "Hello, world!", language "en",
 * a 3.5 second duration and one segment. Keys present in `overrides`
 * replace the defaults, including keys explicitly set to `undefined`.
 *
 * @param overrides - Optional fields that replace or extend the defaults
 * @returns A freshly created response object
 */
function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  const onlySegment = {
    id: 0,
    text: "Hello, world!",
    start: 0.0,
    end: 3.5,
    avg_logprob: -0.25,
    compression_ratio: 1.2,
    no_speech_prob: 0.01,
    seek: 0,
    temperature: 0.0,
    tokens: [1, 2, 3],
  };

  const response: Record<string, unknown> = {
    text: "Hello, world!",
    language: "en",
    duration: 3.5,
    segments: [onlySegment],
  };

  // Copies own enumerable keys of `overrides` (if any) over the defaults.
  return Object.assign(response, overrides);
}
/**
 * Behavioural tests for SpeachesSttProvider.
 *
 * All tests run against the mocked "openai" module defined earlier in this
 * file; a fresh provider is constructed before every test.
 */
describe("SpeachesSttProvider", () => {
  let provider: SpeachesSttProvider;
  let config: SpeechConfig;

  /** Placeholder audio payload; its contents are irrelevant to the assertions. */
  const fakeAudio = (): Buffer => Buffer.from("fake-audio-data");

  /** Request object passed to the first `audio.transcriptions.create` call. */
  const firstCreateRequest = () => mockCreate.mock.calls[0][0];

  /** Assert that exactly one OpenAI client was built and that its config contains `expected`. */
  const expectSingleClientWith = (expected: Record<string, unknown>): void => {
    expect(mockOpenAIConstructorCalls).toHaveLength(1);
    expect(mockOpenAIConstructorCalls[0]).toEqual(expect.objectContaining(expected));
  };

  beforeEach(() => {
    vi.clearAllMocks();
    mockOpenAIConstructorCalls.length = 0;
    config = createTestConfig();
    provider = new SpeachesSttProvider(config);
  });

  // ==========================================
  // Provider identity
  // ==========================================
  describe("name", () => {
    it("should have the name 'speaches'", () => {
      expect(provider.name).toBe("speaches");
    });
  });

  // ==========================================
  // transcribe
  // ==========================================
  describe("transcribe", () => {
    it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse());

      await provider.transcribe(fakeAudio());

      expect(mockCreate).toHaveBeenCalledOnce();
      const request = firstCreateRequest();
      expect(request.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(request.language).toBe("en");
      expect(request.response_format).toBe("verbose_json");
    });

    it("should convert Buffer to File using toFile", async () => {
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse());
      const audio = fakeAudio();

      await provider.transcribe(audio);

      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
        type: "audio/wav",
      });
    });

    it("should return TranscriptionResult with text and language", async () => {
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse());

      const result = await provider.transcribe(fakeAudio());

      expect(result.text).toBe("Hello, world!");
      expect(result.language).toBe("en");
    });

    it("should return durationSeconds from verbose response", async () => {
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ duration: 5.25 }));

      const result = await provider.transcribe(fakeAudio());

      expect(result.durationSeconds).toBe(5.25);
    });

    it("should map segments from verbose response", async () => {
      const firstSegment = {
        id: 0,
        text: "Hello,",
        start: 0.0,
        end: 1.5,
        avg_logprob: -0.2,
        compression_ratio: 1.1,
        no_speech_prob: 0.01,
        seek: 0,
        temperature: 0.0,
        tokens: [1, 2],
      };
      const secondSegment = {
        id: 1,
        text: " world!",
        start: 1.5,
        end: 3.5,
        avg_logprob: -0.3,
        compression_ratio: 1.3,
        no_speech_prob: 0.02,
        seek: 0,
        temperature: 0.0,
        tokens: [3, 4],
      };
      mockCreate.mockResolvedValueOnce(
        createMockVerboseResponse({ segments: [firstSegment, secondSegment] })
      );

      const result = await provider.transcribe(fakeAudio());

      // Only text/start/end are expected to survive the mapping.
      expect(result.segments).toHaveLength(2);
      expect(result.segments?.[0]).toEqual({ text: "Hello,", start: 0.0, end: 1.5 });
      expect(result.segments?.[1]).toEqual({ text: " world!", start: 1.5, end: 3.5 });
    });

    it("should handle response without segments gracefully", async () => {
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ segments: undefined }));

      const result = await provider.transcribe(fakeAudio());

      expect(result.text).toBe("Hello, world!");
      expect(result.segments).toBeUndefined();
    });

    it("should handle response without duration gracefully", async () => {
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ duration: undefined }));

      const result = await provider.transcribe(fakeAudio());

      expect(result.text).toBe("Hello, world!");
      expect(result.durationSeconds).toBeUndefined();
    });

    // ------------------------------------------
    // Options override
    // ------------------------------------------
    describe("options override", () => {
      it("should use custom model from options when provided", async () => {
        mockCreate.mockResolvedValueOnce(createMockVerboseResponse());
        const options: TranscribeOptions = { model: "custom-whisper-model" };

        await provider.transcribe(fakeAudio(), options);

        expect(firstCreateRequest().model).toBe("custom-whisper-model");
      });

      it("should use custom language from options when provided", async () => {
        mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ language: "fr" }));
        const options: TranscribeOptions = { language: "fr" };

        await provider.transcribe(fakeAudio(), options);

        expect(firstCreateRequest().language).toBe("fr");
      });

      it("should pass through prompt option", async () => {
        mockCreate.mockResolvedValueOnce(createMockVerboseResponse());
        const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };

        await provider.transcribe(fakeAudio(), options);

        expect(firstCreateRequest().prompt).toBe("This is a meeting about project planning.");
      });

      it("should pass through temperature option", async () => {
        mockCreate.mockResolvedValueOnce(createMockVerboseResponse());
        const options: TranscribeOptions = { temperature: 0.3 };

        await provider.transcribe(fakeAudio(), options);

        expect(firstCreateRequest().temperature).toBe(0.3);
      });

      it("should use custom mimeType for file conversion when provided", async () => {
        mockCreate.mockResolvedValueOnce(createMockVerboseResponse());
        const audio = fakeAudio();
        const options: TranscribeOptions = { mimeType: "audio/mp3" };

        await provider.transcribe(audio, options);

        expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
          type: "audio/mp3",
        });
      });
    });

    // ------------------------------------------
    // Simple response fallback
    // ------------------------------------------
    describe("simple response fallback", () => {
      it("should handle simple Transcription response (text only, no verbose fields)", async () => {
        // Some configurations may return just { text: "..." } without verbose fields
        mockCreate.mockResolvedValueOnce({ text: "Simple transcription result." });

        const result = await provider.transcribe(fakeAudio());

        expect(result.text).toBe("Simple transcription result.");
        expect(result.language).toBe("en"); // Falls back to config language
        expect(result.durationSeconds).toBeUndefined();
        expect(result.segments).toBeUndefined();
      });
    });
  });

  // ==========================================
  // Error handling
  // ==========================================
  describe("error handling", () => {
    // Each row: [scenario used in the test title, message of the rejected Error].
    it.each([
      ["connection refused", "connect ECONNREFUSED 127.0.0.1:8000"],
      ["timeout", "Request timed out"],
      ["API error", "Invalid model: nonexistent-model"],
    ])("should throw a descriptive error on %s", async (_scenario, message) => {
      mockCreate.mockRejectedValueOnce(new Error(message));

      await expect(provider.transcribe(fakeAudio())).rejects.toThrow(
        `STT transcription failed: ${message}`
      );
    });

    it("should handle non-Error thrown values", async () => {
      mockCreate.mockRejectedValueOnce("unexpected string error");

      await expect(provider.transcribe(fakeAudio())).rejects.toThrow(
        "STT transcription failed: unexpected string error"
      );
    });
  });

  // ==========================================
  // isHealthy
  // ==========================================
  describe("isHealthy", () => {
    it("should return true when the server is reachable", async () => {
      mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(true);
    });

    it("should return false when the server is unreachable", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);
    });

    it("should not throw on health check failure", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("Network error"));

      await expect(provider.isHealthy()).resolves.toBe(false);
    });

    it("should return false on unexpected error types", async () => {
      mockModelsList.mockRejectedValueOnce("string error");

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);
    });
  });

  // ==========================================
  // Config injection
  // ==========================================
  describe("config injection", () => {
    it("should create OpenAI client with baseURL from config", () => {
      // The constructor was called in beforeEach
      expectSingleClientWith({ baseURL: "http://speaches:8000/v1" });
    });

    it("should use custom baseURL from config", () => {
      // Discard the call recorded by beforeEach so only the custom provider counts.
      mockOpenAIConstructorCalls.length = 0;
      const customConfig = createTestConfig({
        baseUrl: "http://custom-speaches:9000/v1",
      });

      new SpeachesSttProvider(customConfig);

      expectSingleClientWith({ baseURL: "http://custom-speaches:9000/v1" });
    });

    it("should use default model from config for transcription", async () => {
      const customProvider = new SpeachesSttProvider(
        createTestConfig({ model: "Systran/faster-whisper-small" })
      );
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse());

      await customProvider.transcribe(fakeAudio());

      expect(firstCreateRequest().model).toBe("Systran/faster-whisper-small");
    });

    it("should use default language from config for transcription", async () => {
      const customProvider = new SpeachesSttProvider(createTestConfig({ language: "de" }));
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ language: "de" }));

      await customProvider.transcribe(fakeAudio());

      expect(firstCreateRequest().language).toBe("de");
    });

    it("should set a dummy API key for local Speaches server", () => {
      expectSingleClientWith({ apiKey: "not-needed" });
    });
  });
});

View File

@@ -0,0 +1,180 @@
/**
* SpeachesSttProvider
*
* Speech-to-text provider using Speaches (faster-whisper backend).
* Connects to the Speaches server via its OpenAI-compatible
* `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
*
* Issue #390
*/
import { Injectable, Inject, Logger } from "@nestjs/common";
import OpenAI from "openai";
import { toFile } from "openai";
import { speechConfig, type SpeechConfig } from "../speech.config";
import type { ISTTProvider } from "../interfaces/stt-provider.interface";
import type {
TranscribeOptions,
TranscriptionResult,
TranscriptionSegment,
} from "../interfaces/speech-types";
/**
 * Known audio MIME types and the file extension used for each.
 *
 * A Map is used instead of a plain object so that inherited property names
 * such as "constructor" or "toString" can never match.
 */
const MIME_TYPE_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ["audio/wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/mp3", "mp3"],
  ["audio/mpeg", "mp3"],
  ["audio/mp4", "mp4"],
  ["audio/m4a", "m4a"],
  ["audio/ogg", "ogg"],
  ["audio/flac", "flac"],
  ["audio/webm", "webm"],
  ["audio/mpga", "mpga"],
]);

/**
 * Derive file extension from a MIME type for use in the uploaded file name.
 *
 * The lookup ignores media-type parameters (`audio/webm;codecs=opus`),
 * surrounding whitespace and letter case, so such values resolve to the
 * matching extension instead of silently falling back.
 *
 * @param mimeType - MIME type of the audio, optionally with parameters
 * @returns The matching extension without a leading dot, or `"wav"` when the
 *          type is empty or not recognised
 */
function extensionFromMimeType(mimeType: string): string {
  // Keep only the "type/subtype" part: drop everything from the first ";".
  const mediaType = (mimeType.split(";", 1)[0] ?? "").trim().toLowerCase();
  return MIME_TYPE_EXTENSIONS.get(mediaType) ?? "wav";
}
/**
 * Speech-to-text provider that talks to a Speaches (faster-whisper) server.
 *
 * The provider builds an OpenAI SDK client pointed at the configured STT
 * `baseUrl` and calls the SDK's `audio.transcriptions.create` and
 * `models.list` methods on it.
 *
 * @example
 * ```typescript
 * // `config` is a resolved SpeechConfig object
 * const provider = new SpeachesSttProvider(config);
 * const result = await provider.transcribe(audioBuffer, { language: "en" });
 * console.log(result.text);
 * ```
 */
@Injectable()
export class SpeachesSttProvider implements ISTTProvider {
  readonly name = "speaches";

  private readonly logger = new Logger(SpeachesSttProvider.name);
  private readonly client: OpenAI;

  constructor(
    @Inject(speechConfig.KEY)
    private readonly config: SpeechConfig
  ) {
    const stt = config.stt;
    this.client = new OpenAI({
      baseURL: stt.baseUrl,
      apiKey: "not-needed", // Speaches does not require an API key
    });
    this.logger.log(
      `Speaches STT provider initialized (endpoint: ${stt.baseUrl}, model: ${stt.model})`
    );
  }

  /**
   * Transcribe an audio buffer to text.
   *
   * The buffer is wrapped in a file named `audio.<extension>` (extension
   * derived from the MIME type, `audio/wav` by default) and submitted with
   * `response_format: "verbose_json"`. Model and language come from `options`
   * when given, otherwise from the STT configuration; `prompt` and
   * `temperature` are only sent when defined.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional overrides (model, language, mimeType, prompt, temperature)
   * @returns The mapped transcription result
   * @throws {Error} With message `STT transcription failed: <reason>` when any step fails
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    const model = options?.model ?? this.config.stt.model;
    const language = options?.language ?? this.config.stt.language;
    const mimeType = options?.mimeType ?? "audio/wav";
    const fileName = `audio.${extensionFromMimeType(mimeType)}`;

    try {
      const file = await toFile(audio, fileName, { type: mimeType });

      // Optional fields are left out of the request entirely when not provided.
      const promptPart = options?.prompt !== undefined ? { prompt: options.prompt } : {};
      const temperaturePart =
        options?.temperature !== undefined ? { temperature: options.temperature } : {};

      const response = await this.client.audio.transcriptions.create({
        file,
        model,
        language,
        response_format: "verbose_json",
        ...promptPart,
        ...temperaturePart,
      });

      return this.mapResponse(response, language);
    } catch (error: unknown) {
      const reason = this.describeError(error);
      this.logger.error(`Transcription failed: ${reason}`);
      throw new Error(`STT transcription failed: ${reason}`);
    }
  }

  /**
   * Report whether the Speaches server answers a model-list request.
   *
   * Never throws: a failed request is logged as a warning and reported as `false`.
   *
   * @returns true when `models.list()` succeeds, false otherwise
   */
  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
    } catch (error: unknown) {
      this.logger.warn(`Speaches health check failed: ${this.describeError(error)}`);
      return false;
    }
    return true;
  }

  /**
   * Turn a caught value into a printable message: the `message` of an Error,
   * or the string conversion of anything else.
   */
  private describeError(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    return String(error);
  }

  /**
   * Convert an SDK transcription response into a TranscriptionResult.
   *
   * `text` is always copied. `language` falls back to `fallbackLanguage` when
   * the response has none. `durationSeconds` is set only when `duration` is
   * defined, and `segments` only when the response carries an array, each
   * entry reduced to `text`, `start` and `end`.
   */
  private mapResponse(
    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
    fallbackLanguage: string
  ): TranscriptionResult {
    const payload = response as {
      text: string;
      language?: string;
      duration?: number;
      segments?: {
        text: string;
        start: number;
        end: number;
      }[];
    };

    const mapped: TranscriptionResult = {
      text: payload.text,
      language: payload.language ?? fallbackLanguage,
    };

    const duration = payload.duration;
    if (duration !== undefined) {
      mapped.durationSeconds = duration;
    }

    const segments = payload.segments;
    if (Array.isArray(segments)) {
      mapped.segments = segments.map(
        (entry): TranscriptionSegment => ({
          text: entry.text,
          start: entry.start,
          end: entry.end,
        })
      );
    }

    return mapped;
  }
}

View File

@@ -4,36 +4,60 @@
* NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
* Provides a provider abstraction layer with graceful fallback for TTS tiers.
*
* TTS providers are created dynamically based on configuration:
* - default: Kokoro-FastAPI (CPU, always available)
* - premium: Chatterbox (GPU, voice cloning)
* - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
*
* Imports:
* - ConfigModule.forFeature(speechConfig) for speech configuration
*
* Providers:
* - SpeechService: High-level speech operations with provider selection
* - TTS_PROVIDERS: Empty Map<SpeechTier, ITTSProvider> (populated by provider modules)
* - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
*
* Exports:
* - SpeechService for use by other modules (e.g., controllers, brain)
*
* Issue #389
* Issue #389, #390, #391
*/
import { Module, type OnModuleInit, Logger } from "@nestjs/common";
import { ConfigModule } from "@nestjs/config";
import { speechConfig, validateSpeechConfig } from "./speech.config";
import { ConfigModule, ConfigService } from "@nestjs/config";
import {
speechConfig,
validateSpeechConfig,
isSttEnabled,
type SpeechConfig,
} from "./speech.config";
import { SpeechService } from "./speech.service";
import { TTS_PROVIDERS } from "./speech.constants";
import type { SpeechTier } from "./interfaces/speech-types";
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
import { createTTSProviders } from "./providers/tts-provider.factory";
@Module({
imports: [ConfigModule.forFeature(speechConfig)],
providers: [
SpeechService,
// Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
// will register their providers in subsequent tasks.
// STT provider: conditionally register SpeachesSttProvider when STT is enabled
...(isSttEnabled()
? [
{
provide: STT_PROVIDER,
useClass: SpeachesSttProvider,
},
]
: []),
{
provide: TTS_PROVIDERS,
useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
useFactory: (configService: ConfigService) => {
const config = configService.get<SpeechConfig>("speech");
if (!config) {
return new Map();
}
return createTTSProviders(config);
},
inject: [ConfigService],
},
],
exports: [SpeechService],