Compare commits

...

2 Commits

Author SHA1 Message Date
b5edb4f37e feat(#391): add base TTS provider and factory classes
All checks were successful
ci/woodpecker/push/api Pipeline was successful
Add the BaseTTSProvider abstract class and TTS provider factory that were
part of the tiered TTS architecture but missed from the previous commit.

- BaseTTSProvider: abstract base with synthesize(), listVoices(), isHealthy()
- tts-provider.factory: creates Kokoro/Chatterbox/Piper providers from config
- 30 tests (22 base provider + 8 factory)

Refs #391
2026-02-15 02:20:24 -06:00
3ae9e53bcc feat(#391): implement tiered TTS provider architecture with base class
Add abstract BaseTTSProvider class that implements common OpenAI-compatible
TTS logic using the OpenAI SDK with configurable baseURL. Includes synthesize(),
listVoices(), and isHealthy() methods. Create TTS provider factory that
dynamically registers Kokoro (default), Chatterbox (premium), and Piper
(fallback) providers based on configuration. Update SpeechModule to use
the factory for TTS_PROVIDERS injection token.

Also fixes lint error in speaches-stt.provider.ts (Array<T> -> T[]).

30 tests added (22 base provider + 8 factory), all passing.

Fixes #391
2026-02-15 02:19:46 -06:00
7 changed files with 1591 additions and 10 deletions

View File

@@ -0,0 +1,329 @@
/**
* BaseTTSProvider Unit Tests
*
* Tests the abstract base class for OpenAI-compatible TTS providers.
* Uses a concrete test implementation to exercise the base class logic.
*
* Issue #391
*/
import { describe, it, expect, beforeEach, vi, type Mock } from "vitest";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, AudioFormat } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// vi.mock() factories are hoisted above every other statement in the file,
// so anything a factory closes over must be created via vi.hoisted() to be
// guaranteed to exist before the factory runs. The previous version captured
// a plain top-level `const` and only worked because the class-field
// initializer reads it lazily; using vi.hoisted() is robust regardless of
// evaluation timing and matches the pattern already used in
// speaches-stt.provider.spec.ts.
const mockCreate = vi.hoisted(() => vi.fn());
vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: mockCreate,
      },
    };
  }
  return { default: MockOpenAI };
});
// ==========================================
// Concrete test implementation
// ==========================================
/**
 * Minimal concrete subclass used to exercise BaseTTSProvider's shared logic.
 *
 * The base constructor signature (baseURL, defaultVoice?, defaultFormat?) is
 * inherited directly — the previous explicit constructor only forwarded its
 * arguments to super() unchanged, so it was redundant and has been removed.
 */
class TestTTSProvider extends BaseTTSProvider {
  readonly name = "test-provider";
  readonly tier: SpeechTier = "default";
}
// ==========================================
// Test helpers
// ==========================================
/**
 * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return.
 * The OpenAI SDK returns a Response object with arrayBuffer() method.
 */
function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } {
  // Slice the exact view range. Returning `audioData.buffer` directly would
  // expose the whole underlying ArrayBuffer, which is wrong for views created
  // with subarray()/a byteOffset — the mocked audio would gain extra bytes.
  const buffer = audioData.buffer.slice(
    audioData.byteOffset,
    audioData.byteOffset + audioData.byteLength
  );
  return {
    arrayBuffer: vi.fn().mockResolvedValue(buffer),
  };
}
describe("BaseTTSProvider", () => {
// Shared fixture: every test gets a fresh provider configured like a local
// Kokoro endpoint (af_heart voice, mp3 output).
let provider: TestTTSProvider;
const testBaseURL = "http://localhost:8880/v1";
const testVoice = "af_heart";
const testFormat: AudioFormat = "mp3";
beforeEach(() => {
// Reset call counts/implementations so expectations don't leak across tests.
vi.clearAllMocks();
provider = new TestTTSProvider(testBaseURL, testVoice, testFormat);
});
// ==========================================
// Constructor
// ==========================================
describe("constructor", () => {
it("should create an instance with provided configuration", () => {
expect(provider).toBeDefined();
expect(provider.name).toBe("test-provider");
expect(provider.tier).toBe("default");
});
it("should use default voice 'alloy' when none provided", () => {
const defaultProvider = new TestTTSProvider(testBaseURL);
expect(defaultProvider).toBeDefined();
});
it("should use default format 'mp3' when none provided", () => {
const defaultProvider = new TestTTSProvider(testBaseURL, "voice-1");
expect(defaultProvider).toBeDefined();
});
});
// ==========================================
// synthesize()
// ==========================================
describe("synthesize", () => {
it("should synthesize text and return a SynthesisResult with audio buffer", async () => {
const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const result = await provider.synthesize("Hello, world!");
expect(result).toBeDefined();
expect(result.audio).toBeInstanceOf(Buffer);
expect(result.audio.length).toBe(audioBytes.length);
expect(result.format).toBe("mp3");
expect(result.voice).toBe("af_heart");
expect(result.tier).toBe("default");
});
it("should pass correct parameters to OpenAI SDK", async () => {
const audioBytes = new Uint8Array([0x01, 0x02]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await provider.synthesize("Test text");
expect(mockCreate).toHaveBeenCalledWith({
model: "tts-1",
input: "Test text",
voice: "af_heart",
response_format: "mp3",
speed: 1.0,
});
});
it("should use custom voice from options", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: SynthesizeOptions = { voice: "custom_voice" };
const result = await provider.synthesize("Hello", options);
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "custom_voice" }));
expect(result.voice).toBe("custom_voice");
});
it("should use custom format from options", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: SynthesizeOptions = { format: "wav" };
const result = await provider.synthesize("Hello", options);
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "wav" }));
expect(result.format).toBe("wav");
});
it("should use custom speed from options", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: SynthesizeOptions = { speed: 1.5 };
await provider.synthesize("Hello", options);
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ speed: 1.5 }));
});
// Error paths: the base class wraps any failure in a provider-tagged Error.
it("should throw an error when synthesis fails", async () => {
mockCreate.mockRejectedValue(new Error("Connection refused"));
await expect(provider.synthesize("Hello")).rejects.toThrow(
"TTS synthesis failed for test-provider: Connection refused"
);
});
it("should throw an error when response arrayBuffer fails", async () => {
const mockResponse = {
arrayBuffer: vi.fn().mockRejectedValue(new Error("Read error")),
};
mockCreate.mockResolvedValue(mockResponse);
await expect(provider.synthesize("Hello")).rejects.toThrow(
"TTS synthesis failed for test-provider: Read error"
);
});
it("should handle empty text input gracefully", async () => {
const audioBytes = new Uint8Array([]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const result = await provider.synthesize("");
expect(result.audio).toBeInstanceOf(Buffer);
expect(result.audio.length).toBe(0);
});
it("should handle non-Error exceptions", async () => {
mockCreate.mockRejectedValue("string error");
await expect(provider.synthesize("Hello")).rejects.toThrow(
"TTS synthesis failed for test-provider: string error"
);
});
});
// ==========================================
// listVoices()
// ==========================================
describe("listVoices", () => {
it("should return default voice list with the configured default voice", async () => {
const voices = await provider.listVoices();
expect(voices).toBeInstanceOf(Array);
expect(voices.length).toBeGreaterThan(0);
const defaultVoice = voices.find((v) => v.isDefault === true);
expect(defaultVoice).toBeDefined();
expect(defaultVoice?.id).toBe("af_heart");
expect(defaultVoice?.tier).toBe("default");
});
it("should set tier correctly on all returned voices", async () => {
const voices = await provider.listVoices();
for (const voice of voices) {
expect(voice.tier).toBe("default");
}
});
});
// ==========================================
// isHealthy()
// ==========================================
// isHealthy() probes the server with the global fetch API, so each test
// stubs the global and restores it afterwards via unstubAllGlobals().
describe("isHealthy", () => {
it("should return true when the TTS server is reachable", async () => {
// Mock global fetch for health check
const mockFetch = vi.fn().mockResolvedValue({
ok: true,
status: 200,
});
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(true);
expect(mockFetch).toHaveBeenCalled();
vi.unstubAllGlobals();
});
it("should return false when the TTS server is unreachable", async () => {
const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED"));
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
vi.unstubAllGlobals();
});
it("should return false when the TTS server returns an error status", async () => {
const mockFetch = vi.fn().mockResolvedValue({
ok: false,
status: 503,
});
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
vi.unstubAllGlobals();
});
it("should use the base URL for the health check", async () => {
const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200 });
vi.stubGlobal("fetch", mockFetch);
await provider.isHealthy();
// Should call a health-related endpoint at the base URL
const calledUrl = mockFetch.mock.calls[0][0] as string;
expect(calledUrl).toContain("localhost:8880");
vi.unstubAllGlobals();
});
it("should set a timeout for the health check", async () => {
const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200 });
vi.stubGlobal("fetch", mockFetch);
await provider.isHealthy();
// Should pass an AbortSignal for timeout
const fetchOptions = mockFetch.mock.calls[0][1] as RequestInit;
expect(fetchOptions.signal).toBeDefined();
vi.unstubAllGlobals();
});
});
// ==========================================
// Default values
// ==========================================
describe("default values", () => {
it("should use 'alloy' as default voice when none specified", async () => {
const defaultProvider = new TestTTSProvider(testBaseURL);
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await defaultProvider.synthesize("Hello");
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "alloy" }));
});
it("should use 'mp3' as default format when none specified", async () => {
const defaultProvider = new TestTTSProvider(testBaseURL);
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await defaultProvider.synthesize("Hello");
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" }));
});
it("should use speed 1.0 as default speed", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await provider.synthesize("Hello");
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ speed: 1.0 }));
});
});
});

View File

@@ -0,0 +1,189 @@
/**
* Base TTS Provider
*
* Abstract base class implementing common OpenAI-compatible TTS logic.
* All concrete TTS providers (Kokoro, Chatterbox, Piper) extend this class.
*
* Uses the OpenAI SDK with a configurable baseURL to communicate with
* OpenAI-compatible speech synthesis endpoints.
*
* Issue #391
*/
import { Logger } from "@nestjs/common";
import OpenAI from "openai";
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type {
SpeechTier,
SynthesizeOptions,
SynthesisResult,
VoiceInfo,
AudioFormat,
} from "../interfaces/speech-types";
/** Default TTS model identifier used for OpenAI-compatible APIs */
const DEFAULT_MODEL = "tts-1";
/** Default voice when none is configured ("alloy" is OpenAI's canonical voice id) */
const DEFAULT_VOICE = "alloy";
/** Default audio format when none is configured */
const DEFAULT_FORMAT: AudioFormat = "mp3";
/** Default speech speed multiplier (1.0 = normal speed) */
const DEFAULT_SPEED = 1.0;
/** Health check timeout in milliseconds (enforced via AbortController in isHealthy) */
const HEALTH_CHECK_TIMEOUT_MS = 5000;
/**
 * Abstract base class for OpenAI-compatible TTS providers.
 *
 * Provides common logic for:
 * - Synthesizing text to audio via OpenAI SDK's audio.speech.create()
 * - Listing available voices (with a default implementation)
 * - Health checking the TTS endpoint
 *
 * Subclasses must set `name` and `tier` properties and may override
 * `listVoices()` to provide provider-specific voice lists.
 *
 * @example
 * ```typescript
 * class KokoroProvider extends BaseTTSProvider {
 *   readonly name = "kokoro";
 *   readonly tier: SpeechTier = "default";
 *
 *   constructor(baseURL: string) {
 *     super(baseURL, "af_heart", "mp3");
 *   }
 * }
 * ```
 */
export abstract class BaseTTSProvider implements ITTSProvider {
  abstract readonly name: string;
  abstract readonly tier: SpeechTier;
  protected readonly logger: Logger;
  protected readonly client: OpenAI;
  protected readonly baseURL: string;
  protected readonly defaultVoice: string;
  protected readonly defaultFormat: AudioFormat;

  /**
   * Create a new BaseTTSProvider.
   *
   * @param baseURL - The base URL for the OpenAI-compatible TTS endpoint
   * @param defaultVoice - Default voice ID to use when none is specified in options
   * @param defaultFormat - Default audio format to use when none is specified in options
   */
  constructor(
    baseURL: string,
    defaultVoice: string = DEFAULT_VOICE,
    defaultFormat: AudioFormat = DEFAULT_FORMAT
  ) {
    this.baseURL = baseURL;
    this.defaultVoice = defaultVoice;
    this.defaultFormat = defaultFormat;
    // Logger is named after the concrete subclass (e.g. "KokoroProvider").
    this.logger = new Logger(this.constructor.name);
    this.client = new OpenAI({
      baseURL,
      apiKey: "not-needed", // Self-hosted services don't require an API key
    });
  }

  /**
   * Synthesize text to audio using the OpenAI-compatible TTS endpoint.
   *
   * Calls `client.audio.speech.create()` with the provided text and options,
   * then converts the response to a Buffer.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, speed)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails
   */
  async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;
    try {
      const response = await this.client.audio.speech.create({
        model: DEFAULT_MODEL,
        input: text,
        voice,
        response_format: format,
        speed,
      });
      // The SDK returns a fetch-style Response; materialize the audio bytes.
      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);
      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }

  /**
   * List available voices for this provider.
   *
   * Default implementation returns the configured default voice.
   * Subclasses should override this to provide a full voice list
   * from their specific TTS engine.
   *
   * @returns Array of voice information objects
   */
  listVoices(): Promise<VoiceInfo[]> {
    return Promise.resolve([
      {
        id: this.defaultVoice,
        name: this.defaultVoice,
        tier: this.tier,
        isDefault: true,
      },
    ]);
  }

  /**
   * Check if the TTS server is reachable and healthy.
   *
   * Performs a GET against the server's `/v1/models` endpoint with a
   * timeout to verify the server is running and responding.
   *
   * @returns true if the server is reachable, false otherwise
   */
  async isHealthy(): Promise<boolean> {
    try {
      // Build the /v1/models health URL. The previous
      // `replace(/\/v1\/?$/, "/v1/models")` was a silent no-op whenever the
      // configured baseURL did not end in "/v1", leaving the health check
      // pointed at the bare base URL; append the path in that case instead.
      const healthUrl = /\/v1\/?$/.test(this.baseURL)
        ? this.baseURL.replace(/\/v1\/?$/, "/v1/models")
        : `${this.baseURL.replace(/\/+$/, "")}/v1/models`;
      const controller = new AbortController();
      const timeoutId = setTimeout(() => {
        controller.abort();
      }, HEALTH_CHECK_TIMEOUT_MS);
      try {
        const response = await fetch(healthUrl, {
          method: "GET",
          signal: controller.signal,
        });
        return response.ok;
      } finally {
        // Always clear the timer so a fast response doesn't leave it pending.
        clearTimeout(timeoutId);
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Health check failed for ${this.name}: ${message}`);
      return false;
    }
  }
}

View File

@@ -0,0 +1,468 @@
/**
* SpeachesSttProvider Tests
*
* TDD tests for the Speaches/faster-whisper STT provider.
* Tests cover transcription, error handling, health checks, and config injection.
*
* Issue #390
*/
import { describe, it, expect, beforeEach, vi } from "vitest";
import { SpeachesSttProvider } from "./speaches-stt.provider";
import type { SpeechConfig } from "../speech.config";
import type { TranscribeOptions } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// vi.mock() factories are hoisted, so everything they capture is created via
// vi.hoisted() to guarantee it exists before the factory can run.
const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  const mockCreate = vi.fn();
  const mockModelsList = vi.fn();
  // Mirror openai's toFile() helper faithfully: forward the options bag so
  // the produced File carries the caller's MIME type (the previous mock
  // silently dropped it, leaving File.type empty).
  const mockToFile = vi
    .fn()
    .mockImplementation(async (buffer: Buffer, name: string, options?: { type?: string }) => {
      return new File([buffer], name, options);
    });
  // Records every config object passed to `new OpenAI(...)` for assertions.
  const mockOpenAIConstructorCalls: Record<string, unknown>[] = [];
  return { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls };
});
vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      transcriptions: {
        create: mockCreate,
      },
    };
    models = {
      list: mockModelsList,
    };
    constructor(config: Record<string, unknown>) {
      mockOpenAIConstructorCalls.push(config);
    }
  }
  return {
    default: MockOpenAI,
    toFile: mockToFile,
  };
});
// ==========================================
// Test helpers
// ==========================================
/**
 * Build a SpeechConfig fixture: STT enabled against a Speaches server, all
 * TTS tiers disabled, and standard upload/duration/text limits. Individual
 * STT fields can be overridden per test.
 */
function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  const stt = {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "Systran/faster-whisper-large-v3-turbo",
    language: "en",
    ...overrides,
  };
  const tts = {
    default: { enabled: false, url: "", voice: "", format: "" },
    premium: { enabled: false, url: "" },
    fallback: { enabled: false, url: "" },
  };
  const limits = {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  };
  return { stt, tts, limits };
}
/**
 * Build a verbose_json-shaped transcription response ("Hello, world!", en,
 * 3.5s, one segment). Any top-level field can be replaced via `overrides`
 * (including setting it to undefined).
 */
function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  const defaultSegment = {
    id: 0,
    text: "Hello, world!",
    start: 0.0,
    end: 3.5,
    avg_logprob: -0.25,
    compression_ratio: 1.2,
    no_speech_prob: 0.01,
    seek: 0,
    temperature: 0.0,
    tokens: [1, 2, 3],
  };
  const base: Record<string, unknown> = {
    text: "Hello, world!",
    language: "en",
    duration: 3.5,
    segments: [defaultSegment],
  };
  return { ...base, ...overrides };
}
describe("SpeachesSttProvider", () => {
// Shared fixture: a fresh config + provider per test; the hoisted
// constructor-call log is cleared so config-injection assertions see only
// the calls made by the current test's setup.
let provider: SpeachesSttProvider;
let config: SpeechConfig;
beforeEach(() => {
vi.clearAllMocks();
mockOpenAIConstructorCalls.length = 0;
config = createTestConfig();
provider = new SpeachesSttProvider(config);
});
// ==========================================
// Provider identity
// ==========================================
describe("name", () => {
it("should have the name 'speaches'", () => {
expect(provider.name).toBe("speaches");
});
});
// ==========================================
// transcribe
// ==========================================
describe("transcribe", () => {
it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
await provider.transcribe(audio);
expect(mockCreate).toHaveBeenCalledOnce();
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.model).toBe("Systran/faster-whisper-large-v3-turbo");
expect(callArgs.language).toBe("en");
expect(callArgs.response_format).toBe("verbose_json");
});
it("should convert Buffer to File using toFile", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
await provider.transcribe(audio);
expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
type: "audio/wav",
});
});
it("should return TranscriptionResult with text and language", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const result = await provider.transcribe(audio);
expect(result.text).toBe("Hello, world!");
expect(result.language).toBe("en");
});
it("should return durationSeconds from verbose response", async () => {
const mockResponse = createMockVerboseResponse({ duration: 5.25 });
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const result = await provider.transcribe(audio);
expect(result.durationSeconds).toBe(5.25);
});
it("should map segments from verbose response", async () => {
const mockResponse = createMockVerboseResponse({
segments: [
{
id: 0,
text: "Hello,",
start: 0.0,
end: 1.5,
avg_logprob: -0.2,
compression_ratio: 1.1,
no_speech_prob: 0.01,
seek: 0,
temperature: 0.0,
tokens: [1, 2],
},
{
id: 1,
text: " world!",
start: 1.5,
end: 3.5,
avg_logprob: -0.3,
compression_ratio: 1.3,
no_speech_prob: 0.02,
seek: 0,
temperature: 0.0,
tokens: [3, 4],
},
],
});
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const result = await provider.transcribe(audio);
// Only text/start/end survive the mapping; whisper-internal fields
// (avg_logprob, tokens, ...) are dropped.
expect(result.segments).toHaveLength(2);
expect(result.segments?.[0]).toEqual({
text: "Hello,",
start: 0.0,
end: 1.5,
});
expect(result.segments?.[1]).toEqual({
text: " world!",
start: 1.5,
end: 3.5,
});
});
it("should handle response without segments gracefully", async () => {
const mockResponse = createMockVerboseResponse({ segments: undefined });
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const result = await provider.transcribe(audio);
expect(result.text).toBe("Hello, world!");
expect(result.segments).toBeUndefined();
});
it("should handle response without duration gracefully", async () => {
const mockResponse = createMockVerboseResponse({ duration: undefined });
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const result = await provider.transcribe(audio);
expect(result.text).toBe("Hello, world!");
expect(result.durationSeconds).toBeUndefined();
});
// ------------------------------------------
// Options override
// ------------------------------------------
describe("options override", () => {
it("should use custom model from options when provided", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const options: TranscribeOptions = { model: "custom-whisper-model" };
await provider.transcribe(audio, options);
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.model).toBe("custom-whisper-model");
});
it("should use custom language from options when provided", async () => {
const mockResponse = createMockVerboseResponse({ language: "fr" });
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const options: TranscribeOptions = { language: "fr" };
await provider.transcribe(audio, options);
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.language).toBe("fr");
});
it("should pass through prompt option", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };
await provider.transcribe(audio, options);
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.prompt).toBe("This is a meeting about project planning.");
});
it("should pass through temperature option", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const options: TranscribeOptions = { temperature: 0.3 };
await provider.transcribe(audio, options);
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.temperature).toBe(0.3);
});
it("should use custom mimeType for file conversion when provided", async () => {
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
const options: TranscribeOptions = { mimeType: "audio/mp3" };
await provider.transcribe(audio, options);
expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
type: "audio/mp3",
});
});
});
// ------------------------------------------
// Simple response fallback
// ------------------------------------------
describe("simple response fallback", () => {
it("should handle simple Transcription response (text only, no verbose fields)", async () => {
// Some configurations may return just { text: "..." } without verbose fields
const simpleResponse = { text: "Simple transcription result." };
mockCreate.mockResolvedValueOnce(simpleResponse);
const audio = Buffer.from("fake-audio-data");
const result = await provider.transcribe(audio);
expect(result.text).toBe("Simple transcription result.");
expect(result.language).toBe("en"); // Falls back to config language
expect(result.durationSeconds).toBeUndefined();
expect(result.segments).toBeUndefined();
});
});
});
// ==========================================
// Error handling
// ==========================================
describe("error handling", () => {
it("should throw a descriptive error on connection refused", async () => {
const connectionError = new Error("connect ECONNREFUSED 127.0.0.1:8000");
mockCreate.mockRejectedValueOnce(connectionError);
const audio = Buffer.from("fake-audio-data");
await expect(provider.transcribe(audio)).rejects.toThrow(
"STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000"
);
});
it("should throw a descriptive error on timeout", async () => {
const timeoutError = new Error("Request timed out");
mockCreate.mockRejectedValueOnce(timeoutError);
const audio = Buffer.from("fake-audio-data");
await expect(provider.transcribe(audio)).rejects.toThrow(
"STT transcription failed: Request timed out"
);
});
it("should throw a descriptive error on API error", async () => {
const apiError = new Error("Invalid model: nonexistent-model");
mockCreate.mockRejectedValueOnce(apiError);
const audio = Buffer.from("fake-audio-data");
await expect(provider.transcribe(audio)).rejects.toThrow(
"STT transcription failed: Invalid model: nonexistent-model"
);
});
it("should handle non-Error thrown values", async () => {
mockCreate.mockRejectedValueOnce("unexpected string error");
const audio = Buffer.from("fake-audio-data");
await expect(provider.transcribe(audio)).rejects.toThrow(
"STT transcription failed: unexpected string error"
);
});
});
// ==========================================
// isHealthy
// ==========================================
describe("isHealthy", () => {
it("should return true when the server is reachable", async () => {
mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });
const healthy = await provider.isHealthy();
expect(healthy).toBe(true);
});
it("should return false when the server is unreachable", async () => {
mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
});
it("should not throw on health check failure", async () => {
mockModelsList.mockRejectedValueOnce(new Error("Network error"));
await expect(provider.isHealthy()).resolves.toBe(false);
});
it("should return false on unexpected error types", async () => {
mockModelsList.mockRejectedValueOnce("string error");
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
});
});
// ==========================================
// Config injection
// ==========================================
// These tests read mockOpenAIConstructorCalls, which the hoisted mock fills
// with every config object passed to `new OpenAI(...)`.
describe("config injection", () => {
it("should create OpenAI client with baseURL from config", () => {
// The constructor was called in beforeEach
expect(mockOpenAIConstructorCalls).toHaveLength(1);
expect(mockOpenAIConstructorCalls[0]).toEqual(
expect.objectContaining({
baseURL: "http://speaches:8000/v1",
})
);
});
it("should use custom baseURL from config", () => {
mockOpenAIConstructorCalls.length = 0;
const customConfig = createTestConfig({
baseUrl: "http://custom-speaches:9000/v1",
});
new SpeachesSttProvider(customConfig);
expect(mockOpenAIConstructorCalls).toHaveLength(1);
expect(mockOpenAIConstructorCalls[0]).toEqual(
expect.objectContaining({
baseURL: "http://custom-speaches:9000/v1",
})
);
});
it("should use default model from config for transcription", async () => {
const customConfig = createTestConfig({
model: "Systran/faster-whisper-small",
});
const customProvider = new SpeachesSttProvider(customConfig);
const mockResponse = createMockVerboseResponse();
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
await customProvider.transcribe(audio);
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.model).toBe("Systran/faster-whisper-small");
});
it("should use default language from config for transcription", async () => {
const customConfig = createTestConfig({ language: "de" });
const customProvider = new SpeachesSttProvider(customConfig);
const mockResponse = createMockVerboseResponse({ language: "de" });
mockCreate.mockResolvedValueOnce(mockResponse);
const audio = Buffer.from("fake-audio-data");
await customProvider.transcribe(audio);
const callArgs = mockCreate.mock.calls[0][0];
expect(callArgs.language).toBe("de");
});
it("should set a dummy API key for local Speaches server", () => {
expect(mockOpenAIConstructorCalls).toHaveLength(1);
expect(mockOpenAIConstructorCalls[0]).toEqual(
expect.objectContaining({
apiKey: "not-needed",
})
);
});
});
});

View File

@@ -0,0 +1,180 @@
/**
* SpeachesSttProvider
*
* Speech-to-text provider using Speaches (faster-whisper backend).
* Connects to the Speaches server via its OpenAI-compatible
* `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
*
* Issue #390
*/
import { Injectable, Inject, Logger } from "@nestjs/common";
import OpenAI from "openai";
import { toFile } from "openai";
import { speechConfig, type SpeechConfig } from "../speech.config";
import type { ISTTProvider } from "../interfaces/stt-provider.interface";
import type {
TranscribeOptions,
TranscriptionResult,
TranscriptionSegment,
} from "../interfaces/speech-types";
/**
 * Lookup table from audio MIME type to file extension.
 * Hoisted to module scope so it is built once instead of on every call.
 */
const MIME_EXTENSION_MAP: Record<string, string> = {
  "audio/wav": "wav",
  "audio/wave": "wav",
  "audio/x-wav": "wav",
  "audio/mp3": "mp3",
  "audio/mpeg": "mp3",
  "audio/mp4": "mp4",
  "audio/m4a": "m4a",
  "audio/ogg": "ogg",
  "audio/flac": "flac",
  "audio/webm": "webm",
  "audio/mpga": "mpga",
};
/**
 * Derive file extension from a MIME type for use in the uploaded file name.
 * Unknown MIME types fall back to "wav".
 *
 * @param mimeType - Audio MIME type (e.g. "audio/mpeg")
 * @returns File extension without a leading dot
 */
function extensionFromMimeType(mimeType: string): string {
  return MIME_EXTENSION_MAP[mimeType] ?? "wav";
}
/**
* STT provider backed by a Speaches (faster-whisper) server.
*
* Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint,
* so we re-use the official OpenAI SDK with a custom `baseURL`.
*
* @example
* ```typescript
* const provider = new SpeachesSttProvider(speechConfig);
* const result = await provider.transcribe(audioBuffer, { language: "en" });
* console.log(result.text);
* ```
*/
@Injectable()
export class SpeachesSttProvider implements ISTTProvider {
readonly name = "speaches";
private readonly logger = new Logger(SpeachesSttProvider.name);
private readonly client: OpenAI;
private readonly config: SpeechConfig;
/**
 * @param config - Injected speech configuration providing the STT base URL,
 *   default model, and default language.
 */
constructor(
@Inject(speechConfig.KEY)
config: SpeechConfig
) {
this.config = config;
// Speaches speaks the OpenAI transcription API, so the official SDK is
// reused with a custom baseURL pointing at the Speaches server.
this.client = new OpenAI({
baseURL: config.stt.baseUrl,
apiKey: "not-needed", // Speaches does not require an API key
});
this.logger.log(
`Speaches STT provider initialized (endpoint: ${config.stt.baseUrl}, model: ${config.stt.model})`
);
}
/**
 * Transcribe audio data to text using the Speaches server.
 *
 * Sends the audio buffer to the `/v1/audio/transcriptions` endpoint with
 * `response_format=verbose_json` so segment and duration metadata is
 * included when the server provides it.
 *
 * @param audio - Raw audio data as a Buffer
 * @param options - Optional transcription parameters (model, language, prompt, temperature)
 * @returns Transcription result with text, language, duration, and optional segments
 * @throws {Error} If transcription fails (connection error, API error, etc.)
 */
async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
  const sttDefaults = this.config.stt;
  const model = options?.model ?? sttDefaults.model;
  const language = options?.language ?? sttDefaults.language;
  const mimeType = options?.mimeType ?? "audio/wav";
  try {
    // Wrap the raw buffer as an uploadable file, named after its MIME type.
    const uploadFile = await toFile(audio, `audio.${extensionFromMimeType(mimeType)}`, {
      type: mimeType,
    });
    const response = await this.client.audio.transcriptions.create({
      file: uploadFile,
      model,
      language,
      response_format: "verbose_json",
      // Only forward optional tuning parameters the caller actually set.
      ...(options?.prompt === undefined ? {} : { prompt: options.prompt }),
      ...(options?.temperature === undefined ? {} : { temperature: options.temperature }),
    });
    return this.mapResponse(response, language);
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    this.logger.error(`Transcription failed: ${message}`);
    throw new Error(`STT transcription failed: ${message}`);
  }
}
/**
* Check if the Speaches server is healthy and reachable.
*
* Attempts to list models from the server. Returns true if the request
* succeeds, false otherwise.
*
* @returns true if the Speaches server is reachable and ready
*/
async isHealthy(): Promise<boolean> {
try {
await this.client.models.list();
return true;
} catch (error: unknown) {
const message = error instanceof Error ? error.message : String(error);
this.logger.warn(`Speaches health check failed: ${message}`);
return false;
}
}
/**
* Map the OpenAI SDK transcription response to our TranscriptionResult type.
*
* Handles both verbose responses (with duration, segments) and simple
* responses (text only).
*/
private mapResponse(
response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
fallbackLanguage: string
): TranscriptionResult {
const text = (response as { text: string }).text;
const verboseResponse = response as {
text: string;
language?: string;
duration?: number;
segments?: {
text: string;
start: number;
end: number;
}[];
};
const result: TranscriptionResult = {
text,
language: verboseResponse.language ?? fallbackLanguage,
};
if (verboseResponse.duration !== undefined) {
result.durationSeconds = verboseResponse.duration;
}
if (verboseResponse.segments !== undefined && Array.isArray(verboseResponse.segments)) {
result.segments = verboseResponse.segments.map(
(segment): TranscriptionSegment => ({
text: segment.text,
start: segment.start,
end: segment.end,
})
);
}
return result;
}
}

View File

@@ -0,0 +1,279 @@
/**
* TTS Provider Factory Unit Tests
*
* Tests the factory that creates and registers TTS providers based on config.
*
* Issue #391
*/
import { describe, it, expect, vi } from "vitest";
import { createTTSProviders } from "./tts-provider.factory";
import type { SpeechConfig } from "../speech.config";
import type { SpeechTier } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// NOTE(review): vitest hoists vi.mock() calls above the imports, so this
// factory must stay self-contained (no references to variables declared
// elsewhere in the module).
vi.mock("openai", () => {
  // Minimal stand-in for the OpenAI client: only the surface the TTS
  // providers touch (client.audio.speech.create) is stubbed.
  class MockOpenAI {
    audio = {
      speech: {
        create: vi.fn(),
      },
    };
  }
  // Replace the package's default export (the OpenAI class).
  return { default: MockOpenAI };
});
// ==========================================
// Test helpers
// ==========================================

/**
 * Build a complete SpeechConfig for tests, with every feature disabled.
 * Top-level sections (stt / tts / limits) can be swapped out via `overrides`.
 */
function createTestConfig(overrides?: Partial<SpeechConfig>): SpeechConfig {
  const base: SpeechConfig = {
    stt: {
      enabled: false,
      baseUrl: "http://speaches:8000/v1",
      model: "whisper",
      language: "en",
    },
    tts: {
      default: {
        enabled: false,
        url: "http://kokoro-tts:8880/v1",
        voice: "af_heart",
        format: "mp3",
      },
      premium: { enabled: false, url: "http://chatterbox-tts:8881/v1" },
      fallback: { enabled: false, url: "http://openedai-speech:8000/v1" },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
  };
  // Shallow merge: an override replaces its whole top-level section.
  return { ...base, ...overrides };
}
describe("createTTSProviders", () => {
// ==========================================
// Empty map when nothing enabled
// ==========================================
describe("when no TTS tiers are enabled", () => {
it("should return an empty map", () => {
const config = createTestConfig();
const providers = createTTSProviders(config);
expect(providers).toBeInstanceOf(Map);
expect(providers.size).toBe(0);
});
});
// ==========================================
// Default tier
// ==========================================
describe("when default tier is enabled", () => {
it("should create a provider for the default tier", () => {
const config = createTestConfig({
tts: {
default: {
enabled: true,
url: "http://kokoro-tts:8880/v1",
voice: "af_heart",
format: "mp3",
},
premium: { enabled: false, url: "" },
fallback: { enabled: false, url: "" },
},
});
const providers = createTTSProviders(config);
expect(providers.size).toBe(1);
expect(providers.has("default")).toBe(true);
const provider = providers.get("default");
expect(provider).toBeDefined();
expect(provider?.tier).toBe("default");
expect(provider?.name).toBe("kokoro");
});
});
// ==========================================
// Premium tier
// ==========================================
describe("when premium tier is enabled", () => {
it("should create a provider for the premium tier", () => {
const config = createTestConfig({
tts: {
default: { enabled: false, url: "", voice: "", format: "" },
premium: {
enabled: true,
url: "http://chatterbox-tts:8881/v1",
},
fallback: { enabled: false, url: "" },
},
});
const providers = createTTSProviders(config);
expect(providers.size).toBe(1);
expect(providers.has("premium")).toBe(true);
const provider = providers.get("premium");
expect(provider).toBeDefined();
expect(provider?.tier).toBe("premium");
expect(provider?.name).toBe("chatterbox");
});
});
// ==========================================
// Fallback tier
// ==========================================
describe("when fallback tier is enabled", () => {
it("should create a provider for the fallback tier", () => {
const config = createTestConfig({
tts: {
default: { enabled: false, url: "", voice: "", format: "" },
premium: { enabled: false, url: "" },
fallback: {
enabled: true,
url: "http://openedai-speech:8000/v1",
},
},
});
const providers = createTTSProviders(config);
expect(providers.size).toBe(1);
expect(providers.has("fallback")).toBe(true);
const provider = providers.get("fallback");
expect(provider).toBeDefined();
expect(provider?.tier).toBe("fallback");
expect(provider?.name).toBe("piper");
});
});
// ==========================================
// Multiple tiers
// ==========================================
describe("when multiple tiers are enabled", () => {
it("should create providers for all enabled tiers", () => {
const config = createTestConfig({
tts: {
default: {
enabled: true,
url: "http://kokoro-tts:8880/v1",
voice: "af_heart",
format: "mp3",
},
premium: {
enabled: true,
url: "http://chatterbox-tts:8881/v1",
},
fallback: {
enabled: true,
url: "http://openedai-speech:8000/v1",
},
},
});
const providers = createTTSProviders(config);
expect(providers.size).toBe(3);
expect(providers.has("default")).toBe(true);
expect(providers.has("premium")).toBe(true);
expect(providers.has("fallback")).toBe(true);
});
it("should create providers only for enabled tiers", () => {
const config = createTestConfig({
tts: {
default: {
enabled: true,
url: "http://kokoro-tts:8880/v1",
voice: "af_heart",
format: "mp3",
},
premium: { enabled: false, url: "" },
fallback: {
enabled: true,
url: "http://openedai-speech:8000/v1",
},
},
});
const providers = createTTSProviders(config);
expect(providers.size).toBe(2);
expect(providers.has("default")).toBe(true);
expect(providers.has("premium")).toBe(false);
expect(providers.has("fallback")).toBe(true);
});
});
// ==========================================
// Provider properties
// ==========================================
describe("provider properties", () => {
it("should implement ITTSProvider interface methods", () => {
const config = createTestConfig({
tts: {
default: {
enabled: true,
url: "http://kokoro-tts:8880/v1",
voice: "af_heart",
format: "mp3",
},
premium: { enabled: false, url: "" },
fallback: { enabled: false, url: "" },
},
});
const providers = createTTSProviders(config);
const provider = providers.get("default");
expect(provider).toBeDefined();
expect(typeof provider?.synthesize).toBe("function");
expect(typeof provider?.listVoices).toBe("function");
expect(typeof provider?.isHealthy).toBe("function");
});
it("should return providers as a Map<SpeechTier, ITTSProvider>", () => {
const config = createTestConfig({
tts: {
default: {
enabled: true,
url: "http://kokoro-tts:8880/v1",
voice: "af_heart",
format: "mp3",
},
premium: { enabled: false, url: "" },
fallback: { enabled: false, url: "" },
},
});
const providers = createTTSProviders(config);
// Verify the map keys are valid SpeechTier values
for (const [tier] of providers) {
expect(["default", "premium", "fallback"]).toContain(tier as SpeechTier);
}
});
});
});

View File

@@ -0,0 +1,112 @@
/**
* TTS Provider Factory
*
* Creates and registers TTS providers based on speech configuration.
* Reads enabled flags and URLs from config and instantiates the appropriate
* provider for each tier.
*
* Each tier maps to a specific TTS engine:
* - default: Kokoro-FastAPI (CPU, always available)
* - premium: Chatterbox (GPU, voice cloning)
* - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
*
* Issue #391
*/
import { Logger } from "@nestjs/common";
import { BaseTTSProvider } from "./base-tts.provider";
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
import type { SpeechConfig } from "../speech.config";
// ==========================================
// Concrete provider classes
// ==========================================

/**
 * Kokoro TTS provider (default tier).
 * CPU-based, always available, Apache 2.0 license.
 *
 * Declares no constructor of its own: the factory passes
 * (baseURL, voice, format) straight through to BaseTTSProvider.
 */
class KokoroProvider extends BaseTTSProvider {
  readonly name = "kokoro";
  readonly tier: SpeechTier = "default";
}
/**
 * Chatterbox TTS provider (premium tier).
 * GPU required, voice cloning capable, MIT license.
 *
 * The premium tier config carries only a URL, so voice ("default") and
 * output format ("mp3") are fixed here.
 * NOTE(review): presumably the Chatterbox OpenAI-compatible shim accepts
 * "default" as a voice name — confirm against the server's voice list.
 */
class ChatterboxProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";
  constructor(baseURL: string) {
    // Hard-coded voice/format; only the base URL is configurable.
    super(baseURL, "default", "mp3");
  }
}
/**
 * Piper TTS provider via OpenedAI Speech (fallback tier).
 * Ultra-lightweight CPU, GPL license.
 *
 * The fallback tier config carries only a URL, so voice ("alloy") and
 * output format ("mp3") are fixed here.
 * NOTE(review): "alloy" looks like OpenedAI Speech's OpenAI-compatible
 * voice alias — confirm it maps to the intended Piper voice.
 */
class PiperProvider extends BaseTTSProvider {
  readonly name = "piper";
  readonly tier: SpeechTier = "fallback";
  constructor(baseURL: string) {
    // Hard-coded voice/format; only the base URL is configurable.
    super(baseURL, "alloy", "mp3");
  }
}
// ==========================================
// Factory function
// ==========================================
const logger = new Logger("TTSProviderFactory");

/**
 * Create and register TTS providers based on the speech configuration.
 *
 * Only creates providers for tiers that are enabled in the config.
 * Returns a Map keyed by SpeechTier for use with the TTS_PROVIDERS injection token.
 *
 * @param config - Speech configuration with TTS tier settings
 * @returns Map of enabled TTS providers keyed by tier
 */
export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSProvider> {
  const providers = new Map<SpeechTier, ITTSProvider>();
  const { default: defaultTier, premium, fallback } = config.tts;

  // Default tier: Kokoro (the only tier whose voice/format are configurable)
  if (defaultTier.enabled) {
    providers.set(
      "default",
      new KokoroProvider(defaultTier.url, defaultTier.voice, defaultTier.format as AudioFormat)
    );
    logger.log(`Registered default TTS provider: kokoro at ${defaultTier.url}`);
  }

  // Premium tier: Chatterbox
  if (premium.enabled) {
    providers.set("premium", new ChatterboxProvider(premium.url));
    logger.log(`Registered premium TTS provider: chatterbox at ${premium.url}`);
  }

  // Fallback tier: Piper
  if (fallback.enabled) {
    providers.set("fallback", new PiperProvider(fallback.url));
    logger.log(`Registered fallback TTS provider: piper at ${fallback.url}`);
  }

  // Summarize what was registered (or warn when synthesis is unavailable).
  if (providers.size > 0) {
    const tierNames = [...providers.keys()].join(", ");
    logger.log(`TTS providers ready: ${tierNames} (${String(providers.size)} total)`);
  } else {
    logger.warn("No TTS providers are enabled. TTS synthesis will not be available.");
  }

  return providers;
}

View File

@@ -4,36 +4,60 @@
* NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
* Provides a provider abstraction layer with graceful fallback for TTS tiers.
*
* TTS providers are created dynamically based on configuration:
* - default: Kokoro-FastAPI (CPU, always available)
* - premium: Chatterbox (GPU, voice cloning)
* - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
*
* Imports:
* - ConfigModule.forFeature(speechConfig) for speech configuration
*
* Providers:
* - SpeechService: High-level speech operations with provider selection
* - TTS_PROVIDERS: Empty Map<SpeechTier, ITTSProvider> (populated by provider modules)
* - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
*
* Exports:
* - SpeechService for use by other modules (e.g., controllers, brain)
*
* Issue #389
* Issue #389, #390, #391
*/
import { Module, type OnModuleInit, Logger } from "@nestjs/common";
import { ConfigModule } from "@nestjs/config";
import { speechConfig, validateSpeechConfig } from "./speech.config";
import { ConfigModule, ConfigService } from "@nestjs/config";
import {
speechConfig,
validateSpeechConfig,
isSttEnabled,
type SpeechConfig,
} from "./speech.config";
import { SpeechService } from "./speech.service";
import { TTS_PROVIDERS } from "./speech.constants";
import type { SpeechTier } from "./interfaces/speech-types";
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
import { createTTSProviders } from "./providers/tts-provider.factory";
@Module({
imports: [ConfigModule.forFeature(speechConfig)],
providers: [
SpeechService,
// Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
// will register their providers in subsequent tasks.
// STT provider: conditionally register SpeachesSttProvider when STT is enabled
...(isSttEnabled()
? [
{
provide: STT_PROVIDER,
useClass: SpeachesSttProvider,
},
]
: []),
{
provide: TTS_PROVIDERS,
useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
useFactory: (configService: ConfigService) => {
const config = configService.get<SpeechConfig>("speech");
if (!config) {
return new Map();
}
return createTTSProviders(config);
},
inject: [ConfigService],
},
],
exports: [SpeechService],