chore: upgrade Node.js runtime to v24 across codebase #419
468
apps/api/src/speech/providers/speaches-stt.provider.spec.ts
Normal file
468
apps/api/src/speech/providers/speaches-stt.provider.spec.ts
Normal file
@@ -0,0 +1,468 @@
|
||||
/**
|
||||
* SpeachesSttProvider Tests
|
||||
*
|
||||
* TDD tests for the Speaches/faster-whisper STT provider.
|
||||
* Tests cover transcription, error handling, health checks, and config injection.
|
||||
*
|
||||
* Issue #390
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach, vi } from "vitest";
|
||||
import { SpeachesSttProvider } from "./speaches-stt.provider";
|
||||
import type { SpeechConfig } from "../speech.config";
|
||||
import type { TranscribeOptions } from "../interfaces/speech-types";
|
||||
|
||||
// ==========================================
|
||||
// Mock OpenAI SDK
|
||||
// ==========================================
|
||||
|
||||
/**
 * Mock handles shared between the `openai` module mock and the tests.
 *
 * They are built inside `vi.hoisted` so that they already exist when the
 * `vi.mock("openai", ...)` factory runs.
 *
 * - `mockCreate` stands in for `audio.transcriptions.create`.
 * - `mockModelsList` stands in for `models.list`.
 * - `mockToFile` stands in for the SDK's `toFile` helper and wraps the buffer in a `File`.
 * - `mockOpenAIConstructorCalls` records every config object passed to the mocked client.
 */
const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  const constructorArgsLog: Array<Record<string, unknown>> = [];
  const wrapAsFile = async (buffer: Buffer, name: string) => new File([buffer], name);

  return {
    mockCreate: vi.fn(),
    mockModelsList: vi.fn(),
    mockToFile: vi.fn().mockImplementation(wrapAsFile),
    mockOpenAIConstructorCalls: constructorArgsLog,
  };
});
|
||||
|
||||
/**
 * Replace the `openai` module with a stub.
 *
 * The default export is a class exposing only the members the provider uses
 * (`audio.transcriptions.create` and `models.list`), both wired to the hoisted
 * mock functions. Every constructor call records its config object so tests can
 * assert on the client options. The named `toFile` export is the hoisted mock.
 */
vi.mock("openai", () => {
  class OpenAIStub {
    readonly audio: { transcriptions: { create: typeof mockCreate } };
    readonly models: { list: typeof mockModelsList };

    constructor(config: Record<string, unknown>) {
      this.audio = { transcriptions: { create: mockCreate } };
      this.models = { list: mockModelsList };
      mockOpenAIConstructorCalls.push(config);
    }
  }

  return { default: OpenAIStub, toFile: mockToFile };
});
|
||||
|
||||
// ==========================================
|
||||
// Test helpers
|
||||
// ==========================================
|
||||
|
||||
/**
 * Build a complete SpeechConfig for tests.
 *
 * STT is enabled and points at a Speaches endpoint; every TTS tier is disabled.
 *
 * @param overrides - STT fields to replace in the default STT section
 * @returns A fresh config object on every call
 */
function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  const stt: SpeechConfig["stt"] = {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "Systran/faster-whisper-large-v3-turbo",
    language: "en",
    ...overrides,
  };

  const tts: SpeechConfig["tts"] = {
    default: { enabled: false, url: "", voice: "", format: "" },
    premium: { enabled: false, url: "" },
    fallback: { enabled: false, url: "" },
  };

  const limits: SpeechConfig["limits"] = {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  };

  return { stt, tts, limits };
}
|
||||
|
||||
/**
 * Build a `verbose_json`-shaped transcription response for the mocked client.
 *
 * The default payload holds one segment covering the whole 3.5 second clip.
 *
 * @param overrides - Top-level fields to replace or add; an explicit
 *   `undefined` value overwrites the default with `undefined`
 * @returns A fresh response object on every call
 */
function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  const transcript = "Hello, world!";
  const clipSeconds = 3.5;

  const wholeClipSegment = {
    id: 0,
    text: transcript,
    start: 0.0,
    end: clipSeconds,
    avg_logprob: -0.25,
    compression_ratio: 1.2,
    no_speech_prob: 0.01,
    seek: 0,
    temperature: 0.0,
    tokens: [1, 2, 3],
  };

  const defaults: Record<string, unknown> = {
    text: transcript,
    language: "en",
    duration: clipSeconds,
    segments: [wholeClipSegment],
  };

  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Specification for SpeachesSttProvider.
 *
 * The OpenAI SDK is mocked, so these tests only verify how the provider builds
 * its requests, maps responses, wraps errors, reports health, and consumes the
 * injected configuration.
 */
describe("SpeachesSttProvider", () => {
  let provider: SpeachesSttProvider;
  let config: SpeechConfig;

  /** Fresh audio payload; the bytes are irrelevant because the client is mocked. */
  const fakeAudio = (): Buffer => Buffer.from("fake-audio-data");

  /** First argument of the first recorded `transcriptions.create` call. */
  const firstCreateArgs = () => mockCreate.mock.calls[0][0];

  /**
   * Queue `response` as the next transcription result, then transcribe a fresh
   * audio buffer with the shared provider.
   */
  async function runTranscribe(response: unknown, options?: TranscribeOptions) {
    mockCreate.mockResolvedValueOnce(response);
    const audio = fakeAudio();
    const result = await provider.transcribe(audio, options);
    return { audio, result };
  }

  /** Make the next transcription call reject with `thrown` and assert the wrapped message. */
  async function expectTranscribeRejection(thrown: unknown, expectedMessage: string): Promise<void> {
    mockCreate.mockRejectedValueOnce(thrown);
    const audio = fakeAudio();
    await expect(provider.transcribe(audio)).rejects.toThrow(expectedMessage);
  }

  /** Assert exactly one OpenAI client was constructed and its config contains `expected`. */
  function expectSingleClientConfig(expected: Record<string, unknown>): void {
    expect(mockOpenAIConstructorCalls).toHaveLength(1);
    expect(mockOpenAIConstructorCalls[0]).toEqual(expect.objectContaining(expected));
  }

  beforeEach(() => {
    vi.clearAllMocks();
    mockOpenAIConstructorCalls.length = 0;
    config = createTestConfig();
    provider = new SpeachesSttProvider(config);
  });

  // ==========================================
  // Provider identity
  // ==========================================
  describe("name", () => {
    it("should have the name 'speaches'", () => {
      expect(provider.name).toBe("speaches");
    });
  });

  // ==========================================
  // transcribe
  // ==========================================
  describe("transcribe", () => {
    it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
      await runTranscribe(createMockVerboseResponse());

      expect(mockCreate).toHaveBeenCalledOnce();
      const sent = firstCreateArgs();
      expect(sent.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(sent.language).toBe("en");
      expect(sent.response_format).toBe("verbose_json");
    });

    it("should convert Buffer to File using toFile", async () => {
      const { audio } = await runTranscribe(createMockVerboseResponse());

      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
        type: "audio/wav",
      });
    });

    it("should return TranscriptionResult with text and language", async () => {
      const { result } = await runTranscribe(createMockVerboseResponse());

      expect(result.text).toBe("Hello, world!");
      expect(result.language).toBe("en");
    });

    it("should return durationSeconds from verbose response", async () => {
      const { result } = await runTranscribe(createMockVerboseResponse({ duration: 5.25 }));

      expect(result.durationSeconds).toBe(5.25);
    });

    it("should map segments from verbose response", async () => {
      const rawSegments = [
        {
          id: 0,
          text: "Hello,",
          start: 0.0,
          end: 1.5,
          avg_logprob: -0.2,
          compression_ratio: 1.1,
          no_speech_prob: 0.01,
          seek: 0,
          temperature: 0.0,
          tokens: [1, 2],
        },
        {
          id: 1,
          text: " world!",
          start: 1.5,
          end: 3.5,
          avg_logprob: -0.3,
          compression_ratio: 1.3,
          no_speech_prob: 0.02,
          seek: 0,
          temperature: 0.0,
          tokens: [3, 4],
        },
      ];

      const { result } = await runTranscribe(createMockVerboseResponse({ segments: rawSegments }));

      // Only text/start/end are expected to survive the mapping.
      expect(result.segments).toHaveLength(2);
      const [first, second] = result.segments ?? [];
      expect(first).toEqual({ text: "Hello,", start: 0.0, end: 1.5 });
      expect(second).toEqual({ text: " world!", start: 1.5, end: 3.5 });
    });

    it("should handle response without segments gracefully", async () => {
      const { result } = await runTranscribe(createMockVerboseResponse({ segments: undefined }));

      expect(result.text).toBe("Hello, world!");
      expect(result.segments).toBeUndefined();
    });

    it("should handle response without duration gracefully", async () => {
      const { result } = await runTranscribe(createMockVerboseResponse({ duration: undefined }));

      expect(result.text).toBe("Hello, world!");
      expect(result.durationSeconds).toBeUndefined();
    });

    // ------------------------------------------
    // Options override
    // ------------------------------------------
    describe("options override", () => {
      it("should use custom model from options when provided", async () => {
        const options: TranscribeOptions = { model: "custom-whisper-model" };

        await runTranscribe(createMockVerboseResponse(), options);

        expect(firstCreateArgs().model).toBe("custom-whisper-model");
      });

      it("should use custom language from options when provided", async () => {
        const options: TranscribeOptions = { language: "fr" };

        await runTranscribe(createMockVerboseResponse({ language: "fr" }), options);

        expect(firstCreateArgs().language).toBe("fr");
      });

      it("should pass through prompt option", async () => {
        const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };

        await runTranscribe(createMockVerboseResponse(), options);

        expect(firstCreateArgs().prompt).toBe("This is a meeting about project planning.");
      });

      it("should pass through temperature option", async () => {
        const options: TranscribeOptions = { temperature: 0.3 };

        await runTranscribe(createMockVerboseResponse(), options);

        expect(firstCreateArgs().temperature).toBe(0.3);
      });

      it("should use custom mimeType for file conversion when provided", async () => {
        const options: TranscribeOptions = { mimeType: "audio/mp3" };

        const { audio } = await runTranscribe(createMockVerboseResponse(), options);

        expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
          type: "audio/mp3",
        });
      });
    });

    // ------------------------------------------
    // Simple response fallback
    // ------------------------------------------
    describe("simple response fallback", () => {
      it("should handle simple Transcription response (text only, no verbose fields)", async () => {
        // Some configurations may return just { text: "..." } without verbose fields
        const { result } = await runTranscribe({ text: "Simple transcription result." });

        expect(result.text).toBe("Simple transcription result.");
        expect(result.language).toBe("en"); // Falls back to config language
        expect(result.durationSeconds).toBeUndefined();
        expect(result.segments).toBeUndefined();
      });
    });
  });

  // ==========================================
  // Error handling
  // ==========================================
  describe("error handling", () => {
    it("should throw a descriptive error on connection refused", () =>
      expectTranscribeRejection(
        new Error("connect ECONNREFUSED 127.0.0.1:8000"),
        "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000"
      ));

    it("should throw a descriptive error on timeout", () =>
      expectTranscribeRejection(
        new Error("Request timed out"),
        "STT transcription failed: Request timed out"
      ));

    it("should throw a descriptive error on API error", () =>
      expectTranscribeRejection(
        new Error("Invalid model: nonexistent-model"),
        "STT transcription failed: Invalid model: nonexistent-model"
      ));

    it("should handle non-Error thrown values", () =>
      expectTranscribeRejection(
        "unexpected string error",
        "STT transcription failed: unexpected string error"
      ));
  });

  // ==========================================
  // isHealthy
  // ==========================================
  describe("isHealthy", () => {
    it("should return true when the server is reachable", async () => {
      mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });

      expect(await provider.isHealthy()).toBe(true);
    });

    it("should return false when the server is unreachable", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));

      expect(await provider.isHealthy()).toBe(false);
    });

    it("should not throw on health check failure", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("Network error"));

      await expect(provider.isHealthy()).resolves.toBe(false);
    });

    it("should return false on unexpected error types", async () => {
      mockModelsList.mockRejectedValueOnce("string error");

      expect(await provider.isHealthy()).toBe(false);
    });
  });

  // ==========================================
  // Config injection
  // ==========================================
  describe("config injection", () => {
    it("should create OpenAI client with baseURL from config", () => {
      // The constructor was called in beforeEach
      expectSingleClientConfig({ baseURL: "http://speaches:8000/v1" });
    });

    it("should use custom baseURL from config", () => {
      // Discard the construction recorded by beforeEach so only the custom one is counted.
      mockOpenAIConstructorCalls.length = 0;
      const customConfig = createTestConfig({
        baseUrl: "http://custom-speaches:9000/v1",
      });
      new SpeachesSttProvider(customConfig);

      expectSingleClientConfig({ baseURL: "http://custom-speaches:9000/v1" });
    });

    it("should use default model from config for transcription", async () => {
      const customProvider = new SpeachesSttProvider(
        createTestConfig({ model: "Systran/faster-whisper-small" })
      );
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse());

      await customProvider.transcribe(fakeAudio());

      expect(firstCreateArgs().model).toBe("Systran/faster-whisper-small");
    });

    it("should use default language from config for transcription", async () => {
      const customProvider = new SpeachesSttProvider(createTestConfig({ language: "de" }));
      mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ language: "de" }));

      await customProvider.transcribe(fakeAudio());

      expect(firstCreateArgs().language).toBe("de");
    });

    it("should set a dummy API key for local Speaches server", () => {
      expectSingleClientConfig({ apiKey: "not-needed" });
    });
  });
});
|
||||
180
apps/api/src/speech/providers/speaches-stt.provider.ts
Normal file
180
apps/api/src/speech/providers/speaches-stt.provider.ts
Normal file
@@ -0,0 +1,180 @@
|
||||
/**
|
||||
* SpeachesSttProvider
|
||||
*
|
||||
* Speech-to-text provider using Speaches (faster-whisper backend).
|
||||
* Connects to the Speaches server via its OpenAI-compatible
|
||||
* `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
|
||||
*
|
||||
* Issue #390
|
||||
*/
|
||||
|
||||
import { Injectable, Inject, Logger } from "@nestjs/common";
|
||||
import OpenAI from "openai";
|
||||
import { toFile } from "openai";
|
||||
import { speechConfig, type SpeechConfig } from "../speech.config";
|
||||
import type { ISTTProvider } from "../interfaces/stt-provider.interface";
|
||||
import type {
|
||||
TranscribeOptions,
|
||||
TranscriptionResult,
|
||||
TranscriptionSegment,
|
||||
} from "../interfaces/speech-types";
|
||||
|
||||
/**
 * Known audio MIME types (lower-case, without parameters) mapped to the file
 * extension used for the uploaded file name.
 *
 * A Map is used instead of a plain object so that inherited keys such as
 * "constructor" can never be returned as an extension.
 */
const MIME_TYPE_EXTENSIONS: ReadonlyMap<string, string> = new Map([
  ["audio/wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/mp3", "mp3"],
  ["audio/mpeg", "mp3"],
  ["audio/mp4", "mp4"],
  ["audio/m4a", "m4a"],
  ["audio/ogg", "ogg"],
  ["audio/flac", "flac"],
  ["audio/webm", "webm"],
  ["audio/mpga", "mpga"],
]);

/**
 * Derive file extension from a MIME type for use in the uploaded file name.
 *
 * The lookup ignores letter case, surrounding whitespace, and any parameters
 * after the first `;`, so `"audio/webm;codecs=opus"` and `"Audio/WebM"` both
 * resolve to `"webm"`.
 *
 * @param mimeType - MIME type of the audio payload, optionally with parameters
 * @returns The matching extension without a leading dot, or `"wav"` when the
 *   type is not recognised
 */
function extensionFromMimeType(mimeType: string): string {
  // Keep only the "type/subtype" part; parameters do not affect the extension.
  const [essence = ""] = mimeType.split(";");
  const normalized = essence.trim().toLowerCase();

  return MIME_TYPE_EXTENSIONS.get(normalized) ?? "wav";
}
|
||||
|
||||
/**
 * Speech-to-text provider that delegates to a Speaches (faster-whisper) server.
 *
 * Speaches serves an OpenAI-compatible `/v1/audio/transcriptions` endpoint, so
 * the official OpenAI SDK is pointed at it through a custom `baseURL` instead
 * of using a hand-written HTTP client.
 *
 * @example
 * ```typescript
 * const provider = new SpeachesSttProvider(speechConfig);
 * const result = await provider.transcribe(audioBuffer, { language: "en" });
 * console.log(result.text);
 * ```
 */
@Injectable()
export class SpeachesSttProvider implements ISTTProvider {
  readonly name = "speaches";

  private readonly logger = new Logger(SpeachesSttProvider.name);
  private readonly client: OpenAI;
  private readonly config: SpeechConfig;

  /**
   * @param config - Speech configuration; `stt.baseUrl` selects the server and
   *   `stt.model` / `stt.language` act as per-request defaults
   */
  constructor(
    @Inject(speechConfig.KEY)
    config: SpeechConfig
  ) {
    const { baseUrl, model } = config.stt;

    this.config = config;
    this.client = new OpenAI({
      baseURL: baseUrl,
      // Speaches does not require an API key; a placeholder is passed to the SDK.
      apiKey: "not-needed",
    });

    this.logger.log(`Speaches STT provider initialized (endpoint: ${baseUrl}, model: ${model})`);
  }

  /**
   * Transcribe audio data to text using the Speaches server.
   *
   * The buffer is wrapped as a file named after its MIME type and sent with
   * `response_format=verbose_json` so that duration and segments are returned.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional overrides (model, language, mimeType, prompt, temperature)
   * @returns Transcription result with text, language, and, when the server
   *   supplies them, duration and segments
   * @throws {Error} With message `STT transcription failed: <reason>` on any failure
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    const { stt } = this.config;
    const model = options?.model ?? stt.model;
    const language = options?.language ?? stt.language;
    const mimeType = options?.mimeType ?? "audio/wav";
    const fileName = `audio.${extensionFromMimeType(mimeType)}`;

    // Optional parameters are only included in the request when the caller set them.
    const promptParam = options?.prompt === undefined ? {} : { prompt: options.prompt };
    const temperatureParam =
      options?.temperature === undefined ? {} : { temperature: options.temperature };

    try {
      const upload = await toFile(audio, fileName, { type: mimeType });

      const response = await this.client.audio.transcriptions.create({
        file: upload,
        model,
        language,
        response_format: "verbose_json",
        ...promptParam,
        ...temperatureParam,
      });

      return this.mapResponse(response, language);
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.error(`Transcription failed: ${reason}`);
      throw new Error(`STT transcription failed: ${reason}`);
    }
  }

  /**
   * Check if the Speaches server is healthy and reachable.
   *
   * Lists the server's models as a probe. Failures are logged as warnings and
   * reported as `false`; this method does not throw.
   *
   * @returns true if the model listing succeeds, false otherwise
   */
  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Speaches health check failed: ${reason}`);
      return false;
    }

    return true;
  }

  /**
   * Convert an SDK transcription response into a TranscriptionResult.
   *
   * Accepts both verbose responses (language, duration, segments) and simple
   * text-only responses. Missing fields are left unset, except `language`,
   * which falls back to the language that was requested.
   *
   * @param response - Response returned by `audio.transcriptions.create`
   * @param fallbackLanguage - Language used when the response does not report one
   */
  private mapResponse(
    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
    fallbackLanguage: string
  ): TranscriptionResult {
    const { text, language, duration, segments } = response as {
      text: string;
      language?: string;
      duration?: number;
      segments?: {
        text: string;
        start: number;
        end: number;
      }[];
    };

    const mapped: TranscriptionResult = {
      text,
      language: language ?? fallbackLanguage,
    };

    if (duration !== undefined) {
      mapped.durationSeconds = duration;
    }

    if (Array.isArray(segments)) {
      // Keep only the fields exposed by TranscriptionSegment.
      mapped.segments = segments.map(
        ({ text: segmentText, start, end }): TranscriptionSegment => ({
          text: segmentText,
          start,
          end,
        })
      );
    }

    return mapped;
  }
}
|
||||
@@ -4,36 +4,60 @@
|
||||
* NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
|
||||
* Provides a provider abstraction layer with graceful fallback for TTS tiers.
|
||||
*
|
||||
* TTS providers are created dynamically based on configuration:
|
||||
* - default: Kokoro-FastAPI (CPU, always available)
|
||||
* - premium: Chatterbox (GPU, voice cloning)
|
||||
* - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
|
||||
*
|
||||
* Imports:
|
||||
* - ConfigModule.forFeature(speechConfig) for speech configuration
|
||||
*
|
||||
* Providers:
|
||||
* - SpeechService: High-level speech operations with provider selection
|
||||
* - TTS_PROVIDERS: Empty Map<SpeechTier, ITTSProvider> (populated by provider modules)
|
||||
* - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
|
||||
*
|
||||
* Exports:
|
||||
* - SpeechService for use by other modules (e.g., controllers, brain)
|
||||
*
|
||||
* Issue #389
|
||||
* Issue #389, #390, #391
|
||||
*/
|
||||
|
||||
import { Module, type OnModuleInit, Logger } from "@nestjs/common";
|
||||
import { ConfigModule } from "@nestjs/config";
|
||||
import { speechConfig, validateSpeechConfig } from "./speech.config";
|
||||
import { ConfigModule, ConfigService } from "@nestjs/config";
|
||||
import {
|
||||
speechConfig,
|
||||
validateSpeechConfig,
|
||||
isSttEnabled,
|
||||
type SpeechConfig,
|
||||
} from "./speech.config";
|
||||
import { SpeechService } from "./speech.service";
|
||||
import { TTS_PROVIDERS } from "./speech.constants";
|
||||
import type { SpeechTier } from "./interfaces/speech-types";
|
||||
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
|
||||
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
|
||||
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
|
||||
import { createTTSProviders } from "./providers/tts-provider.factory";
|
||||
|
||||
@Module({
|
||||
imports: [ConfigModule.forFeature(speechConfig)],
|
||||
providers: [
|
||||
SpeechService,
|
||||
// Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
|
||||
// will register their providers in subsequent tasks.
|
||||
// STT provider: conditionally register SpeachesSttProvider when STT is enabled
|
||||
...(isSttEnabled()
|
||||
? [
|
||||
{
|
||||
provide: STT_PROVIDER,
|
||||
useClass: SpeachesSttProvider,
|
||||
},
|
||||
]
|
||||
: []),
|
||||
{
|
||||
provide: TTS_PROVIDERS,
|
||||
useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
|
||||
useFactory: (configService: ConfigService) => {
|
||||
const config = configService.get<SpeechConfig>("speech");
|
||||
if (!config) {
|
||||
return new Map();
|
||||
}
|
||||
return createTTSProviders(config);
|
||||
},
|
||||
inject: [ConfigService],
|
||||
},
|
||||
],
|
||||
exports: [SpeechService],
|
||||
|
||||
Reference in New Issue
Block a user