chore: upgrade Node.js runtime to v24 across codebase #419
468
apps/api/src/speech/providers/speaches-stt.provider.spec.ts
Normal file
468
apps/api/src/speech/providers/speaches-stt.provider.spec.ts
Normal file
@@ -0,0 +1,468 @@
|
|||||||
|
/**
|
||||||
|
* SpeachesSttProvider Tests
|
||||||
|
*
|
||||||
|
* TDD tests for the Speaches/faster-whisper STT provider.
|
||||||
|
* Tests cover transcription, error handling, health checks, and config injection.
|
||||||
|
*
|
||||||
|
* Issue #390
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect, beforeEach, vi } from "vitest";
|
||||||
|
import { SpeachesSttProvider } from "./speaches-stt.provider";
|
||||||
|
import type { SpeechConfig } from "../speech.config";
|
||||||
|
import type { TranscribeOptions } from "../interfaces/speech-types";
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Mock OpenAI SDK
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
// The mock handles are built inside vi.hoisted() so that the vi.mock("openai")
// factory in this file can reference them.
const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  // Collects every config object handed to the mocked OpenAI constructor.
  const constructorCalls: Array<Record<string, unknown>> = [];

  return {
    // Stand-in for `client.audio.transcriptions.create`.
    mockCreate: vi.fn(),
    // Stand-in for `client.models.list`.
    mockModelsList: vi.fn(),
    // Stand-in for the SDK's `toFile`; wraps the buffer in a File with the given name.
    mockToFile: vi.fn().mockImplementation(async (buffer: Buffer, name: string) => {
      return new File([buffer], name);
    }),
    mockOpenAIConstructorCalls: constructorCalls,
  };
});
|
||||||
|
|
||||||
|
vi.mock("openai", () => {
  /**
   * Minimal replacement for the OpenAI client: it exposes only
   * `audio.transcriptions.create` and `models.list`, both backed by the
   * shared mock functions, and records the config it was constructed with.
   */
  class FakeOpenAI {
    audio = { transcriptions: { create: mockCreate } };
    models = { list: mockModelsList };

    constructor(config: Record<string, unknown>) {
      mockOpenAIConstructorCalls.push(config);
    }
  }

  // Mirror the module shape the provider imports: default export + named `toFile`.
  return { default: FakeOpenAI, toFile: mockToFile };
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Test helpers
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
/**
 * Build a SpeechConfig for tests.
 *
 * STT is enabled and points at a fixed Speaches URL/model/language; every TTS
 * tier is disabled. Only the `stt` section can be customised.
 *
 * @param overrides - Fields merged over the default `stt` section
 * @returns A complete SpeechConfig object
 */
function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  const stt: SpeechConfig["stt"] = {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "Systran/faster-whisper-large-v3-turbo",
    language: "en",
    ...overrides,
  };

  const tts: SpeechConfig["tts"] = {
    default: { enabled: false, url: "", voice: "", format: "" },
    premium: { enabled: false, url: "" },
    fallback: { enabled: false, url: "" },
  };

  const limits: SpeechConfig["limits"] = {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  };

  return { stt, tts, limits };
}
|
||||||
|
|
||||||
|
/**
 * Build a fake `verbose_json` transcription payload.
 *
 * The default payload carries text, language, duration and a single segment.
 * Keys in `overrides` replace the defaults, including keys explicitly set to
 * `undefined`, which lets tests simulate missing fields.
 *
 * @param overrides - Top-level fields to replace or add
 * @returns A fresh response object (nothing is shared between calls)
 */
function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  const onlySegment = {
    id: 0,
    text: "Hello, world!",
    start: 0.0,
    end: 3.5,
    avg_logprob: -0.25,
    compression_ratio: 1.2,
    no_speech_prob: 0.01,
    seek: 0,
    temperature: 0.0,
    tokens: [1, 2, 3],
  };

  const defaults = {
    text: "Hello, world!",
    language: "en",
    duration: 3.5,
    segments: [onlySegment],
  };

  return { ...defaults, ...overrides };
}
|
||||||
|
|
||||||
|
describe("SpeachesSttProvider", () => {
|
||||||
|
let provider: SpeachesSttProvider;
|
||||||
|
let config: SpeechConfig;
|
||||||
|
|
||||||
|
beforeEach(() => {
  // Start every test with clean mock history and an empty constructor log,
  // then build a provider from the default test config.
  vi.clearAllMocks();
  mockOpenAIConstructorCalls.splice(0);

  config = createTestConfig();
  provider = new SpeachesSttProvider(config);
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Provider identity
|
||||||
|
// ==========================================
|
||||||
|
describe("name", () => {
  it("should have the name 'speaches'", () => {
    // The provider identifies itself through its `name` property.
    const { name } = provider;

    expect(name).toBe("speaches");
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// transcribe
|
||||||
|
// ==========================================
|
||||||
|
describe("transcribe", () => {
  // Shared fake payload reused by every test in this group.
  const audio = Buffer.from("fake-audio-data");

  /** Queue `response` as the next SDK result, then transcribe the shared audio. */
  async function transcribeReturning(
    response: Record<string, unknown>,
    options?: TranscribeOptions
  ) {
    mockCreate.mockResolvedValueOnce(response);
    return provider.transcribe(audio, options);
  }

  /** Request object passed to the first recorded `transcriptions.create` call. */
  const firstCreateArgs = () => mockCreate.mock.calls[0][0];

  it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
    await transcribeReturning(createMockVerboseResponse());

    expect(mockCreate).toHaveBeenCalledOnce();
    const request = firstCreateArgs();
    expect(request.model).toBe("Systran/faster-whisper-large-v3-turbo");
    expect(request.language).toBe("en");
    expect(request.response_format).toBe("verbose_json");
  });

  it("should convert Buffer to File using toFile", async () => {
    await transcribeReturning(createMockVerboseResponse());

    expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
      type: "audio/wav",
    });
  });

  it("should return TranscriptionResult with text and language", async () => {
    const { text, language } = await transcribeReturning(createMockVerboseResponse());

    expect(text).toBe("Hello, world!");
    expect(language).toBe("en");
  });

  it("should return durationSeconds from verbose response", async () => {
    const result = await transcribeReturning(createMockVerboseResponse({ duration: 5.25 }));

    expect(result.durationSeconds).toBe(5.25);
  });

  it("should map segments from verbose response", async () => {
    // Raw segments carry extra Whisper metadata that the provider must drop.
    const rawSegments = [
      {
        id: 0,
        text: "Hello,",
        start: 0.0,
        end: 1.5,
        avg_logprob: -0.2,
        compression_ratio: 1.1,
        no_speech_prob: 0.01,
        seek: 0,
        temperature: 0.0,
        tokens: [1, 2],
      },
      {
        id: 1,
        text: " world!",
        start: 1.5,
        end: 3.5,
        avg_logprob: -0.3,
        compression_ratio: 1.3,
        no_speech_prob: 0.02,
        seek: 0,
        temperature: 0.0,
        tokens: [3, 4],
      },
    ];

    const result = await transcribeReturning(createMockVerboseResponse({ segments: rawSegments }));

    expect(result.segments).toHaveLength(2);
    expect(result.segments?.[0]).toEqual({ text: "Hello,", start: 0.0, end: 1.5 });
    expect(result.segments?.[1]).toEqual({ text: " world!", start: 1.5, end: 3.5 });
  });

  it("should handle response without segments gracefully", async () => {
    const result = await transcribeReturning(createMockVerboseResponse({ segments: undefined }));

    expect(result.text).toBe("Hello, world!");
    expect(result.segments).toBeUndefined();
  });

  it("should handle response without duration gracefully", async () => {
    const result = await transcribeReturning(createMockVerboseResponse({ duration: undefined }));

    expect(result.text).toBe("Hello, world!");
    expect(result.durationSeconds).toBeUndefined();
  });

  // ------------------------------------------
  // Options override
  // ------------------------------------------
  describe("options override", () => {
    it("should use custom model from options when provided", async () => {
      const options: TranscribeOptions = { model: "custom-whisper-model" };

      await transcribeReturning(createMockVerboseResponse(), options);

      expect(firstCreateArgs().model).toBe("custom-whisper-model");
    });

    it("should use custom language from options when provided", async () => {
      const options: TranscribeOptions = { language: "fr" };

      await transcribeReturning(createMockVerboseResponse({ language: "fr" }), options);

      expect(firstCreateArgs().language).toBe("fr");
    });

    it("should pass through prompt option", async () => {
      const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };

      await transcribeReturning(createMockVerboseResponse(), options);

      expect(firstCreateArgs().prompt).toBe("This is a meeting about project planning.");
    });

    it("should pass through temperature option", async () => {
      const options: TranscribeOptions = { temperature: 0.3 };

      await transcribeReturning(createMockVerboseResponse(), options);

      expect(firstCreateArgs().temperature).toBe(0.3);
    });

    it("should use custom mimeType for file conversion when provided", async () => {
      const options: TranscribeOptions = { mimeType: "audio/mp3" };

      await transcribeReturning(createMockVerboseResponse(), options);

      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
        type: "audio/mp3",
      });
    });
  });

  // ------------------------------------------
  // Simple response fallback
  // ------------------------------------------
  describe("simple response fallback", () => {
    it("should handle simple Transcription response (text only, no verbose fields)", async () => {
      // Some configurations may return just { text: "..." } without verbose fields
      const result = await transcribeReturning({ text: "Simple transcription result." });

      expect(result.text).toBe("Simple transcription result.");
      expect(result.language).toBe("en"); // Falls back to config language
      expect(result.durationSeconds).toBeUndefined();
      expect(result.segments).toBeUndefined();
    });
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Error handling
|
||||||
|
// ==========================================
|
||||||
|
describe("error handling", () => {
  // Each row: [test title, value the SDK call rejects with, message the provider must throw].
  const rejectionCases: Array<[string, unknown, string]> = [
    [
      "should throw a descriptive error on connection refused",
      new Error("connect ECONNREFUSED 127.0.0.1:8000"),
      "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000",
    ],
    [
      "should throw a descriptive error on timeout",
      new Error("Request timed out"),
      "STT transcription failed: Request timed out",
    ],
    [
      "should throw a descriptive error on API error",
      new Error("Invalid model: nonexistent-model"),
      "STT transcription failed: Invalid model: nonexistent-model",
    ],
    [
      "should handle non-Error thrown values",
      "unexpected string error",
      "STT transcription failed: unexpected string error",
    ],
  ];

  for (const [title, rejection, expectedMessage] of rejectionCases) {
    it(title, async () => {
      mockCreate.mockRejectedValueOnce(rejection);

      const audio = Buffer.from("fake-audio-data");

      await expect(provider.transcribe(audio)).rejects.toThrow(expectedMessage);
    });
  }
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// isHealthy
|
||||||
|
// ==========================================
|
||||||
|
describe("isHealthy", () => {
  // The health check is backed by the mocked `models.list` call: a resolved
  // call must report healthy, any rejection must report unhealthy without throwing.

  it("should return true when the server is reachable", async () => {
    mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });

    await expect(provider.isHealthy()).resolves.toBe(true);
  });

  it("should return false when the server is unreachable", async () => {
    mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));

    await expect(provider.isHealthy()).resolves.toBe(false);
  });

  it("should not throw on health check failure", async () => {
    mockModelsList.mockRejectedValueOnce(new Error("Network error"));

    const outcome = await provider.isHealthy();

    expect(outcome).toBe(false);
  });

  it("should return false on unexpected error types", async () => {
    mockModelsList.mockRejectedValueOnce("string error");

    await expect(provider.isHealthy()).resolves.toBe(false);
  });
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Config injection
|
||||||
|
// ==========================================
|
||||||
|
describe("config injection", () => {
  /** Request object passed to the first recorded `transcriptions.create` call. */
  const firstCreateArgs = () => mockCreate.mock.calls[0][0];

  it("should create OpenAI client with baseURL from config", () => {
    // The constructor was called in beforeEach
    expect(mockOpenAIConstructorCalls).toHaveLength(1);

    const [constructorConfig] = mockOpenAIConstructorCalls;
    expect(constructorConfig).toEqual(
      expect.objectContaining({ baseURL: "http://speaches:8000/v1" })
    );
  });

  it("should use custom baseURL from config", () => {
    // Forget the provider built in beforeEach so only the custom one is counted.
    mockOpenAIConstructorCalls.length = 0;

    new SpeachesSttProvider(createTestConfig({ baseUrl: "http://custom-speaches:9000/v1" }));

    expect(mockOpenAIConstructorCalls).toHaveLength(1);
    const [constructorConfig] = mockOpenAIConstructorCalls;
    expect(constructorConfig).toEqual(
      expect.objectContaining({ baseURL: "http://custom-speaches:9000/v1" })
    );
  });

  it("should use default model from config for transcription", async () => {
    const customProvider = new SpeachesSttProvider(
      createTestConfig({ model: "Systran/faster-whisper-small" })
    );
    mockCreate.mockResolvedValueOnce(createMockVerboseResponse());

    await customProvider.transcribe(Buffer.from("fake-audio-data"));

    expect(firstCreateArgs().model).toBe("Systran/faster-whisper-small");
  });

  it("should use default language from config for transcription", async () => {
    const customProvider = new SpeachesSttProvider(createTestConfig({ language: "de" }));
    mockCreate.mockResolvedValueOnce(createMockVerboseResponse({ language: "de" }));

    await customProvider.transcribe(Buffer.from("fake-audio-data"));

    expect(firstCreateArgs().language).toBe("de");
  });

  it("should set a dummy API key for local Speaches server", () => {
    expect(mockOpenAIConstructorCalls).toHaveLength(1);

    const [constructorConfig] = mockOpenAIConstructorCalls;
    expect(constructorConfig).toEqual(expect.objectContaining({ apiKey: "not-needed" }));
  });
});
|
||||||
|
});
|
||||||
180
apps/api/src/speech/providers/speaches-stt.provider.ts
Normal file
180
apps/api/src/speech/providers/speaches-stt.provider.ts
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
/**
|
||||||
|
* SpeachesSttProvider
|
||||||
|
*
|
||||||
|
* Speech-to-text provider using Speaches (faster-whisper backend).
|
||||||
|
* Connects to the Speaches server via its OpenAI-compatible
|
||||||
|
* `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
|
||||||
|
*
|
||||||
|
* Issue #390
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { Injectable, Inject, Logger } from "@nestjs/common";
|
||||||
|
import OpenAI from "openai";
|
||||||
|
import { toFile } from "openai";
|
||||||
|
import { speechConfig, type SpeechConfig } from "../speech.config";
|
||||||
|
import type { ISTTProvider } from "../interfaces/stt-provider.interface";
|
||||||
|
import type {
|
||||||
|
TranscribeOptions,
|
||||||
|
TranscriptionResult,
|
||||||
|
TranscriptionSegment,
|
||||||
|
} from "../interfaces/speech-types";
|
||||||
|
|
||||||
|
/**
 * Known audio MIME types (lower-case, without parameters) and the file
 * extension used for each. A Map is used instead of an object literal so that
 * inherited keys such as "constructor" can never match.
 */
const EXTENSION_BY_MIME_TYPE: ReadonlyMap<string, string> = new Map<string, string>([
  ["audio/wav", "wav"],
  ["audio/wave", "wav"],
  ["audio/x-wav", "wav"],
  ["audio/mp3", "mp3"],
  ["audio/mpeg", "mp3"],
  ["audio/mp4", "mp4"],
  ["audio/m4a", "m4a"],
  ["audio/ogg", "ogg"],
  ["audio/flac", "flac"],
  ["audio/webm", "webm"],
  ["audio/mpga", "mpga"],
]);

/**
 * Derive file extension from a MIME type for use in the uploaded file name.
 *
 * The lookup ignores MIME parameters (`audio/webm;codecs=opus` → `webm`),
 * letter case and surrounding whitespace.
 *
 * @param mimeType - MIME type of the audio, optionally with parameters
 * @returns The matching extension without a leading dot, or `"wav"` when the
 *   type is empty or not in the table
 */
function extensionFromMimeType(mimeType: string): string {
  // Keep only the "type/subtype" part; anything after ';' is a parameter.
  const parameterStart = mimeType.indexOf(";");
  const essence = (parameterStart === -1 ? mimeType : mimeType.slice(0, parameterStart))
    .trim()
    .toLowerCase();

  return EXTENSION_BY_MIME_TYPE.get(essence) ?? "wav";
}
|
||||||
|
|
||||||
|
/**
 * STT provider backed by a Speaches (faster-whisper) server.
 *
 * Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint,
 * so we re-use the official OpenAI SDK with a custom `baseURL`.
 *
 * @example
 * ```typescript
 * const provider = new SpeachesSttProvider(speechConfig);
 * const result = await provider.transcribe(audioBuffer, { language: "en" });
 * console.log(result.text);
 * ```
 */
@Injectable()
export class SpeachesSttProvider implements ISTTProvider {
  /** Provider identifier. */
  readonly name = "speaches";

  private readonly logger = new Logger(SpeachesSttProvider.name);
  private readonly client: OpenAI;
  private readonly config: SpeechConfig;

  /**
   * @param config - Speech configuration; `stt.baseUrl` selects the server and
   *   `stt.model` / `stt.language` are the defaults used by `transcribe`
   */
  constructor(
    @Inject(speechConfig.KEY)
    config: SpeechConfig
  ) {
    this.config = config;

    this.client = new OpenAI({
      baseURL: config.stt.baseUrl,
      apiKey: "not-needed", // Speaches does not require an API key
    });

    this.logger.log(
      `Speaches STT provider initialized (endpoint: ${config.stt.baseUrl}, model: ${config.stt.model})`
    );
  }

  /**
   * Transcribe audio data to text using the Speaches server.
   *
   * Sends the audio buffer to the `/v1/audio/transcriptions` endpoint
   * with `response_format=verbose_json` to get segments and duration data.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters (model, language, prompt,
   *   temperature, mimeType). Model and language default to the configured
   *   values; mimeType defaults to `audio/wav`.
   * @returns Transcription result with text, language, duration, and optional segments
   * @throws {Error} If transcription fails (connection error, API error, etc.);
   *   the message is prefixed with `STT transcription failed: `
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    const model = options?.model ?? this.config.stt.model;
    const language = options?.language ?? this.config.stt.language;
    const mimeType = options?.mimeType ?? "audio/wav";
    const extension = extensionFromMimeType(mimeType);

    try {
      // The uploaded file is named by its extension derived from the MIME type.
      const file = await toFile(audio, `audio.${extension}`, {
        type: mimeType,
      });

      const response = await this.client.audio.transcriptions.create({
        file,
        model,
        language,
        response_format: "verbose_json",
        // Only forward optional tuning parameters the caller actually supplied.
        ...(options?.prompt !== undefined ? { prompt: options.prompt } : {}),
        ...(options?.temperature !== undefined ? { temperature: options.temperature } : {}),
      });

      return this.mapResponse(response, language);
    } catch (error: unknown) {
      const message = SpeachesSttProvider.errorMessage(error);
      this.logger.error(`Transcription failed: ${message}`);
      throw new Error(`STT transcription failed: ${message}`);
    }
  }

  /**
   * Check if the Speaches server is healthy and reachable.
   *
   * Attempts to list models from the server. Returns true if the request
   * succeeds, false otherwise. Never throws: failures are logged as warnings.
   *
   * @returns true if the model listing request succeeded
   */
  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
      return true;
    } catch (error: unknown) {
      this.logger.warn(
        `Speaches health check failed: ${SpeachesSttProvider.errorMessage(error)}`
      );
      return false;
    }
  }

  /** Extract a printable message from any thrown value. */
  private static errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Map the OpenAI SDK transcription response to our TranscriptionResult type.
   *
   * Handles both verbose responses (with duration, segments) and simple
   * responses (text only). Optional fields are copied only when they have the
   * expected runtime type, so a JSON `null` duration or segments value is
   * treated the same as a missing one.
   *
   * @param response - Payload returned by the transcription endpoint
   * @param fallbackLanguage - Language reported when the payload has none
   * @returns The normalized transcription result
   */
  private mapResponse(
    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
    fallbackLanguage: string
  ): TranscriptionResult {
    const payload = response as {
      text: string;
      language?: string | null;
      duration?: number | null;
      segments?:
        | {
            text: string;
            start: number;
            end: number;
          }[]
        | null;
    };

    const result: TranscriptionResult = {
      text: payload.text,
      language: payload.language ?? fallbackLanguage,
    };

    // typeof guard (rather than `!== undefined`) keeps a null duration out of the result.
    if (typeof payload.duration === "number") {
      result.durationSeconds = payload.duration;
    }

    if (Array.isArray(payload.segments)) {
      // Keep only the fields of TranscriptionSegment; drop the remaining metadata.
      result.segments = payload.segments.map(
        ({ text, start, end }): TranscriptionSegment => ({ text, start, end })
      );
    }

    return result;
  }
}
|
||||||
@@ -4,36 +4,60 @@
|
|||||||
* NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
|
* NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
|
||||||
* Provides a provider abstraction layer with graceful fallback for TTS tiers.
|
* Provides a provider abstraction layer with graceful fallback for TTS tiers.
|
||||||
*
|
*
|
||||||
|
* TTS providers are created dynamically based on configuration:
|
||||||
|
* - default: Kokoro-FastAPI (CPU, always available)
|
||||||
|
* - premium: Chatterbox (GPU, voice cloning)
|
||||||
|
* - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
|
||||||
|
*
|
||||||
* Imports:
|
* Imports:
|
||||||
* - ConfigModule.forFeature(speechConfig) for speech configuration
|
* - ConfigModule.forFeature(speechConfig) for speech configuration
|
||||||
*
|
*
|
||||||
* Providers:
|
* Providers:
|
||||||
* - SpeechService: High-level speech operations with provider selection
|
* - SpeechService: High-level speech operations with provider selection
|
||||||
* - TTS_PROVIDERS: Empty Map<SpeechTier, ITTSProvider> (populated by provider modules)
|
* - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
|
||||||
*
|
*
|
||||||
* Exports:
|
* Exports:
|
||||||
* - SpeechService for use by other modules (e.g., controllers, brain)
|
* - SpeechService for use by other modules (e.g., controllers, brain)
|
||||||
*
|
*
|
||||||
* Issue #389
|
* Issue #389, #390, #391
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { Module, type OnModuleInit, Logger } from "@nestjs/common";
|
import { Module, type OnModuleInit, Logger } from "@nestjs/common";
|
||||||
import { ConfigModule } from "@nestjs/config";
|
import { ConfigModule, ConfigService } from "@nestjs/config";
|
||||||
import { speechConfig, validateSpeechConfig } from "./speech.config";
|
import {
|
||||||
|
speechConfig,
|
||||||
|
validateSpeechConfig,
|
||||||
|
isSttEnabled,
|
||||||
|
type SpeechConfig,
|
||||||
|
} from "./speech.config";
|
||||||
import { SpeechService } from "./speech.service";
|
import { SpeechService } from "./speech.service";
|
||||||
import { TTS_PROVIDERS } from "./speech.constants";
|
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
|
||||||
import type { SpeechTier } from "./interfaces/speech-types";
|
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
|
||||||
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
|
import { createTTSProviders } from "./providers/tts-provider.factory";
|
||||||
|
|
||||||
@Module({
|
@Module({
|
||||||
imports: [ConfigModule.forFeature(speechConfig)],
|
imports: [ConfigModule.forFeature(speechConfig)],
|
||||||
providers: [
|
providers: [
|
||||||
SpeechService,
|
SpeechService,
|
||||||
// Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.)
|
// STT provider: conditionally register SpeachesSttProvider when STT is enabled
|
||||||
// will register their providers in subsequent tasks.
|
...(isSttEnabled()
|
||||||
|
? [
|
||||||
|
{
|
||||||
|
provide: STT_PROVIDER,
|
||||||
|
useClass: SpeachesSttProvider,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
: []),
|
||||||
{
|
{
|
||||||
provide: TTS_PROVIDERS,
|
provide: TTS_PROVIDERS,
|
||||||
useFactory: (): Map<SpeechTier, ITTSProvider> => new Map(),
|
useFactory: (configService: ConfigService) => {
|
||||||
|
const config = configService.get<SpeechConfig>("speech");
|
||||||
|
if (!config) {
|
||||||
|
return new Map();
|
||||||
|
}
|
||||||
|
return createTTSProviders(config);
|
||||||
|
},
|
||||||
|
inject: [ConfigService],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
exports: [SpeechService],
|
exports: [SpeechService],
|
||||||
|
|||||||
Reference in New Issue
Block a user