feat(#394): implement Chatterbox TTS provider with voice cloning
All checks were successful
ci/woodpecker/push/api Pipeline was successful
Add ChatterboxSynthesizeOptions interface with referenceAudio and emotionExaggeration fields, and comprehensive unit tests (26 tests) covering voice cloning, emotion control, clamping, graceful degradation, and cross-language support. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -128,6 +128,33 @@ export interface SynthesisResult {
|
|||||||
durationSeconds?: number;
|
durationSeconds?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Extended options for Chatterbox TTS synthesis.
 *
 * Chatterbox supports voice cloning via a reference audio buffer and
 * emotion exaggeration control. These are passed as extra body parameters
 * to the OpenAI-compatible API.
 *
 * Issue #394
 */
export interface ChatterboxSynthesizeOptions extends SynthesizeOptions {
  /**
   * Reference audio buffer for voice cloning.
   * When provided, Chatterbox will clone the voice from this audio sample.
   * Should be a WAV or MP3 file of 5-30 seconds for best results.
   */
  referenceAudio?: Buffer;

  /**
   * Emotion exaggeration factor (0.0 to 1.0).
   * Controls how much emotional expression is applied to the synthesized speech.
   * - 0.0: Neutral, minimal emotion
   * - 0.5: Moderate emotion (default when not specified)
   * - 1.0: Maximum emotion exaggeration
   *
   * NOTE(review): the provider's unit tests expect values outside [0.0, 1.0]
   * to be clamped before being sent to the API — confirm the implementation
   * actually clamps rather than rejecting.
   */
  emotionExaggeration?: number;
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Information about an available TTS voice.
|
* Information about an available TTS voice.
|
||||||
*/
|
*/
|
||||||
|
|||||||
436
apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts
Normal file
436
apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts
Normal file
@@ -0,0 +1,436 @@
|
|||||||
|
/**
|
||||||
|
* ChatterboxTTSProvider Unit Tests
|
||||||
|
*
|
||||||
|
* Tests the premium-tier TTS provider with voice cloning and
|
||||||
|
* emotion exaggeration support for Chatterbox.
|
||||||
|
*
|
||||||
|
* Issue #394
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, it, expect, beforeEach, vi, type Mock } from "vitest";
|
||||||
|
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
|
||||||
|
import type { ChatterboxSynthesizeOptions, AudioFormat } from "../interfaces/speech-types";
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Mock OpenAI SDK
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
// Shared spy standing in for OpenAI's `audio.speech.create`; individual tests
// program its resolved/rejected value and inspect the body it was called with.
const mockCreate = vi.fn();

// Replace the real `openai` package with a minimal stub exposing only the
// `audio.speech.create` surface the provider uses. vitest hoists `vi.mock`
// calls, so this factory runs before the provider module imports OpenAI.
vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: mockCreate,
      },
    };
  }
  // The provider does `import OpenAI from "openai"`, hence the default export.
  return { default: MockOpenAI };
});
|
||||||
|
|
||||||
|
// ==========================================
|
||||||
|
// Test helpers
|
||||||
|
// ==========================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return.
|
||||||
|
*/
|
||||||
|
function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } {
|
||||||
|
return {
|
||||||
|
arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Top-level suite: exercises ChatterboxTTSProvider against the mocked OpenAI
// SDK (`mockCreate`) and a stubbed global `fetch` for health checks. A fresh
// provider instance is built per test so mock state never leaks between tests.
describe("ChatterboxTTSProvider", () => {
  let provider: ChatterboxTTSProvider;

  const testBaseURL = "http://chatterbox-tts:8881/v1";

  beforeEach(() => {
    vi.clearAllMocks();
    provider = new ChatterboxTTSProvider(testBaseURL);
  });

  // ==========================================
  // Provider identity
  // ==========================================

  describe("provider identity", () => {
    it("should have name 'chatterbox'", () => {
      expect(provider.name).toBe("chatterbox");
    });

    it("should have tier 'premium'", () => {
      expect(provider.tier).toBe("premium");
    });
  });

  // ==========================================
  // Constructor
  // ==========================================

  describe("constructor", () => {
    it("should create an instance with the provided baseURL", () => {
      expect(provider).toBeDefined();
    });

    it("should use 'default' as the default voice", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello");

      expect(result.voice).toBe("default");
    });

    it("should use 'wav' as the default format", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello");

      expect(result.format).toBe("wav");
    });
  });

  // ==========================================
  // synthesize() — basic (no Chatterbox-specific options)
  // ==========================================

  describe("synthesize (basic)", () => {
    it("should synthesize text and return a SynthesisResult", async () => {
      const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello, world!");

      expect(result).toBeDefined();
      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.audio.length).toBe(audioBytes.length);
      expect(result.format).toBe("wav");
      expect(result.voice).toBe("default");
      expect(result.tier).toBe("premium");
    });

    it("should pass correct base parameters to OpenAI SDK when no extra options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Test text");

      // Exact-match assertion: no Chatterbox extras may leak into the body.
      expect(mockCreate).toHaveBeenCalledWith({
        model: "tts-1",
        input: "Test text",
        voice: "default",
        response_format: "wav",
        speed: 1.0,
      });
    });

    it("should use custom voice from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = { voice: "cloned_voice_1" };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "cloned_voice_1" }));
      expect(result.voice).toBe("cloned_voice_1");
    });

    it("should use custom format from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = { format: "mp3" as AudioFormat };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" }));
      expect(result.format).toBe("mp3");
    });

    it("should throw on synthesis failure", async () => {
      mockCreate.mockRejectedValue(new Error("GPU out of memory"));

      // Provider is expected to wrap the SDK error with provider context.
      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for chatterbox: GPU out of memory"
      );
    });
  });

  // ==========================================
  // synthesize() — voice cloning (referenceAudio)
  // ==========================================

  describe("synthesize (voice cloning)", () => {
    it("should pass referenceAudio as base64 in extra body params", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const referenceAudio = Buffer.from("fake-audio-data-for-cloning");
      const options: ChatterboxSynthesizeOptions = {
        referenceAudio,
      };

      await provider.synthesize("Clone my voice", options);

      // The Buffer must be transported as base64 under the snake_case key.
      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          input: "Clone my voice",
          reference_audio: referenceAudio.toString("base64"),
        })
      );
    });

    it("should not include reference_audio when referenceAudio is not provided", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("No cloning");

      const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>;
      expect(callArgs).not.toHaveProperty("reference_audio");
    });
  });

  // ==========================================
  // synthesize() — emotion exaggeration
  // ==========================================

  describe("synthesize (emotion exaggeration)", () => {
    it("should pass emotionExaggeration as exaggeration in extra body params", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 0.7,
      };

      await provider.synthesize("Very emotional text", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 0.7,
        })
      );
    });

    it("should not include exaggeration when emotionExaggeration is not provided", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Neutral text");

      const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>;
      expect(callArgs).not.toHaveProperty("exaggeration");
    });

    // 0.0 is a valid, meaningful value — guards against `if (x)` truthiness
    // bugs dropping it from the request body.
    it("should accept emotionExaggeration of 0.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 0.0,
      };

      await provider.synthesize("Minimal emotion", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 0.0,
        })
      );
    });

    it("should accept emotionExaggeration of 1.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 1.0,
      };

      await provider.synthesize("Maximum emotion", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 1.0,
        })
      );
    });

    it("should clamp emotionExaggeration above 1.0 to 1.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 1.5,
      };

      await provider.synthesize("Over the top", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 1.0,
        })
      );
    });

    it("should clamp emotionExaggeration below 0.0 to 0.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: -0.5,
      };

      await provider.synthesize("Negative emotion", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 0.0,
        })
      );
    });
  });

  // ==========================================
  // synthesize() — combined options
  // ==========================================

  describe("synthesize (combined options)", () => {
    it("should handle referenceAudio and emotionExaggeration together", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02, 0x03]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const referenceAudio = Buffer.from("reference-audio-sample");
      const options: ChatterboxSynthesizeOptions = {
        voice: "custom_voice",
        format: "mp3",
        speed: 0.9,
        referenceAudio,
        emotionExaggeration: 0.6,
      };

      const result = await provider.synthesize("Full options test", options);

      // Exact-match: all base params plus both Chatterbox extras, nothing else.
      expect(mockCreate).toHaveBeenCalledWith({
        model: "tts-1",
        input: "Full options test",
        voice: "custom_voice",
        response_format: "mp3",
        speed: 0.9,
        reference_audio: referenceAudio.toString("base64"),
        exaggeration: 0.6,
      });

      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.voice).toBe("custom_voice");
      expect(result.format).toBe("mp3");
      expect(result.tier).toBe("premium");
    });
  });

  // ==========================================
  // isHealthy() — graceful degradation
  // ==========================================

  // NOTE(review): each test below calls vi.unstubAllGlobals() at the end of
  // the body rather than in a finally/afterEach; if an expect throws, the
  // fetch stub leaks into later tests. Consider moving the unstub into
  // afterEach — confirm with the suite owners.
  describe("isHealthy (graceful degradation)", () => {
    it("should return true when the Chatterbox server is reachable", async () => {
      const mockFetch = vi.fn().mockResolvedValue({
        ok: true,
        status: 200,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(true);

      vi.unstubAllGlobals();
    });

    it("should return false when GPU is unavailable (server unreachable)", async () => {
      const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED"));
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should return false when the server returns 503 (GPU overloaded)", async () => {
      const mockFetch = vi.fn().mockResolvedValue({
        ok: false,
        status: 503,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should return false on timeout (slow GPU response)", async () => {
      const mockFetch = vi
        .fn()
        .mockRejectedValue(new Error("AbortError: The operation was aborted"));
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    it("should return the default voice in the premium tier", async () => {
      const voices = await provider.listVoices();

      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);

      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("default");
      expect(defaultVoice?.tier).toBe("premium");
    });

    it("should set tier to 'premium' on all voices", async () => {
      const voices = await provider.listVoices();

      for (const voice of voices) {
        expect(voice.tier).toBe("premium");
      }
    });
  });

  // ==========================================
  // supportedLanguages
  // ==========================================

  describe("supportedLanguages", () => {
    it("should expose a list of supported languages for cross-language transfer", () => {
      const languages = provider.supportedLanguages;

      expect(languages).toBeInstanceOf(Array);
      expect(languages.length).toBe(23);
      expect(languages).toContain("en");
      expect(languages).toContain("fr");
      expect(languages).toContain("de");
      expect(languages).toContain("es");
      expect(languages).toContain("ja");
      expect(languages).toContain("zh");
    });
  });
});
|
||||||
Reference in New Issue
Block a user