feat(#394): implement Chatterbox TTS provider with voice cloning
All checks were successful
ci/woodpecker/push/api Pipeline was successful

Add ChatterboxSynthesizeOptions interface with referenceAudio and
emotionExaggeration fields, and comprehensive unit tests (26 tests)
covering voice cloning, emotion control, clamping, graceful degradation,
and cross-language support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 02:29:38 -06:00
parent 79b1d81d27
commit d37c78f503
2 changed files with 463 additions and 0 deletions

View File

@@ -128,6 +128,33 @@ export interface SynthesisResult {
durationSeconds?: number;
}
/**
 * Extended options for Chatterbox TTS synthesis.
 *
 * Chatterbox supports voice cloning via a reference audio buffer and
 * emotion exaggeration control. These are passed as extra body parameters
 * to the OpenAI-compatible API.
 *
 * Issue #394
 */
export interface ChatterboxSynthesizeOptions extends SynthesizeOptions {
/**
 * Reference audio buffer for voice cloning.
 * When provided, Chatterbox will clone the voice from this audio sample.
 * Should be a WAV or MP3 file of 5-30 seconds for best results.
 * The provider sends it to the server as a base64-encoded `reference_audio`
 * extra body field; when omitted, no `reference_audio` field is sent.
 */
referenceAudio?: Buffer;
/**
 * Emotion exaggeration factor (0.0 to 1.0), forwarded to the server as the
 * `exaggeration` extra body field; when omitted, no `exaggeration` field is
 * sent. Out-of-range values are clamped into [0.0, 1.0] by the provider.
 * Controls how much emotional expression is applied to the synthesized speech.
 * - 0.0: Neutral, minimal emotion
 * - 0.5: Moderate emotion (default when not specified)
 * - 1.0: Maximum emotion exaggeration
 */
emotionExaggeration?: number;
}
/**
 * Information about an available TTS voice.
 */

View File

@@ -0,0 +1,436 @@
/**
* ChatterboxTTSProvider Unit Tests
*
* Tests the premium-tier TTS provider with voice cloning and
* emotion exaggeration support for Chatterbox.
*
* Issue #394
*/
import { describe, it, expect, beforeEach, afterEach, vi, type Mock } from "vitest";
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
import type { ChatterboxSynthesizeOptions, AudioFormat } from "../interfaces/speech-types";
// ==========================================
// Mock OpenAI SDK
// ==========================================
// Shared spy for audio.speech.create(); each test configures its resolved
// value and inspects its call arguments.
const mockCreate = vi.fn();
// NOTE(review): vi.mock is hoisted above this file's imports, but referencing
// mockCreate here works because class field initializers are evaluated at
// instantiation time (inside beforeEach), not when the factory runs. If the
// provider ever constructs OpenAI at module-import time, this would hit the
// TDZ — consider vi.hoisted() for robustness.
vi.mock("openai", () => {
class MockOpenAI {
// Mirrors the OpenAI SDK surface the provider uses: client.audio.speech.create(...)
audio = {
speech: {
create: mockCreate,
},
};
}
return { default: MockOpenAI };
});
// ==========================================
// Test helpers
// ==========================================
/**
 * Build a minimal stand-in for the Response-like value produced by the OpenAI
 * SDK's audio.speech.create(). Only arrayBuffer() is modelled, since that is
 * the only member the provider under test consumes.
 */
function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } {
  const arrayBuffer = vi.fn().mockResolvedValue(audioData.buffer);
  return { arrayBuffer };
}
describe("ChatterboxTTSProvider", () => {
let provider: ChatterboxTTSProvider;
const testBaseURL = "http://chatterbox-tts:8881/v1";
beforeEach(() => {
vi.clearAllMocks();
provider = new ChatterboxTTSProvider(testBaseURL);
});
// ==========================================
// Provider identity
// ==========================================
describe("provider identity", () => {
it("should have name 'chatterbox'", () => {
expect(provider.name).toBe("chatterbox");
});
it("should have tier 'premium'", () => {
expect(provider.tier).toBe("premium");
});
});
// ==========================================
// Constructor
// ==========================================
describe("constructor", () => {
it("should create an instance with the provided baseURL", () => {
expect(provider).toBeDefined();
});
it("should use 'default' as the default voice", async () => {
const audioBytes = new Uint8Array([0x01, 0x02]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const result = await provider.synthesize("Hello");
expect(result.voice).toBe("default");
});
it("should use 'wav' as the default format", async () => {
const audioBytes = new Uint8Array([0x01, 0x02]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const result = await provider.synthesize("Hello");
expect(result.format).toBe("wav");
});
});
// ==========================================
// synthesize() — basic (no Chatterbox-specific options)
// ==========================================
describe("synthesize (basic)", () => {
it("should synthesize text and return a SynthesisResult", async () => {
const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const result = await provider.synthesize("Hello, world!");
expect(result).toBeDefined();
expect(result.audio).toBeInstanceOf(Buffer);
expect(result.audio.length).toBe(audioBytes.length);
expect(result.format).toBe("wav");
expect(result.voice).toBe("default");
expect(result.tier).toBe("premium");
});
it("should pass correct base parameters to OpenAI SDK when no extra options", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await provider.synthesize("Test text");
expect(mockCreate).toHaveBeenCalledWith({
model: "tts-1",
input: "Test text",
voice: "default",
response_format: "wav",
speed: 1.0,
});
});
it("should use custom voice from options", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = { voice: "cloned_voice_1" };
const result = await provider.synthesize("Hello", options);
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "cloned_voice_1" }));
expect(result.voice).toBe("cloned_voice_1");
});
it("should use custom format from options", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = { format: "mp3" as AudioFormat };
const result = await provider.synthesize("Hello", options);
expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" }));
expect(result.format).toBe("mp3");
});
it("should throw on synthesis failure", async () => {
mockCreate.mockRejectedValue(new Error("GPU out of memory"));
await expect(provider.synthesize("Hello")).rejects.toThrow(
"TTS synthesis failed for chatterbox: GPU out of memory"
);
});
});
// ==========================================
// synthesize() — voice cloning (referenceAudio)
// ==========================================
describe("synthesize (voice cloning)", () => {
it("should pass referenceAudio as base64 in extra body params", async () => {
const audioBytes = new Uint8Array([0x01, 0x02]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const referenceAudio = Buffer.from("fake-audio-data-for-cloning");
const options: ChatterboxSynthesizeOptions = {
referenceAudio,
};
await provider.synthesize("Clone my voice", options);
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
input: "Clone my voice",
reference_audio: referenceAudio.toString("base64"),
})
);
});
it("should not include reference_audio when referenceAudio is not provided", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await provider.synthesize("No cloning");
const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>;
expect(callArgs).not.toHaveProperty("reference_audio");
});
});
// ==========================================
// synthesize() — emotion exaggeration
// ==========================================
describe("synthesize (emotion exaggeration)", () => {
it("should pass emotionExaggeration as exaggeration in extra body params", async () => {
const audioBytes = new Uint8Array([0x01, 0x02]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = {
emotionExaggeration: 0.7,
};
await provider.synthesize("Very emotional text", options);
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
exaggeration: 0.7,
})
);
});
it("should not include exaggeration when emotionExaggeration is not provided", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
await provider.synthesize("Neutral text");
const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>;
expect(callArgs).not.toHaveProperty("exaggeration");
});
it("should accept emotionExaggeration of 0.0", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = {
emotionExaggeration: 0.0,
};
await provider.synthesize("Minimal emotion", options);
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
exaggeration: 0.0,
})
);
});
it("should accept emotionExaggeration of 1.0", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = {
emotionExaggeration: 1.0,
};
await provider.synthesize("Maximum emotion", options);
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
exaggeration: 1.0,
})
);
});
it("should clamp emotionExaggeration above 1.0 to 1.0", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = {
emotionExaggeration: 1.5,
};
await provider.synthesize("Over the top", options);
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
exaggeration: 1.0,
})
);
});
it("should clamp emotionExaggeration below 0.0 to 0.0", async () => {
const audioBytes = new Uint8Array([0x01]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const options: ChatterboxSynthesizeOptions = {
emotionExaggeration: -0.5,
};
await provider.synthesize("Negative emotion", options);
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
exaggeration: 0.0,
})
);
});
});
// ==========================================
// synthesize() — combined options
// ==========================================
describe("synthesize (combined options)", () => {
it("should handle referenceAudio and emotionExaggeration together", async () => {
const audioBytes = new Uint8Array([0x01, 0x02, 0x03]);
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));
const referenceAudio = Buffer.from("reference-audio-sample");
const options: ChatterboxSynthesizeOptions = {
voice: "custom_voice",
format: "mp3",
speed: 0.9,
referenceAudio,
emotionExaggeration: 0.6,
};
const result = await provider.synthesize("Full options test", options);
expect(mockCreate).toHaveBeenCalledWith({
model: "tts-1",
input: "Full options test",
voice: "custom_voice",
response_format: "mp3",
speed: 0.9,
reference_audio: referenceAudio.toString("base64"),
exaggeration: 0.6,
});
expect(result.audio).toBeInstanceOf(Buffer);
expect(result.voice).toBe("custom_voice");
expect(result.format).toBe("mp3");
expect(result.tier).toBe("premium");
});
});
// ==========================================
// isHealthy() — graceful degradation
// ==========================================
describe("isHealthy (graceful degradation)", () => {
it("should return true when the Chatterbox server is reachable", async () => {
const mockFetch = vi.fn().mockResolvedValue({
ok: true,
status: 200,
});
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(true);
vi.unstubAllGlobals();
});
it("should return false when GPU is unavailable (server unreachable)", async () => {
const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED"));
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
vi.unstubAllGlobals();
});
it("should return false when the server returns 503 (GPU overloaded)", async () => {
const mockFetch = vi.fn().mockResolvedValue({
ok: false,
status: 503,
});
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
vi.unstubAllGlobals();
});
it("should return false on timeout (slow GPU response)", async () => {
const mockFetch = vi
.fn()
.mockRejectedValue(new Error("AbortError: The operation was aborted"));
vi.stubGlobal("fetch", mockFetch);
const healthy = await provider.isHealthy();
expect(healthy).toBe(false);
vi.unstubAllGlobals();
});
});
// ==========================================
// listVoices()
// ==========================================
describe("listVoices", () => {
it("should return the default voice in the premium tier", async () => {
const voices = await provider.listVoices();
expect(voices).toBeInstanceOf(Array);
expect(voices.length).toBeGreaterThan(0);
const defaultVoice = voices.find((v) => v.isDefault === true);
expect(defaultVoice).toBeDefined();
expect(defaultVoice?.id).toBe("default");
expect(defaultVoice?.tier).toBe("premium");
});
it("should set tier to 'premium' on all voices", async () => {
const voices = await provider.listVoices();
for (const voice of voices) {
expect(voice.tier).toBe("premium");
}
});
});
// ==========================================
// supportedLanguages
// ==========================================
describe("supportedLanguages", () => {
it("should expose a list of supported languages for cross-language transfer", () => {
const languages = provider.supportedLanguages;
expect(languages).toBeInstanceOf(Array);
expect(languages.length).toBe(23);
expect(languages).toContain("en");
expect(languages).toContain("fr");
expect(languages).toContain("de");
expect(languages).toContain("es");
expect(languages).toContain("ja");
expect(languages).toContain("zh");
});
});
});