From d37c78f503889c6bb779e5e27445271f34668bf9 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:29:38 -0600 Subject: [PATCH] feat(#394): implement Chatterbox TTS provider with voice cloning Add ChatterboxSynthesizeOptions interface with referenceAudio and emotionExaggeration fields, and comprehensive unit tests (26 tests) covering voice cloning, emotion control, clamping, graceful degradation, and cross-language support. Co-Authored-By: Claude Opus 4.6 --- .../api/src/speech/interfaces/speech-types.ts | 27 ++ .../providers/chatterbox-tts.provider.spec.ts | 436 ++++++++++++++++++ 2 files changed, 463 insertions(+) create mode 100644 apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts index 3f5a0b7..c3b93c1 100644 --- a/apps/api/src/speech/interfaces/speech-types.ts +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -128,6 +128,33 @@ export interface SynthesisResult { durationSeconds?: number; } +/** + * Extended options for Chatterbox TTS synthesis. + * + * Chatterbox supports voice cloning via a reference audio buffer and + * emotion exaggeration control. These are passed as extra body parameters + * to the OpenAI-compatible API. + * + * Issue #394 + */ +export interface ChatterboxSynthesizeOptions extends SynthesizeOptions { + /** + * Reference audio buffer for voice cloning. + * When provided, Chatterbox will clone the voice from this audio sample. + * Should be a WAV or MP3 file of 5-30 seconds for best results. + */ + referenceAudio?: Buffer; + + /** + * Emotion exaggeration factor (0.0 to 1.0). + * Controls how much emotional expression is applied to the synthesized speech. + * - 0.0: Neutral, minimal emotion + * - 0.5: Moderate emotion (default when not specified) + * - 1.0: Maximum emotion exaggeration + */ + emotionExaggeration?: number; +} + /** * Information about an available TTS voice. 
*/ diff --git a/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts b/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts new file mode 100644 index 0000000..08e0f2a --- /dev/null +++ b/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts @@ -0,0 +1,436 @@ +/** + * ChatterboxTTSProvider Unit Tests + * + * Tests the premium-tier TTS provider with voice cloning and + * emotion exaggeration support for Chatterbox. + * + * Issue #394 + */ + +import { describe, it, expect, beforeEach, vi, type Mock } from "vitest"; +import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; +import type { ChatterboxSynthesizeOptions, AudioFormat } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +const mockCreate = vi.fn(); + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: mockCreate, + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Test helpers +// ========================================== + +/** + * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return. 
+ */ +function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } { + return { + arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer), + }; +} + +describe("ChatterboxTTSProvider", () => { + let provider: ChatterboxTTSProvider; + + const testBaseURL = "http://chatterbox-tts:8881/v1"; + + beforeEach(() => { + vi.clearAllMocks(); + provider = new ChatterboxTTSProvider(testBaseURL); + }); + + // ========================================== + // Provider identity + // ========================================== + + describe("provider identity", () => { + it("should have name 'chatterbox'", () => { + expect(provider.name).toBe("chatterbox"); + }); + + it("should have tier 'premium'", () => { + expect(provider.tier).toBe("premium"); + }); + }); + + // ========================================== + // Constructor + // ========================================== + + describe("constructor", () => { + it("should create an instance with the provided baseURL", () => { + expect(provider).toBeDefined(); + }); + + it("should use 'default' as the default voice", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello"); + + expect(result.voice).toBe("default"); + }); + + it("should use 'wav' as the default format", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello"); + + expect(result.format).toBe("wav"); + }); + }); + + // ========================================== + // synthesize() — basic (no Chatterbox-specific options) + // ========================================== + + describe("synthesize (basic)", () => { + it("should synthesize text and return a SynthesisResult", async () => { + const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]); + 
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello, world!"); + + expect(result).toBeDefined(); + expect(result.audio).toBeInstanceOf(Buffer); + expect(result.audio.length).toBe(audioBytes.length); + expect(result.format).toBe("wav"); + expect(result.voice).toBe("default"); + expect(result.tier).toBe("premium"); + }); + + it("should pass correct base parameters to OpenAI SDK when no extra options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("Test text"); + + expect(mockCreate).toHaveBeenCalledWith({ + model: "tts-1", + input: "Test text", + voice: "default", + response_format: "wav", + speed: 1.0, + }); + }); + + it("should use custom voice from options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { voice: "cloned_voice_1" }; + const result = await provider.synthesize("Hello", options); + + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "cloned_voice_1" })); + expect(result.voice).toBe("cloned_voice_1"); + }); + + it("should use custom format from options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { format: "mp3" as AudioFormat }; + const result = await provider.synthesize("Hello", options); + + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" })); + expect(result.format).toBe("mp3"); + }); + + it("should throw on synthesis failure", async () => { + mockCreate.mockRejectedValue(new Error("GPU out of memory")); + + await expect(provider.synthesize("Hello")).rejects.toThrow( + "TTS synthesis failed for chatterbox: GPU out of memory" + ); + }); + }); + + // 
========================================== + // synthesize() — voice cloning (referenceAudio) + // ========================================== + + describe("synthesize (voice cloning)", () => { + it("should pass referenceAudio as base64 in extra body params", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const referenceAudio = Buffer.from("fake-audio-data-for-cloning"); + const options: ChatterboxSynthesizeOptions = { + referenceAudio, + }; + + await provider.synthesize("Clone my voice", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + input: "Clone my voice", + reference_audio: referenceAudio.toString("base64"), + }) + ); + }); + + it("should not include reference_audio when referenceAudio is not provided", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("No cloning"); + + const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>; + expect(callArgs).not.toHaveProperty("reference_audio"); + }); + }); + + // ========================================== + // synthesize() — emotion exaggeration + // ========================================== + + describe("synthesize (emotion exaggeration)", () => { + it("should pass emotionExaggeration as exaggeration in extra body params", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 0.7, + }; + + await provider.synthesize("Very emotional text", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.7, + }) + ); + }); + + it("should not include exaggeration when emotionExaggeration is not provided", async () => { + const audioBytes = new Uint8Array([0x01]); + 
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("Neutral text"); + + const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>; + expect(callArgs).not.toHaveProperty("exaggeration"); + }); + + it("should accept emotionExaggeration of 0.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 0.0, + }; + + await provider.synthesize("Minimal emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.0, + }) + ); + }); + + it("should accept emotionExaggeration of 1.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 1.0, + }; + + await provider.synthesize("Maximum emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 1.0, + }) + ); + }); + + it("should clamp emotionExaggeration above 1.0 to 1.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 1.5, + }; + + await provider.synthesize("Over the top", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 1.0, + }) + ); + }); + + it("should clamp emotionExaggeration below 0.0 to 0.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: -0.5, + }; + + await provider.synthesize("Negative emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.0, + }) + ); + }); + }); + + // 
========================================== + // synthesize() — combined options + // ========================================== + + describe("synthesize (combined options)", () => { + it("should handle referenceAudio and emotionExaggeration together", async () => { + const audioBytes = new Uint8Array([0x01, 0x02, 0x03]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const referenceAudio = Buffer.from("reference-audio-sample"); + const options: ChatterboxSynthesizeOptions = { + voice: "custom_voice", + format: "mp3", + speed: 0.9, + referenceAudio, + emotionExaggeration: 0.6, + }; + + const result = await provider.synthesize("Full options test", options); + + expect(mockCreate).toHaveBeenCalledWith({ + model: "tts-1", + input: "Full options test", + voice: "custom_voice", + response_format: "mp3", + speed: 0.9, + reference_audio: referenceAudio.toString("base64"), + exaggeration: 0.6, + }); + + expect(result.audio).toBeInstanceOf(Buffer); + expect(result.voice).toBe("custom_voice"); + expect(result.format).toBe("mp3"); + expect(result.tier).toBe("premium"); + }); + }); + + // ========================================== + // isHealthy() — graceful degradation + // ========================================== + + describe("isHealthy (graceful degradation)", () => { + it("should return true when the Chatterbox server is reachable", async () => { + const mockFetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + }); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(true); + + vi.unstubAllGlobals(); + }); + + it("should return false when GPU is unavailable (server unreachable)", async () => { + const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED")); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + + it("should return false when the server returns 503 (GPU 
overloaded)", async () => { + const mockFetch = vi.fn().mockResolvedValue({ + ok: false, + status: 503, + }); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + + it("should return false on timeout (slow GPU response)", async () => { + const mockFetch = vi + .fn() + .mockRejectedValue(new Error("AbortError: The operation was aborted")); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + + describe("listVoices", () => { + it("should return the default voice in the premium tier", async () => { + const voices = await provider.listVoices(); + + expect(voices).toBeInstanceOf(Array); + expect(voices.length).toBeGreaterThan(0); + + const defaultVoice = voices.find((v) => v.isDefault === true); + expect(defaultVoice).toBeDefined(); + expect(defaultVoice?.id).toBe("default"); + expect(defaultVoice?.tier).toBe("premium"); + }); + + it("should set tier to 'premium' on all voices", async () => { + const voices = await provider.listVoices(); + + for (const voice of voices) { + expect(voice.tier).toBe("premium"); + } + }); + }); + + // ========================================== + // supportedLanguages + // ========================================== + + describe("supportedLanguages", () => { + it("should expose a list of supported languages for cross-language transfer", () => { + const languages = provider.supportedLanguages; + + expect(languages).toBeInstanceOf(Array); + expect(languages.length).toBe(23); + expect(languages).toContain("en"); + expect(languages).toContain("fr"); + expect(languages).toContain("de"); + expect(languages).toContain("es"); + expect(languages).toContain("ja"); + expect(languages).toContain("zh"); + }); + }); +});