diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts index 3f5a0b7..c3b93c1 100644 --- a/apps/api/src/speech/interfaces/speech-types.ts +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -128,6 +128,33 @@ export interface SynthesisResult { durationSeconds?: number; } +/** + * Extended options for Chatterbox TTS synthesis. + * + * Chatterbox supports voice cloning via a reference audio buffer and + * emotion exaggeration control. These are passed as extra body parameters + * to the OpenAI-compatible API. + * + * Issue #394 + */ +export interface ChatterboxSynthesizeOptions extends SynthesizeOptions { + /** + * Reference audio buffer for voice cloning. + * When provided, Chatterbox will clone the voice from this audio sample. + * Should be a WAV or MP3 file of 5-30 seconds for best results. + */ + referenceAudio?: Buffer; + + /** + * Emotion exaggeration factor (0.0 to 1.0). + * Controls how much emotional expression is applied to the synthesized speech. + * - 0.0: Neutral, minimal emotion + * - 0.5: Moderate emotion (default when not specified) + * - 1.0: Maximum emotion exaggeration + */ + emotionExaggeration?: number; +} + /** * Information about an available TTS voice. */ diff --git a/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts b/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts new file mode 100644 index 0000000..08e0f2a --- /dev/null +++ b/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts @@ -0,0 +1,436 @@ +/** + * ChatterboxTTSProvider Unit Tests + * + * Tests the premium-tier TTS provider with voice cloning and + * emotion exaggeration support for Chatterbox. 
+ * + * Issue #394 + */ + +import { describe, it, expect, beforeEach, vi, type Mock } from "vitest"; +import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; +import type { ChatterboxSynthesizeOptions, AudioFormat } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +const mockCreate = vi.fn(); + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: mockCreate, + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Test helpers +// ========================================== + +/** + * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return. + */ +function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } { + return { + arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer), + }; +} + +describe("ChatterboxTTSProvider", () => { + let provider: ChatterboxTTSProvider; + + const testBaseURL = "http://chatterbox-tts:8881/v1"; + + beforeEach(() => { + vi.clearAllMocks(); + provider = new ChatterboxTTSProvider(testBaseURL); + }); + + // ========================================== + // Provider identity + // ========================================== + + describe("provider identity", () => { + it("should have name 'chatterbox'", () => { + expect(provider.name).toBe("chatterbox"); + }); + + it("should have tier 'premium'", () => { + expect(provider.tier).toBe("premium"); + }); + }); + + // ========================================== + // Constructor + // ========================================== + + describe("constructor", () => { + it("should create an instance with the provided baseURL", () => { + expect(provider).toBeDefined(); + }); + + it("should use 'default' as the default voice", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const 
result = await provider.synthesize("Hello"); + + expect(result.voice).toBe("default"); + }); + + it("should use 'wav' as the default format", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello"); + + expect(result.format).toBe("wav"); + }); + }); + + // ========================================== + // synthesize() — basic (no Chatterbox-specific options) + // ========================================== + + describe("synthesize (basic)", () => { + it("should synthesize text and return a SynthesisResult", async () => { + const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello, world!"); + + expect(result).toBeDefined(); + expect(result.audio).toBeInstanceOf(Buffer); + expect(result.audio.length).toBe(audioBytes.length); + expect(result.format).toBe("wav"); + expect(result.voice).toBe("default"); + expect(result.tier).toBe("premium"); + }); + + it("should pass correct base parameters to OpenAI SDK when no extra options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("Test text"); + + expect(mockCreate).toHaveBeenCalledWith({ + model: "tts-1", + input: "Test text", + voice: "default", + response_format: "wav", + speed: 1.0, + }); + }); + + it("should use custom voice from options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { voice: "cloned_voice_1" }; + const result = await provider.synthesize("Hello", options); + + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "cloned_voice_1" })); + expect(result.voice).toBe("cloned_voice_1"); + }); + + 
it("should use custom format from options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { format: "mp3" as AudioFormat }; + const result = await provider.synthesize("Hello", options); + + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" })); + expect(result.format).toBe("mp3"); + }); + + it("should throw on synthesis failure", async () => { + mockCreate.mockRejectedValue(new Error("GPU out of memory")); + + await expect(provider.synthesize("Hello")).rejects.toThrow( + "TTS synthesis failed for chatterbox: GPU out of memory" + ); + }); + }); + + // ========================================== + // synthesize() — voice cloning (referenceAudio) + // ========================================== + + describe("synthesize (voice cloning)", () => { + it("should pass referenceAudio as base64 in extra body params", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const referenceAudio = Buffer.from("fake-audio-data-for-cloning"); + const options: ChatterboxSynthesizeOptions = { + referenceAudio, + }; + + await provider.synthesize("Clone my voice", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + input: "Clone my voice", + reference_audio: referenceAudio.toString("base64"), + }) + ); + }); + + it("should not include reference_audio when referenceAudio is not provided", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("No cloning"); + + const callArgs = mockCreate.mock.calls[0][0] as Record; + expect(callArgs).not.toHaveProperty("reference_audio"); + }); + }); + + // ========================================== + // synthesize() — emotion exaggeration + // 
========================================== + + describe("synthesize (emotion exaggeration)", () => { + it("should pass emotionExaggeration as exaggeration in extra body params", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 0.7, + }; + + await provider.synthesize("Very emotional text", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.7, + }) + ); + }); + + it("should not include exaggeration when emotionExaggeration is not provided", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("Neutral text"); + + const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>; + expect(callArgs).not.toHaveProperty("exaggeration"); + }); + + it("should accept emotionExaggeration of 0.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 0.0, + }; + + await provider.synthesize("Minimal emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.0, + }) + ); + }); + + it("should accept emotionExaggeration of 1.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 1.0, + }; + + await provider.synthesize("Maximum emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 1.0, + }) + ); + }); + + it("should clamp emotionExaggeration above 1.0 to 1.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); 
+ + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 1.5, + }; + + await provider.synthesize("Over the top", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 1.0, + }) + ); + }); + + it("should clamp emotionExaggeration below 0.0 to 0.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: -0.5, + }; + + await provider.synthesize("Negative emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.0, + }) + ); + }); + }); + + // ========================================== + // synthesize() — combined options + // ========================================== + + describe("synthesize (combined options)", () => { + it("should handle referenceAudio and emotionExaggeration together", async () => { + const audioBytes = new Uint8Array([0x01, 0x02, 0x03]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const referenceAudio = Buffer.from("reference-audio-sample"); + const options: ChatterboxSynthesizeOptions = { + voice: "custom_voice", + format: "mp3", + speed: 0.9, + referenceAudio, + emotionExaggeration: 0.6, + }; + + const result = await provider.synthesize("Full options test", options); + + expect(mockCreate).toHaveBeenCalledWith({ + model: "tts-1", + input: "Full options test", + voice: "custom_voice", + response_format: "mp3", + speed: 0.9, + reference_audio: referenceAudio.toString("base64"), + exaggeration: 0.6, + }); + + expect(result.audio).toBeInstanceOf(Buffer); + expect(result.voice).toBe("custom_voice"); + expect(result.format).toBe("mp3"); + expect(result.tier).toBe("premium"); + }); + }); + + // ========================================== + // isHealthy() — graceful degradation + // ========================================== + + describe("isHealthy (graceful 
degradation)", () => { + it("should return true when the Chatterbox server is reachable", async () => { + const mockFetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + }); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(true); + + vi.unstubAllGlobals(); + }); + + it("should return false when GPU is unavailable (server unreachable)", async () => { + const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED")); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + + it("should return false when the server returns 503 (GPU overloaded)", async () => { + const mockFetch = vi.fn().mockResolvedValue({ + ok: false, + status: 503, + }); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + + it("should return false on timeout (slow GPU response)", async () => { + const mockFetch = vi + .fn() + .mockRejectedValue(new Error("AbortError: The operation was aborted")); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + + describe("listVoices", () => { + it("should return the default voice in the premium tier", async () => { + const voices = await provider.listVoices(); + + expect(voices).toBeInstanceOf(Array); + expect(voices.length).toBeGreaterThan(0); + + const defaultVoice = voices.find((v) => v.isDefault === true); + expect(defaultVoice).toBeDefined(); + expect(defaultVoice?.id).toBe("default"); + expect(defaultVoice?.tier).toBe("premium"); + }); + + it("should set tier to 'premium' on all voices", async () => { + const voices = await provider.listVoices(); + + for (const voice 
of voices) { + expect(voice.tier).toBe("premium"); + } + }); + }); + + // ========================================== + // supportedLanguages + // ========================================== + + describe("supportedLanguages", () => { + it("should expose a list of supported languages for cross-language transfer", () => { + const languages = provider.supportedLanguages; + + expect(languages).toBeInstanceOf(Array); + expect(languages.length).toBe(23); + expect(languages).toContain("en"); + expect(languages).toContain("fr"); + expect(languages).toContain("de"); + expect(languages).toContain("es"); + expect(languages).toContain("ja"); + expect(languages).toContain("zh"); + }); + }); +});