From 24065aa1999ae9468a0de9016a183be23d5a195f Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:23:22 -0600 Subject: [PATCH] docs(#406): add speech services documentation Comprehensive documentation for the speech services module: - docs/SPEECH.md: Architecture, API reference, WebSocket protocol, environment variables, provider configuration, Docker setup, GPU VRAM budget, and frontend integration examples - apps/api/src/speech/AGENTS.md: Module structure, provider pattern, how to add new providers, gotchas, and test patterns - README.md: Speech capabilities section with quick start Fixes #406 Co-Authored-By: Claude Opus 4.6 --- README.md | 50 +- apps/api/src/speech/AGENTS.md | 247 +++++++++ docs/SPEECH.md | 929 ++++++++++++++++++++++++++++++++++ 3 files changed, 1213 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/speech/AGENTS.md create mode 100644 docs/SPEECH.md diff --git a/README.md b/README.md index 65b2ab2..a93c803 100644 --- a/README.md +++ b/README.md @@ -19,19 +19,20 @@ Mosaic Stack is a modern, PDA-friendly platform designed to help users manage th ## Technology Stack -| Layer | Technology | -| -------------- | -------------------------------------------- | -| **Frontend** | Next.js 16 + React + TailwindCSS + Shadcn/ui | -| **Backend** | NestJS + Prisma ORM | -| **Database** | PostgreSQL 17 + pgvector | -| **Cache** | Valkey (Redis-compatible) | -| **Auth** | Authentik (OIDC) via BetterAuth | -| **AI** | Ollama (local or remote) | -| **Messaging** | MoltBot (stock + plugins) | -| **Real-time** | WebSockets (Socket.io) | -| **Monorepo** | pnpm workspaces + TurboRepo | -| **Testing** | Vitest + Playwright | -| **Deployment** | Docker + docker-compose | +| Layer | Technology | +| -------------- | ---------------------------------------------- | +| **Frontend** | Next.js 16 + React + TailwindCSS + Shadcn/ui | +| **Backend** | NestJS + Prisma ORM | +| **Database** | PostgreSQL 17 + pgvector | +| **Cache** | Valkey 
(Redis-compatible) | +| **Auth** | Authentik (OIDC) via BetterAuth | +| **AI** | Ollama (local or remote) | +| **Messaging** | MoltBot (stock + plugins) | +| **Real-time** | WebSockets (Socket.io) | +| **Speech** | Speaches (STT) + Kokoro/Chatterbox/Piper (TTS) | +| **Monorepo** | pnpm workspaces + TurboRepo | +| **Testing** | Vitest + Playwright | +| **Deployment** | Docker + docker-compose | ## Quick Start @@ -356,6 +357,29 @@ Mosaic Stack includes a sophisticated agent orchestration system for autonomous See [Agent Orchestration Design](docs/design/agent-orchestration.md) for architecture details. +## Speech Services + +Mosaic Stack includes integrated speech-to-text (STT) and text-to-speech (TTS) capabilities through a modular provider architecture. Each component is optional and independently configurable. + +- **Speech-to-Text** - Transcribe audio files and real-time audio streams using Whisper (via Speaches) +- **Text-to-Speech** - Synthesize speech with 54+ voices across 8 languages (via Kokoro, CPU-based) +- **Premium Voice Cloning** - Clone voices from audio samples with emotion control (via Chatterbox, GPU) +- **Fallback TTS** - Ultra-lightweight CPU fallback for low-resource environments (via Piper/OpenedAI Speech) +- **WebSocket Streaming** - Real-time streaming transcription via Socket.IO `/speech` namespace +- **Automatic Fallback** - TTS tier system with graceful degradation (premium -> default -> fallback) + +**Quick Start:** + +```bash +# Start speech services alongside core stack +make speech-up + +# Or with Docker Compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d +``` + +See [Speech Services Documentation](docs/SPEECH.md) for architecture details, API reference, provider configuration, and deployment options. 
+ ## Current Implementation Status ### ✅ Completed (v0.0.1-0.0.6) diff --git a/apps/api/src/speech/AGENTS.md b/apps/api/src/speech/AGENTS.md new file mode 100644 index 0000000..04b6d97 --- /dev/null +++ b/apps/api/src/speech/AGENTS.md @@ -0,0 +1,247 @@ +# speech — Agent Context + +> Part of the `apps/api/src` layer. Speech-to-text (STT) and text-to-speech (TTS) services. + +## Module Structure + +``` +speech/ +├── speech.module.ts # NestJS module (conditional provider registration) +├── speech.config.ts # Environment validation + typed config (registerAs) +├── speech.config.spec.ts # 51 config validation tests +├── speech.constants.ts # NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS) +├── speech.controller.ts # REST endpoints (transcribe, synthesize, voices, health) +├── speech.controller.spec.ts # Controller tests +├── speech.service.ts # High-level service with fallback orchestration +├── speech.service.spec.ts # Service tests +├── speech.gateway.ts # WebSocket gateway (/speech namespace) +├── speech.gateway.spec.ts # Gateway tests +├── dto/ +│ ├── transcribe.dto.ts # Transcription request DTO (class-validator) +│ ├── synthesize.dto.ts # Synthesis request DTO (class-validator) +│ └── index.ts # Barrel export +├── interfaces/ +│ ├── speech-types.ts # Shared types (SpeechTier, AudioFormat, options, results) +│ ├── stt-provider.interface.ts # ISTTProvider contract +│ ├── tts-provider.interface.ts # ITTSProvider contract +│ └── index.ts # Barrel export +├── pipes/ +│ ├── audio-validation.pipe.ts # Validates uploaded audio (MIME type, size) +│ ├── audio-validation.pipe.spec.ts +│ ├── text-validation.pipe.ts # Validates TTS text input (non-empty, max length) +│ ├── text-validation.pipe.spec.ts +│ └── index.ts # Barrel export +└── providers/ + ├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic) + ├── base-tts.provider.spec.ts + ├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages) + ├── kokoro-tts.provider.spec.ts + ├── 
chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control) + ├── chatterbox-tts.provider.spec.ts + ├── piper-tts.provider.ts # Fallback tier (CPU, lightweight, Raspberry Pi) + ├── piper-tts.provider.spec.ts + ├── speaches-stt.provider.ts # STT provider (Whisper via Speaches) + ├── speaches-stt.provider.spec.ts + ├── tts-provider.factory.ts # Factory: creates providers from config + └── tts-provider.factory.spec.ts +``` + +## Codebase Patterns + +### Provider Pattern (BaseTTSProvider + Factory) + +All TTS providers extend `BaseTTSProvider`: + +```typescript +export class MyNewProvider extends BaseTTSProvider { + readonly name = "my-provider"; + readonly tier: SpeechTier = "default"; // or "premium" or "fallback" + + constructor(baseURL: string) { + super(baseURL, "default-voice-id", "mp3"); + } + + // Override listVoices() for custom voice catalog + override listVoices(): Promise { ... } + + // Override synthesize() only if non-standard API behavior is needed + // (see ChatterboxTTSProvider for example with extra body params) +} +``` + +The base class handles: + +- OpenAI SDK client creation with custom `baseURL` and `apiKey: "not-needed"` +- Standard `synthesize()` via `client.audio.speech.create()` +- Default `listVoices()` returning just the default voice +- `isHealthy()` via GET to the `/v1/models` endpoint + +### Config Pattern + +Config follows the existing pattern (`auth.config.ts`, `federation.config.ts`): + +- Export `isSttEnabled()`, `isTtsEnabled()`, etc. (boolean checks from env) +- Export `validateSpeechConfig()` (called at module init, throws on missing required vars) +- Export `getSpeechConfig()` (typed config object with defaults) +- Export `speechConfig = registerAs("speech", ...)` for NestJS ConfigModule + +Boolean env parsing: `value === "true" || value === "1"`. No default-true. 
+
+### Conditional Provider Registration
+
+In `speech.module.ts`:
+
+- STT provider uses `isSttEnabled()` at module definition time to decide whether to register
+- TTS providers use a factory function injected with `ConfigService`
+- `@Optional()` decorator on `SpeechService`'s `sttProvider` handles the case where STT is disabled
+
+### Injection Tokens
+
+```typescript
+// speech.constants.ts
+export const STT_PROVIDER = Symbol("STT_PROVIDER"); // ISTTProvider
+export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS"); // Map<SpeechTier, ITTSProvider>
+```
+
+### Fallback Chain
+
+TTS fallback order: `premium` -> `default` -> `fallback`
+
+- Chain starts at the requested tier and goes downward
+- Only tiers that are both enabled AND have a registered provider are attempted
+- `ServiceUnavailableException` if all providers fail
+
+### WebSocket Gateway
+
+- Separate `/speech` namespace (not on the main gateway)
+- Authentication mirrors the main WS gateway pattern (token extraction from handshake)
+- One session per client, accumulates audio chunks in memory
+- Chunks concatenated and transcribed on `stop-transcription`
+- Session cleanup on disconnect
+
+## How to Add a New TTS Provider
+
+1. **Create the provider class** in `providers/`:
+
+```typescript
+// providers/my-tts.provider.ts
+import { BaseTTSProvider } from "./base-tts.provider";
+import type { SpeechTier } from "../interfaces/speech-types";
+
+export class MyTtsProvider extends BaseTTSProvider {
+  readonly name = "my-provider";
+  readonly tier: SpeechTier = "default"; // Choose tier
+
+  constructor(baseURL: string) {
+    super(baseURL, "default-voice", "mp3");
+  }
+
+  override listVoices(): Promise<VoiceInfo[]> {
+    // Return your voice catalog
+  }
+}
+```
+
+2. **Add env vars** to `speech.config.ts`:
+   - Add enabled check function
+   - Add URL to validation in `validateSpeechConfig()`
+   - Add config section in `getSpeechConfig()`
+
+3. 
**Register in factory** (`tts-provider.factory.ts`): + +```typescript +if (config.tts.myTier.enabled) { + const provider = new MyTtsProvider(config.tts.myTier.url); + providers.set("myTier", provider); +} +``` + +4. **Add env vars** to `.env.example` + +5. **Write tests** following existing patterns (mock OpenAI SDK, test synthesis + listVoices + isHealthy) + +## How to Add a New STT Provider + +1. **Implement `ISTTProvider`** (does not use a base class -- STT has only one implementation currently) +2. **Add config section** similar to `stt` in `speech.config.ts` +3. **Register** in `speech.module.ts` providers array with `STT_PROVIDER` token +4. **Write tests** following `speaches-stt.provider.spec.ts` pattern + +## Common Gotchas + +- **OpenAI SDK `apiKey`**: Self-hosted services do not require an API key. Use `apiKey: "not-needed"` when creating the OpenAI client. +- **`toFile()` import**: The `toFile` helper is imported from `"openai"` (not from a subpath). Used in the STT provider to convert Buffer to a File-like object for multipart upload. +- **Health check URL**: `BaseTTSProvider.isHealthy()` calls `GET /v1/models`. The base URL is expected to end with `/v1`. +- **Voice ID prefix parsing**: Kokoro voice IDs encode language + gender in first two characters. See `parseVoicePrefix()` in `kokoro-tts.provider.ts`. +- **Chatterbox extra body params**: The `reference_audio` (base64) and `exaggeration` fields are passed via the OpenAI SDK by casting the request body. This works because the SDK passes through unknown fields. +- **WebSocket auth**: The gateway checks `auth.token`, then `query.token`, then `Authorization` header (in that order). Match this in test setup. +- **Config validation timing**: `validateSpeechConfig()` runs at module init (`onModuleInit`), not at provider construction. This means a misconfigured provider will fail at startup, not at first request. + +## Test Patterns + +### Mocking OpenAI SDK + +All provider tests mock the OpenAI SDK. 
Pattern: + +```typescript +vi.mock("openai", () => ({ + default: vi.fn().mockImplementation(() => ({ + audio: { + speech: { + create: vi.fn().mockResolvedValue({ + arrayBuffer: () => Promise.resolve(new ArrayBuffer(10)), + }), + }, + transcriptions: { + create: vi.fn().mockResolvedValue({ + text: "transcribed text", + language: "en", + duration: 3.5, + }), + }, + }, + models: { list: vi.fn().mockResolvedValue({ data: [] }) }, + })), +})); +``` + +### Mocking Config Injection + +```typescript +const mockConfig: SpeechConfig = { + stt: { enabled: true, baseUrl: "http://test:8000/v1", model: "test-model", language: "en" }, + tts: { + default: { enabled: true, url: "http://test:8880/v1", voice: "af_heart", format: "mp3" }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + limits: { maxUploadSize: 25000000, maxDurationSeconds: 600, maxTextLength: 4096 }, +}; +``` + +### Config Test Pattern + +`speech.config.spec.ts` saves and restores `process.env` around each test: + +```typescript +let savedEnv: NodeJS.ProcessEnv; +beforeEach(() => { + savedEnv = { ...process.env }; +}); +afterEach(() => { + process.env = savedEnv; +}); +``` + +## Key Files + +| File | Purpose | +| ----------------------------------- | ------------------------------------------------------------------------ | +| `speech.module.ts` | Module registration with conditional providers | +| `speech.config.ts` | All speech env vars + validation (51 tests) | +| `speech.service.ts` | Core service: transcribe, synthesize (with fallback), listVoices | +| `speech.controller.ts` | REST endpoints: POST transcribe, POST synthesize, GET voices, GET health | +| `speech.gateway.ts` | WebSocket streaming transcription (/speech namespace) | +| `providers/base-tts.provider.ts` | Abstract base for all TTS providers (OpenAI SDK wrapper) | +| `providers/tts-provider.factory.ts` | Creates provider instances from config | +| `interfaces/speech-types.ts` | All shared types: SpeechTier, 
AudioFormat, options, results | diff --git a/docs/SPEECH.md b/docs/SPEECH.md new file mode 100644 index 0000000..3ea7dd4 --- /dev/null +++ b/docs/SPEECH.md @@ -0,0 +1,929 @@ +# Speech Services + +Mosaic Stack provides integrated speech-to-text (STT) and text-to-speech (TTS) services through a provider abstraction layer. Speech services are optional and modular -- each component can be independently enabled, disabled, or pointed at external infrastructure. + +## Table of Contents + +- [Architecture Overview](#architecture-overview) +- [Provider Abstraction](#provider-abstraction) +- [TTS Tier System and Fallback Chain](#tts-tier-system-and-fallback-chain) +- [API Endpoint Reference](#api-endpoint-reference) +- [WebSocket Streaming Protocol](#websocket-streaming-protocol) +- [Environment Variable Reference](#environment-variable-reference) +- [Provider Configuration](#provider-configuration) +- [Voice Cloning Setup (Chatterbox)](#voice-cloning-setup-chatterbox) +- [Docker Compose Setup](#docker-compose-setup) +- [GPU VRAM Budget](#gpu-vram-budget) +- [Frontend Integration](#frontend-integration) + +--- + +## Architecture Overview + +``` + +-------------------+ + | SpeechController | + | (REST endpoints) | + +--------+----------+ + | + +--------------+--------------+ + | SpeechService | + | (provider selection, | + | fallback orchestration) | + +---------+----------+---------+ + | | + +------------+ +-----+-------+ + | | | + +------+------+ +-----+-----+ +-----+-----+ + | STT Provider| |TTS Provider| |TTS Provider| + | (Speaches) | |Map | |Map | + +------+------+ +-----+-----+ +-----+-----+ + | | | + +------+------+ +-----+-----+ +-----+-----+ + | Speaches | | Kokoro | | Chatterbox | + | (Whisper) | | (default) | | (premium) | + +-------------+ +-----------+ +-----+------+ + | + +-----+-----+ + | Piper | + | (fallback)| + +-----------+ + + +-------------------+ + | SpeechGateway | + | (WebSocket /speech) + +--------+----------+ + | + Uses SpeechService.transcribe() 
+```
+
+The speech module (`apps/api/src/speech/`) is a self-contained NestJS module consisting of:
+
+| Component  | File                   | Purpose                                    |
+| ---------- | ---------------------- | ------------------------------------------ |
+| Module     | `speech.module.ts`     | Registers providers, controllers, gateway  |
+| Config     | `speech.config.ts`     | Environment validation and typed config    |
+| Service    | `speech.service.ts`    | High-level speech operations with fallback |
+| Controller | `speech.controller.ts` | REST API endpoints                         |
+| Gateway    | `speech.gateway.ts`    | WebSocket streaming transcription          |
+| Constants  | `speech.constants.ts`  | NestJS injection tokens                    |
+
+### Key Design Decisions
+
+1. **OpenAI-compatible APIs**: All providers (Speaches, Kokoro, Chatterbox, Piper/OpenedAI) expose OpenAI-compatible endpoints. The official OpenAI SDK is used as the HTTP client with a custom `baseURL`.
+
+2. **Provider abstraction**: STT and TTS providers implement well-defined interfaces (`ISTTProvider`, `ITTSProvider`). New providers can be added without modifying the service layer.
+
+3. **Conditional registration**: Providers are only instantiated when their corresponding `*_ENABLED` flag is `true`. The STT provider uses NestJS `@Optional()` injection.
+
+4. **Fail-fast validation**: Configuration is validated at module initialization. If a service is enabled but its URL is missing, the application fails on startup with a descriptive error.
+
+---
+
+## Provider Abstraction
+
+### STT Provider Interface
+
+```typescript
+interface ISTTProvider {
+  readonly name: string;
+  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;
+  isHealthy(): Promise<boolean>;
+}
+```
+
+Currently implemented by `SpeachesSttProvider` which connects to a Speaches (faster-whisper) server. 
+
+### TTS Provider Interface
+
+```typescript
+interface ITTSProvider {
+  readonly name: string;
+  readonly tier: SpeechTier;
+  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;
+  listVoices(): Promise<VoiceInfo[]>;
+  isHealthy(): Promise<boolean>;
+}
+```
+
+All TTS providers extend `BaseTTSProvider`, an abstract class that implements common OpenAI-compatible synthesis logic. Concrete providers only need to set `name` and `tier` and optionally override `listVoices()` or `synthesize()`.
+
+### Provider Registration
+
+Providers are created by the `TTS Provider Factory` (`providers/tts-provider.factory.ts`) based on configuration:
+
+| Tier       | Provider Class          | Engine                    | Requirements |
+| ---------- | ----------------------- | ------------------------- | ------------ |
+| `default`  | `KokoroTtsProvider`     | Kokoro-FastAPI            | CPU only     |
+| `premium`  | `ChatterboxTTSProvider` | Chatterbox TTS Server     | NVIDIA GPU   |
+| `fallback` | `PiperTtsProvider`      | Piper via OpenedAI Speech | CPU only     |
+
+---
+
+## TTS Tier System and Fallback Chain
+
+TTS uses a tiered architecture with automatic fallback:
+
+```
+Request with tier="premium"
+    |
+    v
+[premium] Chatterbox available? --yes--> Use Chatterbox
+    |                              |
+    no                        (success/fail)
+    |
+    v
+[default] Kokoro available? ------yes--> Use Kokoro
+    |                              |
+    no                        (success/fail)
+    |
+    v
+[fallback] Piper available? -----yes--> Use Piper
+    |                              |
+    no                        (success/fail)
+    |
+    v
+ServiceUnavailableException
+```
+
+**Fallback order:** `premium` -> `default` -> `fallback`
+
+The fallback chain starts from the requested tier and proceeds downward. A tier is only attempted if:
+
+1. It is enabled in configuration (`TTS_ENABLED`, `TTS_PREMIUM_ENABLED`, `TTS_FALLBACK_ENABLED`)
+2. A provider is registered for that tier
+
+If no tier is specified in the request, `default` is used as the starting point. 
+ +--- + +## API Endpoint Reference + +All speech endpoints are under `/api/speech/` and require authentication (Bearer token) plus workspace context (`x-workspace-id` header). + +### POST /api/speech/transcribe + +Transcribe an uploaded audio file to text. + +**Authentication:** Bearer token + workspace membership +**Content-Type:** `multipart/form-data` + +**Form Fields:** + +| Field | Type | Required | Description | +| ------------- | ------ | -------- | ------------------------------------------------------ | +| `file` | File | Yes | Audio file (max 25 MB) | +| `language` | string | No | Language code (e.g., "en", "fr"). Default: from config | +| `model` | string | No | Whisper model override. Default: from config | +| `prompt` | string | No | Prompt to guide transcription (max 1000 chars) | +| `temperature` | number | No | Temperature 0.0-1.0. Lower = more deterministic | + +**Accepted Audio Formats:** +`audio/wav`, `audio/mp3`, `audio/mpeg`, `audio/webm`, `audio/ogg`, `audio/flac`, `audio/x-m4a` + +**Response:** + +```json +{ + "data": { + "text": "Hello, this is a transcription test.", + "language": "en", + "durationSeconds": 3.5, + "confidence": 0.95, + "segments": [ + { + "text": "Hello, this is a transcription test.", + "start": 0.0, + "end": 3.5, + "confidence": 0.95 + } + ] + } +} +``` + +**Example:** + +```bash +curl -X POST http://localhost:3001/api/speech/transcribe \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" \ + -F "file=@recording.wav" \ + -F "language=en" +``` + +### POST /api/speech/synthesize + +Synthesize text to audio using TTS providers. + +**Authentication:** Bearer token + workspace membership +**Content-Type:** `application/json` + +**Request Body:** + +| Field | Type | Required | Description | +| -------- | ------ | -------- | ----------------------------------------------------------- | +| `text` | string | Yes | Text to synthesize (max 4096 chars) | +| `voice` | string | No | Voice ID. 
Default: from config (e.g., "af_heart") | +| `speed` | number | No | Speed multiplier 0.5-2.0. Default: 1.0 | +| `format` | string | No | Output format: mp3, wav, opus, flac, aac, pcm. Default: mp3 | +| `tier` | string | No | Provider tier: default, premium, fallback. Default: default | + +**Response:** Binary audio data with appropriate `Content-Type` header. + +| Format | Content-Type | +| ------ | ------------ | +| mp3 | `audio/mpeg` | +| wav | `audio/wav` | +| opus | `audio/opus` | +| flac | `audio/flac` | +| aac | `audio/aac` | +| pcm | `audio/pcm` | + +**Example:** + +```bash +curl -X POST http://localhost:3001/api/speech/synthesize \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" \ + -H "Content-Type: application/json" \ + -d '{"text": "Hello world", "voice": "af_heart", "format": "mp3"}' \ + --output speech.mp3 +``` + +### GET /api/speech/voices + +List available TTS voices across all tiers. + +**Authentication:** Bearer token + workspace access +**Query Parameters:** + +| Parameter | Type | Required | Description | +| --------- | ------ | -------- | ------------------------------------------ | +| `tier` | string | No | Filter by tier: default, premium, fallback | + +**Response:** + +```json +{ + "data": [ + { + "id": "af_heart", + "name": "Heart (American Female)", + "language": "en-US", + "tier": "default", + "isDefault": true + }, + { + "id": "am_adam", + "name": "Adam (American Male)", + "language": "en-US", + "tier": "default", + "isDefault": false + } + ] +} +``` + +**Example:** + +```bash +curl -X GET 'http://localhost:3001/api/speech/voices?tier=default' \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" +``` + +### GET /api/speech/health + +Check availability of STT and TTS providers. 
+ +**Authentication:** Bearer token + workspace access + +**Response:** + +```json +{ + "data": { + "stt": { "available": true }, + "tts": { "available": true } + } +} +``` + +--- + +## WebSocket Streaming Protocol + +The speech module provides a WebSocket gateway at namespace `/speech` for real-time streaming transcription. Audio chunks are accumulated on the server and transcribed when the session is stopped. + +### Connection + +Connect to the `/speech` namespace with authentication: + +```typescript +import { io } from "socket.io-client"; + +const socket = io("http://localhost:3001/speech", { + auth: { token: "YOUR_SESSION_TOKEN" }, +}); +``` + +**Authentication methods** (checked in order): + +1. `auth.token` in handshake +2. `query.token` in handshake URL +3. `Authorization: Bearer ` header + +Connection is rejected if: + +- No valid token is provided +- Session verification fails +- User has no workspace membership + +**Connection timeout:** 5 seconds for authentication. + +### Protocol Flow + +``` +Client Server + | | + |--- connect (with token) ----->| + | | (authenticate, check workspace) + |<--- connected ----------------| + | | + |--- start-transcription ------>| { language?: "en" } + |<--- transcription-started ----| { sessionId, language } + | | + |--- audio-chunk -------------->| (Buffer/Uint8Array) + |--- audio-chunk -------------->| (Buffer/Uint8Array) + |--- audio-chunk -------------->| (Buffer/Uint8Array) + | | + |--- stop-transcription ------->| + | | (concatenate chunks, transcribe) + |<--- transcription-final ------| { text, language, durationSeconds, ... 
} + | | +``` + +### Client Events (emit) + +| Event | Payload | Description | +| --------------------- | ------------------------ | ---------------------------------------- | +| `start-transcription` | `{ language?: string }` | Begin a new transcription session | +| `audio-chunk` | `Buffer` or `Uint8Array` | Send audio data chunk | +| `stop-transcription` | (none) | Stop recording and trigger transcription | + +### Server Events (listen) + +| Event | Payload | Description | +| ----------------------- | ----------------------------------------------------------- | -------------------------- | +| `transcription-started` | `{ sessionId, language }` | Session created | +| `transcription-final` | `{ text, language, durationSeconds, confidence, segments }` | Transcription result | +| `transcription-error` | `{ message }` | Error during transcription | + +### Session Management + +- One active transcription session per client connection +- Starting a new session replaces any existing session +- Sessions are cleaned up on client disconnect +- Audio chunks are accumulated in memory +- Total accumulated size is capped by `SPEECH_MAX_UPLOAD_SIZE` (default: 25 MB) + +### Example Client Usage + +```typescript +import { io } from "socket.io-client"; + +const socket = io("http://localhost:3001/speech", { + auth: { token: sessionToken }, +}); + +// Start recording +socket.emit("start-transcription", { language: "en" }); + +socket.on("transcription-started", ({ sessionId }) => { + console.log("Session started:", sessionId); +}); + +// Stream audio chunks from MediaRecorder +mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + event.data.arrayBuffer().then((buffer) => { + socket.emit("audio-chunk", new Uint8Array(buffer)); + }); + } +}; + +// Stop and get result +socket.emit("stop-transcription"); + +socket.on("transcription-final", (result) => { + console.log("Transcription:", result.text); + console.log("Duration:", result.durationSeconds, "seconds"); +}); + 
+socket.on("transcription-error", ({ message }) => { + console.error("Transcription error:", message); +}); +``` + +--- + +## Environment Variable Reference + +### Speech-to-Text (STT) + +| Variable | Default | Description | +| -------------- | --------------------------------------- | ---------------------------------------------------- | +| `STT_ENABLED` | `false` | Enable speech-to-text transcription | +| `STT_BASE_URL` | `http://speaches:8000/v1` | Speaches server URL (required when STT_ENABLED=true) | +| `STT_MODEL` | `Systran/faster-whisper-large-v3-turbo` | Whisper model for transcription | +| `STT_LANGUAGE` | `en` | Default language code | + +### Text-to-Speech (TTS) - Default Engine (Kokoro) + +| Variable | Default | Description | +| -------------------- | --------------------------- | --------------------------------------------------- | +| `TTS_ENABLED` | `false` | Enable default TTS engine | +| `TTS_DEFAULT_URL` | `http://kokoro-tts:8880/v1` | Kokoro-FastAPI URL (required when TTS_ENABLED=true) | +| `TTS_DEFAULT_VOICE` | `af_heart` | Default Kokoro voice ID | +| `TTS_DEFAULT_FORMAT` | `mp3` | Default audio output format | + +### Text-to-Speech (TTS) - Premium Engine (Chatterbox) + +| Variable | Default | Description | +| --------------------- | ------------------------------- | ----------------------------------------------------------- | +| `TTS_PREMIUM_ENABLED` | `false` | Enable premium TTS engine | +| `TTS_PREMIUM_URL` | `http://chatterbox-tts:8881/v1` | Chatterbox TTS URL (required when TTS_PREMIUM_ENABLED=true) | + +### Text-to-Speech (TTS) - Fallback Engine (Piper/OpenedAI) + +| Variable | Default | Description | +| ---------------------- | -------------------------------- | ------------------------------------------------------------- | +| `TTS_FALLBACK_ENABLED` | `false` | Enable fallback TTS engine | +| `TTS_FALLBACK_URL` | `http://openedai-speech:8000/v1` | OpenedAI Speech URL (required when TTS_FALLBACK_ENABLED=true) | + +### Service Limits 
+ +| Variable | Default | Description | +| ----------------------------- | ---------- | ---------------------------------------------- | +| `SPEECH_MAX_UPLOAD_SIZE` | `25000000` | Maximum upload file size in bytes (25 MB) | +| `SPEECH_MAX_DURATION_SECONDS` | `600` | Maximum audio duration in seconds (10 minutes) | +| `SPEECH_MAX_TEXT_LENGTH` | `4096` | Maximum text length for TTS in characters | + +### Conditional Validation + +When a service is enabled, its URL variable is required. If missing, the application fails at startup with a message like: + +``` +STT is enabled (STT_ENABLED=true) but required environment variables are missing or empty: STT_BASE_URL. +Either set these variables or disable by setting STT_ENABLED=false. +``` + +Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values default to `false`. + +--- + +## Provider Configuration + +### Kokoro (Default Tier) + +**Engine:** [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI) +**License:** Apache 2.0 +**Requirements:** CPU only +**Docker Image:** `ghcr.io/remsky/kokoro-fastapi:latest-cpu` + +**Capabilities:** + +- 54 built-in voices across 8 languages +- Speed control: 0.25x to 4.0x +- Output formats: mp3, wav, opus, flac +- Voice metadata derived from ID prefix (language, gender, accent) + +**Voice ID Format:** `{lang}{gender}_{name}` + +- First character: language/accent (a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese) +- Second character: gender (f=Female, m=Male) + +**Example voices:** +| Voice ID | Name | Language | Gender | +|----------|------|----------|--------| +| `af_heart` | Heart | en-US | Female | +| `am_adam` | Adam | en-US | Male | +| `bf_alice` | Alice | en-GB | Female | +| `bm_daniel` | Daniel | en-GB | Male | +| `ef_dora` | Dora | es | Female | +| `ff_camille` | Camille | fr | Female | +| `jf_alpha` | Alpha | ja | Female | +| `zf_xiaobei` | Xiaobei | zh | Female | + +### Chatterbox (Premium Tier) + +**Engine:** 
[Chatterbox TTS Server](https://github.com/devnen/chatterbox-tts-server) +**License:** Proprietary +**Requirements:** NVIDIA GPU with CUDA +**Docker Image:** `devnen/chatterbox-tts-server:latest` + +**Capabilities:** + +- Voice cloning via reference audio sample +- Emotion exaggeration control (0.0 - 1.0) +- Cross-language voice transfer (23 languages) +- Higher quality synthesis than default tier + +**Supported Languages:** +en, fr, de, es, it, pt, nl, pl, ru, uk, ja, zh, ko, ar, hi, tr, sv, da, fi, no, cs, el, ro + +**Extended Options (Chatterbox-specific):** + +| Option | Type | Description | +| --------------------- | ------ | --------------------------------------------------------- | +| `referenceAudio` | Buffer | Audio sample for voice cloning (5-30 seconds recommended) | +| `emotionExaggeration` | number | Emotion intensity 0.0-1.0 (clamped) | + +These are passed as extra body parameters to the OpenAI-compatible endpoint. Reference audio is base64-encoded before sending. + +### Piper (Fallback Tier) + +**Engine:** [Piper](https://github.com/rhasspy/piper) via [OpenedAI Speech](https://github.com/matatonic/openedai-speech) +**License:** GPL (OpenedAI Speech) +**Requirements:** CPU only (runs on Raspberry Pi) +**Docker Image:** Use OpenedAI Speech image + +**Capabilities:** + +- 100+ voices across 40+ languages +- 6 standard OpenAI voice names (mapped to Piper voices) +- Output formats: mp3, wav, opus, flac +- Ultra-lightweight, designed for low-resource environments + +**Standard Voice Mapping:** + +| OpenAI Voice | Piper Voice | Gender | Description | +| ------------ | -------------------- | ------ | --------------------- | +| `alloy` | en_US-amy-medium | Female | Warm, balanced | +| `echo` | en_US-ryan-medium | Male | Clear, articulate | +| `fable` | en_GB-alan-medium | Male | British narrator | +| `onyx` | en_US-danny-low | Male | Deep, resonant | +| `nova` | en_US-lessac-medium | Female | Expressive, versatile | +| `shimmer` | en_US-kristin-medium | 
Female | Bright, energetic | + +### Speaches (STT) + +**Engine:** [Speaches](https://github.com/speaches-ai/speaches) (faster-whisper backend) +**License:** MIT +**Requirements:** CPU (GPU optional for faster inference) +**Docker Image:** `ghcr.io/speaches-ai/speaches:latest` + +**Capabilities:** + +- OpenAI-compatible `/v1/audio/transcriptions` endpoint +- Whisper models via faster-whisper +- Verbose JSON response with segments and timestamps +- Language detection + +**Default model:** `Systran/faster-whisper-large-v3-turbo` + +--- + +## Voice Cloning Setup (Chatterbox) + +Voice cloning is available through the Chatterbox premium TTS provider. + +### Prerequisites + +1. NVIDIA GPU with CUDA support +2. `nvidia-container-toolkit` installed on the Docker host +3. Docker runtime configured for GPU access +4. TTS premium tier enabled (`TTS_PREMIUM_ENABLED=true`) + +### Basic Voice Cloning + +Provide a reference audio sample (WAV or MP3, 5-30 seconds) when calling synthesize: + +```typescript +import { SpeechService } from "./speech.service"; +import type { ChatterboxSynthesizeOptions } from "./interfaces/speech-types"; + +const options: ChatterboxSynthesizeOptions = { + tier: "premium", + referenceAudio: myAudioBuffer, // 5-30 second audio sample + emotionExaggeration: 0.5, // 0.0 = neutral, 1.0 = maximum emotion +}; + +const result = await speechService.synthesize("Hello, this is my cloned voice!", options); +``` + +### Voice Cloning Tips + +- **Audio quality:** Use clean recordings without background noise +- **Duration:** 5-30 seconds works best; shorter clips may produce lower quality +- **Format:** WAV provides the best quality; MP3 is also accepted +- **Emotion:** Start with 0.5 (moderate) and adjust from there +- **Cross-language:** You can clone a voice in one language and synthesize in another + +--- + +## Docker Compose Setup + +### Development (Local) + +Speech services are defined in a separate overlay file `docker-compose.speech.yml`. 
This keeps them optional and separate from core services. + +**Start basic speech services (STT + default TTS):** + +```bash +# Using docker compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d + +# Using Makefile +make speech-up +``` + +**Start with premium TTS (requires NVIDIA GPU):** + +```bash +docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d +``` + +**Stop speech services:** + +```bash +# Using docker compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml down --remove-orphans + +# Using Makefile +make speech-down +``` + +**View logs:** + +```bash +make speech-logs +``` + +### Development Services + +| Service | Container | Port | Image | +| -------------- | --------------------- | ------------------------------- | ------------------------------------------ | +| Speaches (STT) | mosaic-speaches | 8090 (host) -> 8000 (container) | `ghcr.io/speaches-ai/speaches:latest` | +| Kokoro TTS | mosaic-kokoro-tts | 8880 (host) -> 8880 (container) | `ghcr.io/remsky/kokoro-fastapi:latest-cpu` | +| Chatterbox TTS | mosaic-chatterbox-tts | 8881 (host) -> 8000 (container) | `devnen/chatterbox-tts-server:latest` | + +### Production (Docker Swarm) + +For production deployments, use `docker/docker-compose.sample.speech.yml`. This file is designed for Docker Swarm with Traefik integration. 
+ +**Required environment variables:** + +```bash +STT_DOMAIN=stt.example.com +TTS_DOMAIN=tts.example.com +``` + +**Optional environment variables:** + +```bash +WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo +CHATTERBOX_TTS_DOMAIN=tts-premium.example.com +TRAEFIK_ENTRYPOINT=websecure +TRAEFIK_CERTRESOLVER=letsencrypt +TRAEFIK_DOCKER_NETWORK=traefik-public +TRAEFIK_TLS_ENABLED=true +``` + +**Deploy:** + +```bash +docker stack deploy -c docker/docker-compose.sample.speech.yml speech +``` + +**Connecting to Mosaic Stack:** Set the speech URLs in your Mosaic Stack `.env`: + +```bash +# Same Docker network +STT_BASE_URL=http://speaches:8000/v1 +TTS_DEFAULT_URL=http://kokoro-tts:8880/v1 + +# External / different network +STT_BASE_URL=https://stt.example.com/v1 +TTS_DEFAULT_URL=https://tts.example.com/v1 +``` + +### Health Checks + +All speech containers include health checks: + +| Service | Endpoint | Interval | Start Period | +| -------------- | ------------------------------ | -------- | ------------ | +| Speaches | `http://localhost:8000/health` | 30s | 120s | +| Kokoro TTS | `http://localhost:8880/health` | 30s | 120s | +| Chatterbox TTS | `http://localhost:8000/health` | 30s | 180s | + +Chatterbox has a longer start period (180s) because GPU model loading takes additional time. + +--- + +## GPU VRAM Budget + +Only Chatterbox requires GPU resources. The other providers (Speaches, Kokoro, Piper) are CPU-only. 
+ +### Chatterbox VRAM Requirements + +| Component | Approximate VRAM | +| ----------------------- | ------------------ | +| Chatterbox TTS model | ~2-4 GB | +| Voice cloning inference | ~1-2 GB additional | +| **Total recommended** | **4-6 GB** | + +### Shared GPU Considerations + +If running multiple GPU services (e.g., Ollama for LLM + Chatterbox for TTS): + +| Service | VRAM Usage | Notes | +| -------------------- | ----------- | --------------------------------- | +| Ollama (7B model) | ~4-6 GB | Depends on model size | +| Ollama (13B model) | ~8-10 GB | Larger models need more | +| Chatterbox TTS | ~4-6 GB | Voice cloning is memory-intensive | +| **Combined minimum** | **8-12 GB** | For 7B LLM + Chatterbox | + +**Recommendations:** + +- 8 GB VRAM: Adequate for small LLM + Chatterbox (may need to alternate) +- 12 GB VRAM: Comfortable for 7B LLM + Chatterbox simultaneously +- 24 GB VRAM: Supports larger LLMs + Chatterbox with headroom + +If VRAM is limited, consider: + +1. Disabling Chatterbox (`TTS_PREMIUM_ENABLED=false`) and using Kokoro (CPU) as default +2. Using the fallback chain so Kokoro handles requests when Chatterbox is busy +3. Running Chatterbox on a separate GPU host + +### Docker Swarm GPU Scheduling + +For Docker Swarm deployments with GPU, configure generic resources on the node: + +```json +// /etc/docker/daemon.json +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime" + } + }, + "node-generic-resources": ["NVIDIA-GPU=0"] +} +``` + +See the [Docker GPU Swarm documentation](https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm) for details. + +--- + +## Frontend Integration + +Speech services are consumed from the frontend through the REST API and WebSocket gateway. 
+ +### REST API Usage + +**Transcribe audio:** + +```typescript +async function transcribeAudio(file: File, token: string, workspaceId: string) { + const formData = new FormData(); + formData.append("file", file); + formData.append("language", "en"); + + const response = await fetch("/api/speech/transcribe", { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + body: formData, + }); + + const { data } = await response.json(); + return data.text; +} +``` + +**Synthesize speech:** + +```typescript +async function synthesizeSpeech( + text: string, + token: string, + workspaceId: string, + voice = "af_heart" +) { + const response = await fetch("/api/speech/synthesize", { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + "Content-Type": "application/json", + }, + body: JSON.stringify({ text, voice, format: "mp3" }), + }); + + const audioBlob = await response.blob(); + const audioUrl = URL.createObjectURL(audioBlob); + const audio = new Audio(audioUrl); + audio.play(); +} +``` + +**List voices:** + +```typescript +async function listVoices(token: string, workspaceId: string, tier?: string) { + const url = tier ? 
`/api/speech/voices?tier=${tier}` : "/api/speech/voices"; + + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + }); + + const { data } = await response.json(); + return data; // VoiceInfo[] +} +``` + +### WebSocket Streaming Usage + +For real-time transcription using the browser's MediaRecorder API: + +```typescript +import { io } from "socket.io-client"; + +function createSpeechSocket(token: string) { + const socket = io("/speech", { + auth: { token }, + }); + + let mediaRecorder: MediaRecorder | null = null; + + async function startRecording() { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaRecorder = new MediaRecorder(stream, { + mimeType: "audio/webm;codecs=opus", + }); + + socket.emit("start-transcription", { language: "en" }); + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + event.data.arrayBuffer().then((buffer) => { + socket.emit("audio-chunk", new Uint8Array(buffer)); + }); + } + }; + + mediaRecorder.start(250); // Send chunks every 250ms + } + + async function stopRecording(): Promise { + return new Promise((resolve, reject) => { + socket.once("transcription-final", (result) => { + resolve(result.text); + }); + + socket.once("transcription-error", ({ message }) => { + reject(new Error(message)); + }); + + if (mediaRecorder) { + mediaRecorder.stop(); + mediaRecorder.stream.getTracks().forEach((track) => track.stop()); + mediaRecorder = null; + } + + socket.emit("stop-transcription"); + }); + } + + return { socket, startRecording, stopRecording }; +} +``` + +### Check Speech Availability + +Before showing speech UI elements, check provider availability: + +```typescript +async function checkSpeechHealth(token: string, workspaceId: string) { + const response = await fetch("/api/speech/health", { + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + }); + + const { data } = await 
response.json(); + return { + canTranscribe: data.stt.available, + canSynthesize: data.tts.available, + }; +} +```