From 24065aa1999ae9468a0de9016a183be23d5a195f Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:23:22 -0600 Subject: [PATCH] docs(#406): add speech services documentation Comprehensive documentation for the speech services module: - docs/SPEECH.md: Architecture, API reference, WebSocket protocol, environment variables, provider configuration, Docker setup, GPU VRAM budget, and frontend integration examples - apps/api/src/speech/AGENTS.md: Module structure, provider pattern, how to add new providers, gotchas, and test patterns - README.md: Speech capabilities section with quick start Fixes #406 Co-Authored-By: Claude Opus 4.6 --- README.md | 50 +- apps/api/src/speech/AGENTS.md | 247 +++++++++ docs/SPEECH.md | 929 ++++++++++++++++++++++++++++++++++ 3 files changed, 1213 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/speech/AGENTS.md create mode 100644 docs/SPEECH.md diff --git a/README.md b/README.md index 65b2ab2..a93c803 100644 --- a/README.md +++ b/README.md @@ -19,19 +19,20 @@ Mosaic Stack is a modern, PDA-friendly platform designed to help users manage th ## Technology Stack -| Layer | Technology | -| -------------- | -------------------------------------------- | -| **Frontend** | Next.js 16 + React + TailwindCSS + Shadcn/ui | -| **Backend** | NestJS + Prisma ORM | -| **Database** | PostgreSQL 17 + pgvector | -| **Cache** | Valkey (Redis-compatible) | -| **Auth** | Authentik (OIDC) via BetterAuth | -| **AI** | Ollama (local or remote) | -| **Messaging** | MoltBot (stock + plugins) | -| **Real-time** | WebSockets (Socket.io) | -| **Monorepo** | pnpm workspaces + TurboRepo | -| **Testing** | Vitest + Playwright | -| **Deployment** | Docker + docker-compose | +| Layer | Technology | +| -------------- | ---------------------------------------------- | +| **Frontend** | Next.js 16 + React + TailwindCSS + Shadcn/ui | +| **Backend** | NestJS + Prisma ORM | +| **Database** | PostgreSQL 17 + pgvector | +| **Cache** | Valkey 
(Redis-compatible) | +| **Auth** | Authentik (OIDC) via BetterAuth | +| **AI** | Ollama (local or remote) | +| **Messaging** | MoltBot (stock + plugins) | +| **Real-time** | WebSockets (Socket.io) | +| **Speech** | Speaches (STT) + Kokoro/Chatterbox/Piper (TTS) | +| **Monorepo** | pnpm workspaces + TurboRepo | +| **Testing** | Vitest + Playwright | +| **Deployment** | Docker + docker-compose | ## Quick Start @@ -356,6 +357,29 @@ Mosaic Stack includes a sophisticated agent orchestration system for autonomous See [Agent Orchestration Design](docs/design/agent-orchestration.md) for architecture details. +## Speech Services + +Mosaic Stack includes integrated speech-to-text (STT) and text-to-speech (TTS) capabilities through a modular provider architecture. Each component is optional and independently configurable. + +- **Speech-to-Text** - Transcribe audio files and real-time audio streams using Whisper (via Speaches) +- **Text-to-Speech** - Synthesize speech with 54+ voices across 8 languages (via Kokoro, CPU-based) +- **Premium Voice Cloning** - Clone voices from audio samples with emotion control (via Chatterbox, GPU) +- **Fallback TTS** - Ultra-lightweight CPU fallback for low-resource environments (via Piper/OpenedAI Speech) +- **WebSocket Streaming** - Real-time streaming transcription via Socket.IO `/speech` namespace +- **Automatic Fallback** - TTS tier system with graceful degradation (premium -> default -> fallback) + +**Quick Start:** + +```bash +# Start speech services alongside core stack +make speech-up + +# Or with Docker Compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d +``` + +See [Speech Services Documentation](docs/SPEECH.md) for architecture details, API reference, provider configuration, and deployment options. 
+ ## Current Implementation Status ### ✅ Completed (v0.0.1-0.0.6) diff --git a/apps/api/src/speech/AGENTS.md b/apps/api/src/speech/AGENTS.md new file mode 100644 index 0000000..04b6d97 --- /dev/null +++ b/apps/api/src/speech/AGENTS.md @@ -0,0 +1,247 @@ +# speech — Agent Context + +> Part of the `apps/api/src` layer. Speech-to-text (STT) and text-to-speech (TTS) services. + +## Module Structure + +``` +speech/ +├── speech.module.ts # NestJS module (conditional provider registration) +├── speech.config.ts # Environment validation + typed config (registerAs) +├── speech.config.spec.ts # 51 config validation tests +├── speech.constants.ts # NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS) +├── speech.controller.ts # REST endpoints (transcribe, synthesize, voices, health) +├── speech.controller.spec.ts # Controller tests +├── speech.service.ts # High-level service with fallback orchestration +├── speech.service.spec.ts # Service tests +├── speech.gateway.ts # WebSocket gateway (/speech namespace) +├── speech.gateway.spec.ts # Gateway tests +├── dto/ +│ ├── transcribe.dto.ts # Transcription request DTO (class-validator) +│ ├── synthesize.dto.ts # Synthesis request DTO (class-validator) +│ └── index.ts # Barrel export +├── interfaces/ +│ ├── speech-types.ts # Shared types (SpeechTier, AudioFormat, options, results) +│ ├── stt-provider.interface.ts # ISTTProvider contract +│ ├── tts-provider.interface.ts # ITTSProvider contract +│ └── index.ts # Barrel export +├── pipes/ +│ ├── audio-validation.pipe.ts # Validates uploaded audio (MIME type, size) +│ ├── audio-validation.pipe.spec.ts +│ ├── text-validation.pipe.ts # Validates TTS text input (non-empty, max length) +│ ├── text-validation.pipe.spec.ts +│ └── index.ts # Barrel export +└── providers/ + ├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic) + ├── base-tts.provider.spec.ts + ├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages) + ├── kokoro-tts.provider.spec.ts + ├── 
chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control) + ├── chatterbox-tts.provider.spec.ts + ├── piper-tts.provider.ts # Fallback tier (CPU, lightweight, Raspberry Pi) + ├── piper-tts.provider.spec.ts + ├── speaches-stt.provider.ts # STT provider (Whisper via Speaches) + ├── speaches-stt.provider.spec.ts + ├── tts-provider.factory.ts # Factory: creates providers from config + └── tts-provider.factory.spec.ts +``` + +## Codebase Patterns + +### Provider Pattern (BaseTTSProvider + Factory) + +All TTS providers extend `BaseTTSProvider`: + +```typescript +export class MyNewProvider extends BaseTTSProvider { + readonly name = "my-provider"; + readonly tier: SpeechTier = "default"; // or "premium" or "fallback" + + constructor(baseURL: string) { + super(baseURL, "default-voice-id", "mp3"); + } + + // Override listVoices() for custom voice catalog + override listVoices(): Promise { ... } + + // Override synthesize() only if non-standard API behavior is needed + // (see ChatterboxTTSProvider for example with extra body params) +} +``` + +The base class handles: + +- OpenAI SDK client creation with custom `baseURL` and `apiKey: "not-needed"` +- Standard `synthesize()` via `client.audio.speech.create()` +- Default `listVoices()` returning just the default voice +- `isHealthy()` via GET to the `/v1/models` endpoint + +### Config Pattern + +Config follows the existing pattern (`auth.config.ts`, `federation.config.ts`): + +- Export `isSttEnabled()`, `isTtsEnabled()`, etc. (boolean checks from env) +- Export `validateSpeechConfig()` (called at module init, throws on missing required vars) +- Export `getSpeechConfig()` (typed config object with defaults) +- Export `speechConfig = registerAs("speech", ...)` for NestJS ConfigModule + +Boolean env parsing: `value === "true" || value === "1"`. No default-true. 
+
+### Conditional Provider Registration
+
+In `speech.module.ts`:
+
+- STT provider uses `isSttEnabled()` at module definition time to decide whether to register
+- TTS providers use a factory function injected with `ConfigService`
+- `@Optional()` decorator on `SpeechService`'s `sttProvider` handles the case where STT is disabled
+
+### Injection Tokens
+
+```typescript
+// speech.constants.ts
+export const STT_PROVIDER = Symbol("STT_PROVIDER"); // ISTTProvider
+export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS"); // Map<SpeechTier, ITTSProvider>
+```
+
+### Fallback Chain
+
+TTS fallback order: `premium` -> `default` -> `fallback`
+
+- Chain starts at the requested tier and goes downward
+- Only tiers that are both enabled AND have a registered provider are attempted
+- `ServiceUnavailableException` if all providers fail
+
+### WebSocket Gateway
+
+- Separate `/speech` namespace (not on the main gateway)
+- Authentication mirrors the main WS gateway pattern (token extraction from handshake)
+- One session per client, accumulates audio chunks in memory
+- Chunks concatenated and transcribed on `stop-transcription`
+- Session cleanup on disconnect
+
+## How to Add a New TTS Provider
+
+1. **Create the provider class** in `providers/`:
+
+```typescript
+// providers/my-tts.provider.ts
+import { BaseTTSProvider } from "./base-tts.provider";
+import type { SpeechTier } from "../interfaces/speech-types";
+
+export class MyTtsProvider extends BaseTTSProvider {
+  readonly name = "my-provider";
+  readonly tier: SpeechTier = "default"; // Choose tier
+
+  constructor(baseURL: string) {
+    super(baseURL, "default-voice", "mp3");
+  }
+
+  override listVoices(): Promise<VoiceInfo[]> {
+    // Return your voice catalog
+  }
+}
+```
+
+2. **Add env vars** to `speech.config.ts`:
+   - Add enabled check function
+   - Add URL to validation in `validateSpeechConfig()`
+   - Add config section in `getSpeechConfig()`
+
+3. 
**Register in factory** (`tts-provider.factory.ts`): + +```typescript +if (config.tts.myTier.enabled) { + const provider = new MyTtsProvider(config.tts.myTier.url); + providers.set("myTier", provider); +} +``` + +4. **Add env vars** to `.env.example` + +5. **Write tests** following existing patterns (mock OpenAI SDK, test synthesis + listVoices + isHealthy) + +## How to Add a New STT Provider + +1. **Implement `ISTTProvider`** (does not use a base class -- STT has only one implementation currently) +2. **Add config section** similar to `stt` in `speech.config.ts` +3. **Register** in `speech.module.ts` providers array with `STT_PROVIDER` token +4. **Write tests** following `speaches-stt.provider.spec.ts` pattern + +## Common Gotchas + +- **OpenAI SDK `apiKey`**: Self-hosted services do not require an API key. Use `apiKey: "not-needed"` when creating the OpenAI client. +- **`toFile()` import**: The `toFile` helper is imported from `"openai"` (not from a subpath). Used in the STT provider to convert Buffer to a File-like object for multipart upload. +- **Health check URL**: `BaseTTSProvider.isHealthy()` calls `GET /v1/models`. The base URL is expected to end with `/v1`. +- **Voice ID prefix parsing**: Kokoro voice IDs encode language + gender in first two characters. See `parseVoicePrefix()` in `kokoro-tts.provider.ts`. +- **Chatterbox extra body params**: The `reference_audio` (base64) and `exaggeration` fields are passed via the OpenAI SDK by casting the request body. This works because the SDK passes through unknown fields. +- **WebSocket auth**: The gateway checks `auth.token`, then `query.token`, then `Authorization` header (in that order). Match this in test setup. +- **Config validation timing**: `validateSpeechConfig()` runs at module init (`onModuleInit`), not at provider construction. This means a misconfigured provider will fail at startup, not at first request. + +## Test Patterns + +### Mocking OpenAI SDK + +All provider tests mock the OpenAI SDK. 
Pattern: + +```typescript +vi.mock("openai", () => ({ + default: vi.fn().mockImplementation(() => ({ + audio: { + speech: { + create: vi.fn().mockResolvedValue({ + arrayBuffer: () => Promise.resolve(new ArrayBuffer(10)), + }), + }, + transcriptions: { + create: vi.fn().mockResolvedValue({ + text: "transcribed text", + language: "en", + duration: 3.5, + }), + }, + }, + models: { list: vi.fn().mockResolvedValue({ data: [] }) }, + })), +})); +``` + +### Mocking Config Injection + +```typescript +const mockConfig: SpeechConfig = { + stt: { enabled: true, baseUrl: "http://test:8000/v1", model: "test-model", language: "en" }, + tts: { + default: { enabled: true, url: "http://test:8880/v1", voice: "af_heart", format: "mp3" }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + limits: { maxUploadSize: 25000000, maxDurationSeconds: 600, maxTextLength: 4096 }, +}; +``` + +### Config Test Pattern + +`speech.config.spec.ts` saves and restores `process.env` around each test: + +```typescript +let savedEnv: NodeJS.ProcessEnv; +beforeEach(() => { + savedEnv = { ...process.env }; +}); +afterEach(() => { + process.env = savedEnv; +}); +``` + +## Key Files + +| File | Purpose | +| ----------------------------------- | ------------------------------------------------------------------------ | +| `speech.module.ts` | Module registration with conditional providers | +| `speech.config.ts` | All speech env vars + validation (51 tests) | +| `speech.service.ts` | Core service: transcribe, synthesize (with fallback), listVoices | +| `speech.controller.ts` | REST endpoints: POST transcribe, POST synthesize, GET voices, GET health | +| `speech.gateway.ts` | WebSocket streaming transcription (/speech namespace) | +| `providers/base-tts.provider.ts` | Abstract base for all TTS providers (OpenAI SDK wrapper) | +| `providers/tts-provider.factory.ts` | Creates provider instances from config | +| `interfaces/speech-types.ts` | All shared types: SpeechTier, 
AudioFormat, options, results | diff --git a/docs/SPEECH.md b/docs/SPEECH.md new file mode 100644 index 0000000..3ea7dd4 --- /dev/null +++ b/docs/SPEECH.md @@ -0,0 +1,929 @@ +# Speech Services + +Mosaic Stack provides integrated speech-to-text (STT) and text-to-speech (TTS) services through a provider abstraction layer. Speech services are optional and modular -- each component can be independently enabled, disabled, or pointed at external infrastructure. + +## Table of Contents + +- [Architecture Overview](#architecture-overview) +- [Provider Abstraction](#provider-abstraction) +- [TTS Tier System and Fallback Chain](#tts-tier-system-and-fallback-chain) +- [API Endpoint Reference](#api-endpoint-reference) +- [WebSocket Streaming Protocol](#websocket-streaming-protocol) +- [Environment Variable Reference](#environment-variable-reference) +- [Provider Configuration](#provider-configuration) +- [Voice Cloning Setup (Chatterbox)](#voice-cloning-setup-chatterbox) +- [Docker Compose Setup](#docker-compose-setup) +- [GPU VRAM Budget](#gpu-vram-budget) +- [Frontend Integration](#frontend-integration) + +--- + +## Architecture Overview + +``` + +-------------------+ + | SpeechController | + | (REST endpoints) | + +--------+----------+ + | + +--------------+--------------+ + | SpeechService | + | (provider selection, | + | fallback orchestration) | + +---------+----------+---------+ + | | + +------------+ +-----+-------+ + | | | + +------+------+ +-----+-----+ +-----+-----+ + | STT Provider| |TTS Provider| |TTS Provider| + | (Speaches) | |Map | |Map | + +------+------+ +-----+-----+ +-----+-----+ + | | | + +------+------+ +-----+-----+ +-----+-----+ + | Speaches | | Kokoro | | Chatterbox | + | (Whisper) | | (default) | | (premium) | + +-------------+ +-----------+ +-----+------+ + | + +-----+-----+ + | Piper | + | (fallback)| + +-----------+ + + +-------------------+ + | SpeechGateway | + | (WebSocket /speech) + +--------+----------+ + | + Uses SpeechService.transcribe() 
+```
+
+The speech module (`apps/api/src/speech/`) is a self-contained NestJS module consisting of:
+
+| Component  | File                   | Purpose                                    |
+| ---------- | ---------------------- | ------------------------------------------ |
+| Module     | `speech.module.ts`     | Registers providers, controllers, gateway  |
+| Config     | `speech.config.ts`     | Environment validation and typed config    |
+| Service    | `speech.service.ts`    | High-level speech operations with fallback |
+| Controller | `speech.controller.ts` | REST API endpoints                         |
+| Gateway    | `speech.gateway.ts`    | WebSocket streaming transcription          |
+| Constants  | `speech.constants.ts`  | NestJS injection tokens                    |
+
+### Key Design Decisions
+
+1. **OpenAI-compatible APIs**: All providers (Speaches, Kokoro, Chatterbox, Piper/OpenedAI) expose OpenAI-compatible endpoints. The official OpenAI SDK is used as the HTTP client with a custom `baseURL`.
+
+2. **Provider abstraction**: STT and TTS providers implement well-defined interfaces (`ISTTProvider`, `ITTSProvider`). New providers can be added without modifying the service layer.
+
+3. **Conditional registration**: Providers are only instantiated when their corresponding `*_ENABLED` flag is `true`. The STT provider uses NestJS `@Optional()` injection.
+
+4. **Fail-fast validation**: Configuration is validated at module initialization. If a service is enabled but its URL is missing, the application fails on startup with a descriptive error.
+
+---
+
+## Provider Abstraction
+
+### STT Provider Interface
+
+```typescript
+interface ISTTProvider {
+  readonly name: string;
+  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;
+  isHealthy(): Promise<boolean>;
+}
+```
+
+Currently implemented by `SpeachesSttProvider` which connects to a Speaches (faster-whisper) server. 
+
+### TTS Provider Interface
+
+```typescript
+interface ITTSProvider {
+  readonly name: string;
+  readonly tier: SpeechTier;
+  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;
+  listVoices(): Promise<VoiceInfo[]>;
+  isHealthy(): Promise<boolean>;
+}
+```
+
+All TTS providers extend `BaseTTSProvider`, an abstract class that implements common OpenAI-compatible synthesis logic. Concrete providers only need to set `name` and `tier` and optionally override `listVoices()` or `synthesize()`.
+
+### Provider Registration
+
+Providers are created by the `TTS Provider Factory` (`providers/tts-provider.factory.ts`) based on configuration:
+
+| Tier       | Provider Class          | Engine                    | Requirements |
+| ---------- | ----------------------- | ------------------------- | ------------ |
+| `default`  | `KokoroTtsProvider`     | Kokoro-FastAPI            | CPU only     |
+| `premium`  | `ChatterboxTTSProvider` | Chatterbox TTS Server     | NVIDIA GPU   |
+| `fallback` | `PiperTtsProvider`      | Piper via OpenedAI Speech | CPU only     |
+
+---
+
+## TTS Tier System and Fallback Chain
+
+TTS uses a tiered architecture with automatic fallback:
+
+```
+Request with tier="premium"
+    |
+    v
+[premium] Chatterbox available? --yes--> Use Chatterbox
+    |                              |
+    no                        (success/fail)
+    |
+    v
+[default] Kokoro available? ------yes--> Use Kokoro
+    |                              |
+    no                        (success/fail)
+    |
+    v
+[fallback] Piper available? -----yes--> Use Piper
+    |                              |
+    no                        (success/fail)
+    |
+    v
+ServiceUnavailableException
+```
+
+**Fallback order:** `premium` -> `default` -> `fallback`
+
+The fallback chain starts from the requested tier and proceeds downward. A tier is only attempted if:
+
+1. It is enabled in configuration (`TTS_ENABLED`, `TTS_PREMIUM_ENABLED`, `TTS_FALLBACK_ENABLED`)
+2. A provider is registered for that tier
+
+If no tier is specified in the request, `default` is used as the starting point. 
+ +--- + +## API Endpoint Reference + +All speech endpoints are under `/api/speech/` and require authentication (Bearer token) plus workspace context (`x-workspace-id` header). + +### POST /api/speech/transcribe + +Transcribe an uploaded audio file to text. + +**Authentication:** Bearer token + workspace membership +**Content-Type:** `multipart/form-data` + +**Form Fields:** + +| Field | Type | Required | Description | +| ------------- | ------ | -------- | ------------------------------------------------------ | +| `file` | File | Yes | Audio file (max 25 MB) | +| `language` | string | No | Language code (e.g., "en", "fr"). Default: from config | +| `model` | string | No | Whisper model override. Default: from config | +| `prompt` | string | No | Prompt to guide transcription (max 1000 chars) | +| `temperature` | number | No | Temperature 0.0-1.0. Lower = more deterministic | + +**Accepted Audio Formats:** +`audio/wav`, `audio/mp3`, `audio/mpeg`, `audio/webm`, `audio/ogg`, `audio/flac`, `audio/x-m4a` + +**Response:** + +```json +{ + "data": { + "text": "Hello, this is a transcription test.", + "language": "en", + "durationSeconds": 3.5, + "confidence": 0.95, + "segments": [ + { + "text": "Hello, this is a transcription test.", + "start": 0.0, + "end": 3.5, + "confidence": 0.95 + } + ] + } +} +``` + +**Example:** + +```bash +curl -X POST http://localhost:3001/api/speech/transcribe \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" \ + -F "file=@recording.wav" \ + -F "language=en" +``` + +### POST /api/speech/synthesize + +Synthesize text to audio using TTS providers. + +**Authentication:** Bearer token + workspace membership +**Content-Type:** `application/json` + +**Request Body:** + +| Field | Type | Required | Description | +| -------- | ------ | -------- | ----------------------------------------------------------- | +| `text` | string | Yes | Text to synthesize (max 4096 chars) | +| `voice` | string | No | Voice ID. 
Default: from config (e.g., "af_heart") | +| `speed` | number | No | Speed multiplier 0.5-2.0. Default: 1.0 | +| `format` | string | No | Output format: mp3, wav, opus, flac, aac, pcm. Default: mp3 | +| `tier` | string | No | Provider tier: default, premium, fallback. Default: default | + +**Response:** Binary audio data with appropriate `Content-Type` header. + +| Format | Content-Type | +| ------ | ------------ | +| mp3 | `audio/mpeg` | +| wav | `audio/wav` | +| opus | `audio/opus` | +| flac | `audio/flac` | +| aac | `audio/aac` | +| pcm | `audio/pcm` | + +**Example:** + +```bash +curl -X POST http://localhost:3001/api/speech/synthesize \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" \ + -H "Content-Type: application/json" \ + -d '{"text": "Hello world", "voice": "af_heart", "format": "mp3"}' \ + --output speech.mp3 +``` + +### GET /api/speech/voices + +List available TTS voices across all tiers. + +**Authentication:** Bearer token + workspace access +**Query Parameters:** + +| Parameter | Type | Required | Description | +| --------- | ------ | -------- | ------------------------------------------ | +| `tier` | string | No | Filter by tier: default, premium, fallback | + +**Response:** + +```json +{ + "data": [ + { + "id": "af_heart", + "name": "Heart (American Female)", + "language": "en-US", + "tier": "default", + "isDefault": true + }, + { + "id": "am_adam", + "name": "Adam (American Male)", + "language": "en-US", + "tier": "default", + "isDefault": false + } + ] +} +``` + +**Example:** + +```bash +curl -X GET 'http://localhost:3001/api/speech/voices?tier=default' \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" +``` + +### GET /api/speech/health + +Check availability of STT and TTS providers. 
+ +**Authentication:** Bearer token + workspace access + +**Response:** + +```json +{ + "data": { + "stt": { "available": true }, + "tts": { "available": true } + } +} +``` + +--- + +## WebSocket Streaming Protocol + +The speech module provides a WebSocket gateway at namespace `/speech` for real-time streaming transcription. Audio chunks are accumulated on the server and transcribed when the session is stopped. + +### Connection + +Connect to the `/speech` namespace with authentication: + +```typescript +import { io } from "socket.io-client"; + +const socket = io("http://localhost:3001/speech", { + auth: { token: "YOUR_SESSION_TOKEN" }, +}); +``` + +**Authentication methods** (checked in order): + +1. `auth.token` in handshake +2. `query.token` in handshake URL +3. `Authorization: Bearer ` header + +Connection is rejected if: + +- No valid token is provided +- Session verification fails +- User has no workspace membership + +**Connection timeout:** 5 seconds for authentication. + +### Protocol Flow + +``` +Client Server + | | + |--- connect (with token) ----->| + | | (authenticate, check workspace) + |<--- connected ----------------| + | | + |--- start-transcription ------>| { language?: "en" } + |<--- transcription-started ----| { sessionId, language } + | | + |--- audio-chunk -------------->| (Buffer/Uint8Array) + |--- audio-chunk -------------->| (Buffer/Uint8Array) + |--- audio-chunk -------------->| (Buffer/Uint8Array) + | | + |--- stop-transcription ------->| + | | (concatenate chunks, transcribe) + |<--- transcription-final ------| { text, language, durationSeconds, ... 
} + | | +``` + +### Client Events (emit) + +| Event | Payload | Description | +| --------------------- | ------------------------ | ---------------------------------------- | +| `start-transcription` | `{ language?: string }` | Begin a new transcription session | +| `audio-chunk` | `Buffer` or `Uint8Array` | Send audio data chunk | +| `stop-transcription` | (none) | Stop recording and trigger transcription | + +### Server Events (listen) + +| Event | Payload | Description | +| ----------------------- | ----------------------------------------------------------- | -------------------------- | +| `transcription-started` | `{ sessionId, language }` | Session created | +| `transcription-final` | `{ text, language, durationSeconds, confidence, segments }` | Transcription result | +| `transcription-error` | `{ message }` | Error during transcription | + +### Session Management + +- One active transcription session per client connection +- Starting a new session replaces any existing session +- Sessions are cleaned up on client disconnect +- Audio chunks are accumulated in memory +- Total accumulated size is capped by `SPEECH_MAX_UPLOAD_SIZE` (default: 25 MB) + +### Example Client Usage + +```typescript +import { io } from "socket.io-client"; + +const socket = io("http://localhost:3001/speech", { + auth: { token: sessionToken }, +}); + +// Start recording +socket.emit("start-transcription", { language: "en" }); + +socket.on("transcription-started", ({ sessionId }) => { + console.log("Session started:", sessionId); +}); + +// Stream audio chunks from MediaRecorder +mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + event.data.arrayBuffer().then((buffer) => { + socket.emit("audio-chunk", new Uint8Array(buffer)); + }); + } +}; + +// Stop and get result +socket.emit("stop-transcription"); + +socket.on("transcription-final", (result) => { + console.log("Transcription:", result.text); + console.log("Duration:", result.durationSeconds, "seconds"); +}); + 
+socket.on("transcription-error", ({ message }) => { + console.error("Transcription error:", message); +}); +``` + +--- + +## Environment Variable Reference + +### Speech-to-Text (STT) + +| Variable | Default | Description | +| -------------- | --------------------------------------- | ---------------------------------------------------- | +| `STT_ENABLED` | `false` | Enable speech-to-text transcription | +| `STT_BASE_URL` | `http://speaches:8000/v1` | Speaches server URL (required when STT_ENABLED=true) | +| `STT_MODEL` | `Systran/faster-whisper-large-v3-turbo` | Whisper model for transcription | +| `STT_LANGUAGE` | `en` | Default language code | + +### Text-to-Speech (TTS) - Default Engine (Kokoro) + +| Variable | Default | Description | +| -------------------- | --------------------------- | --------------------------------------------------- | +| `TTS_ENABLED` | `false` | Enable default TTS engine | +| `TTS_DEFAULT_URL` | `http://kokoro-tts:8880/v1` | Kokoro-FastAPI URL (required when TTS_ENABLED=true) | +| `TTS_DEFAULT_VOICE` | `af_heart` | Default Kokoro voice ID | +| `TTS_DEFAULT_FORMAT` | `mp3` | Default audio output format | + +### Text-to-Speech (TTS) - Premium Engine (Chatterbox) + +| Variable | Default | Description | +| --------------------- | ------------------------------- | ----------------------------------------------------------- | +| `TTS_PREMIUM_ENABLED` | `false` | Enable premium TTS engine | +| `TTS_PREMIUM_URL` | `http://chatterbox-tts:8881/v1` | Chatterbox TTS URL (required when TTS_PREMIUM_ENABLED=true) | + +### Text-to-Speech (TTS) - Fallback Engine (Piper/OpenedAI) + +| Variable | Default | Description | +| ---------------------- | -------------------------------- | ------------------------------------------------------------- | +| `TTS_FALLBACK_ENABLED` | `false` | Enable fallback TTS engine | +| `TTS_FALLBACK_URL` | `http://openedai-speech:8000/v1` | OpenedAI Speech URL (required when TTS_FALLBACK_ENABLED=true) | + +### Service Limits 
+ +| Variable | Default | Description | +| ----------------------------- | ---------- | ---------------------------------------------- | +| `SPEECH_MAX_UPLOAD_SIZE` | `25000000` | Maximum upload file size in bytes (25 MB) | +| `SPEECH_MAX_DURATION_SECONDS` | `600` | Maximum audio duration in seconds (10 minutes) | +| `SPEECH_MAX_TEXT_LENGTH` | `4096` | Maximum text length for TTS in characters | + +### Conditional Validation + +When a service is enabled, its URL variable is required. If missing, the application fails at startup with a message like: + +``` +STT is enabled (STT_ENABLED=true) but required environment variables are missing or empty: STT_BASE_URL. +Either set these variables or disable by setting STT_ENABLED=false. +``` + +Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values default to `false`. + +--- + +## Provider Configuration + +### Kokoro (Default Tier) + +**Engine:** [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI) +**License:** Apache 2.0 +**Requirements:** CPU only +**Docker Image:** `ghcr.io/remsky/kokoro-fastapi:latest-cpu` + +**Capabilities:** + +- 54 built-in voices across 8 languages +- Speed control: 0.25x to 4.0x +- Output formats: mp3, wav, opus, flac +- Voice metadata derived from ID prefix (language, gender, accent) + +**Voice ID Format:** `{lang}{gender}_{name}` + +- First character: language/accent (a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese) +- Second character: gender (f=Female, m=Male) + +**Example voices:** +| Voice ID | Name | Language | Gender | +|----------|------|----------|--------| +| `af_heart` | Heart | en-US | Female | +| `am_adam` | Adam | en-US | Male | +| `bf_alice` | Alice | en-GB | Female | +| `bm_daniel` | Daniel | en-GB | Male | +| `ef_dora` | Dora | es | Female | +| `ff_camille` | Camille | fr | Female | +| `jf_alpha` | Alpha | ja | Female | +| `zf_xiaobei` | Xiaobei | zh | Female | + +### Chatterbox (Premium Tier) + +**Engine:** 
[Chatterbox TTS Server](https://github.com/devnen/chatterbox-tts-server) +**License:** Proprietary +**Requirements:** NVIDIA GPU with CUDA +**Docker Image:** `devnen/chatterbox-tts-server:latest` + +**Capabilities:** + +- Voice cloning via reference audio sample +- Emotion exaggeration control (0.0 - 1.0) +- Cross-language voice transfer (23 languages) +- Higher quality synthesis than default tier + +**Supported Languages:** +en, fr, de, es, it, pt, nl, pl, ru, uk, ja, zh, ko, ar, hi, tr, sv, da, fi, no, cs, el, ro + +**Extended Options (Chatterbox-specific):** + +| Option | Type | Description | +| --------------------- | ------ | --------------------------------------------------------- | +| `referenceAudio` | Buffer | Audio sample for voice cloning (5-30 seconds recommended) | +| `emotionExaggeration` | number | Emotion intensity 0.0-1.0 (clamped) | + +These are passed as extra body parameters to the OpenAI-compatible endpoint. Reference audio is base64-encoded before sending. + +### Piper (Fallback Tier) + +**Engine:** [Piper](https://github.com/rhasspy/piper) via [OpenedAI Speech](https://github.com/matatonic/openedai-speech) +**License:** GPL (OpenedAI Speech) +**Requirements:** CPU only (runs on Raspberry Pi) +**Docker Image:** Use OpenedAI Speech image + +**Capabilities:** + +- 100+ voices across 40+ languages +- 6 standard OpenAI voice names (mapped to Piper voices) +- Output formats: mp3, wav, opus, flac +- Ultra-lightweight, designed for low-resource environments + +**Standard Voice Mapping:** + +| OpenAI Voice | Piper Voice | Gender | Description | +| ------------ | -------------------- | ------ | --------------------- | +| `alloy` | en_US-amy-medium | Female | Warm, balanced | +| `echo` | en_US-ryan-medium | Male | Clear, articulate | +| `fable` | en_GB-alan-medium | Male | British narrator | +| `onyx` | en_US-danny-low | Male | Deep, resonant | +| `nova` | en_US-lessac-medium | Female | Expressive, versatile | +| `shimmer` | en_US-kristin-medium | 
Female | Bright, energetic | + +### Speaches (STT) + +**Engine:** [Speaches](https://github.com/speaches-ai/speaches) (faster-whisper backend) +**License:** MIT +**Requirements:** CPU (GPU optional for faster inference) +**Docker Image:** `ghcr.io/speaches-ai/speaches:latest` + +**Capabilities:** + +- OpenAI-compatible `/v1/audio/transcriptions` endpoint +- Whisper models via faster-whisper +- Verbose JSON response with segments and timestamps +- Language detection + +**Default model:** `Systran/faster-whisper-large-v3-turbo` + +--- + +## Voice Cloning Setup (Chatterbox) + +Voice cloning is available through the Chatterbox premium TTS provider. + +### Prerequisites + +1. NVIDIA GPU with CUDA support +2. `nvidia-container-toolkit` installed on the Docker host +3. Docker runtime configured for GPU access +4. TTS premium tier enabled (`TTS_PREMIUM_ENABLED=true`) + +### Basic Voice Cloning + +Provide a reference audio sample (WAV or MP3, 5-30 seconds) when calling synthesize: + +```typescript +import { SpeechService } from "./speech.service"; +import type { ChatterboxSynthesizeOptions } from "./interfaces/speech-types"; + +const options: ChatterboxSynthesizeOptions = { + tier: "premium", + referenceAudio: myAudioBuffer, // 5-30 second audio sample + emotionExaggeration: 0.5, // 0.0 = neutral, 1.0 = maximum emotion +}; + +const result = await speechService.synthesize("Hello, this is my cloned voice!", options); +``` + +### Voice Cloning Tips + +- **Audio quality:** Use clean recordings without background noise +- **Duration:** 5-30 seconds works best; shorter clips may produce lower quality +- **Format:** WAV provides the best quality; MP3 is also accepted +- **Emotion:** Start with 0.5 (moderate) and adjust from there +- **Cross-language:** You can clone a voice in one language and synthesize in another + +--- + +## Docker Compose Setup + +### Development (Local) + +Speech services are defined in a separate overlay file `docker-compose.speech.yml`. 
This keeps them optional and separate from core services. + +**Start basic speech services (STT + default TTS):** + +```bash +# Using docker compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d + +# Using Makefile +make speech-up +``` + +**Start with premium TTS (requires NVIDIA GPU):** + +```bash +docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d +``` + +**Stop speech services:** + +```bash +# Using docker compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml down --remove-orphans + +# Using Makefile +make speech-down +``` + +**View logs:** + +```bash +make speech-logs +``` + +### Development Services + +| Service | Container | Port | Image | +| -------------- | --------------------- | ------------------------------- | ------------------------------------------ | +| Speaches (STT) | mosaic-speaches | 8090 (host) -> 8000 (container) | `ghcr.io/speaches-ai/speaches:latest` | +| Kokoro TTS | mosaic-kokoro-tts | 8880 (host) -> 8880 (container) | `ghcr.io/remsky/kokoro-fastapi:latest-cpu` | +| Chatterbox TTS | mosaic-chatterbox-tts | 8881 (host) -> 8000 (container) | `devnen/chatterbox-tts-server:latest` | + +### Production (Docker Swarm) + +For production deployments, use `docker/docker-compose.sample.speech.yml`. This file is designed for Docker Swarm with Traefik integration. 
+ +**Required environment variables:** + +```bash +STT_DOMAIN=stt.example.com +TTS_DOMAIN=tts.example.com +``` + +**Optional environment variables:** + +```bash +WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo +CHATTERBOX_TTS_DOMAIN=tts-premium.example.com +TRAEFIK_ENTRYPOINT=websecure +TRAEFIK_CERTRESOLVER=letsencrypt +TRAEFIK_DOCKER_NETWORK=traefik-public +TRAEFIK_TLS_ENABLED=true +``` + +**Deploy:** + +```bash +docker stack deploy -c docker/docker-compose.sample.speech.yml speech +``` + +**Connecting to Mosaic Stack:** Set the speech URLs in your Mosaic Stack `.env`: + +```bash +# Same Docker network +STT_BASE_URL=http://speaches:8000/v1 +TTS_DEFAULT_URL=http://kokoro-tts:8880/v1 + +# External / different network +STT_BASE_URL=https://stt.example.com/v1 +TTS_DEFAULT_URL=https://tts.example.com/v1 +``` + +### Health Checks + +All speech containers include health checks: + +| Service | Endpoint | Interval | Start Period | +| -------------- | ------------------------------ | -------- | ------------ | +| Speaches | `http://localhost:8000/health` | 30s | 120s | +| Kokoro TTS | `http://localhost:8880/health` | 30s | 120s | +| Chatterbox TTS | `http://localhost:8000/health` | 30s | 180s | + +Chatterbox has a longer start period (180s) because GPU model loading takes additional time. + +--- + +## GPU VRAM Budget + +Only Chatterbox requires GPU resources. The other providers (Speaches, Kokoro, Piper) are CPU-only. 
+ +### Chatterbox VRAM Requirements + +| Component | Approximate VRAM | +| ----------------------- | ------------------ | +| Chatterbox TTS model | ~2-4 GB | +| Voice cloning inference | ~1-2 GB additional | +| **Total recommended** | **4-6 GB** | + +### Shared GPU Considerations + +If running multiple GPU services (e.g., Ollama for LLM + Chatterbox for TTS): + +| Service | VRAM Usage | Notes | +| -------------------- | ----------- | --------------------------------- | +| Ollama (7B model) | ~4-6 GB | Depends on model size | +| Ollama (13B model) | ~8-10 GB | Larger models need more | +| Chatterbox TTS | ~4-6 GB | Voice cloning is memory-intensive | +| **Combined minimum** | **8-12 GB** | For 7B LLM + Chatterbox | + +**Recommendations:** + +- 8 GB VRAM: Adequate for small LLM + Chatterbox (may need to alternate) +- 12 GB VRAM: Comfortable for 7B LLM + Chatterbox simultaneously +- 24 GB VRAM: Supports larger LLMs + Chatterbox with headroom + +If VRAM is limited, consider: + +1. Disabling Chatterbox (`TTS_PREMIUM_ENABLED=false`) and using Kokoro (CPU) as default +2. Using the fallback chain so Kokoro handles requests when Chatterbox is busy +3. Running Chatterbox on a separate GPU host + +### Docker Swarm GPU Scheduling + +For Docker Swarm deployments with GPU, configure generic resources on the node: + +```json +// /etc/docker/daemon.json +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime" + } + }, + "node-generic-resources": ["NVIDIA-GPU=0"] +} +``` + +See the [Docker GPU Swarm documentation](https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm) for details. + +--- + +## Frontend Integration + +Speech services are consumed from the frontend through the REST API and WebSocket gateway. 
+ +### REST API Usage + +**Transcribe audio:** + +```typescript +async function transcribeAudio(file: File, token: string, workspaceId: string) { + const formData = new FormData(); + formData.append("file", file); + formData.append("language", "en"); + + const response = await fetch("/api/speech/transcribe", { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + body: formData, + }); + + const { data } = await response.json(); + return data.text; +} +``` + +**Synthesize speech:** + +```typescript +async function synthesizeSpeech( + text: string, + token: string, + workspaceId: string, + voice = "af_heart" +) { + const response = await fetch("/api/speech/synthesize", { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + "Content-Type": "application/json", + }, + body: JSON.stringify({ text, voice, format: "mp3" }), + }); + + const audioBlob = await response.blob(); + const audioUrl = URL.createObjectURL(audioBlob); + const audio = new Audio(audioUrl); + audio.play(); +} +``` + +**List voices:** + +```typescript +async function listVoices(token: string, workspaceId: string, tier?: string) { + const url = tier ? 
`/api/speech/voices?tier=${tier}` : "/api/speech/voices"; + + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + }); + + const { data } = await response.json(); + return data; // VoiceInfo[] +} +``` + +### WebSocket Streaming Usage + +For real-time transcription using the browser's MediaRecorder API: + +```typescript +import { io } from "socket.io-client"; + +function createSpeechSocket(token: string) { + const socket = io("/speech", { + auth: { token }, + }); + + let mediaRecorder: MediaRecorder | null = null; + + async function startRecording() { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaRecorder = new MediaRecorder(stream, { + mimeType: "audio/webm;codecs=opus", + }); + + socket.emit("start-transcription", { language: "en" }); + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + event.data.arrayBuffer().then((buffer) => { + socket.emit("audio-chunk", new Uint8Array(buffer)); + }); + } + }; + + mediaRecorder.start(250); // Send chunks every 250ms + } + + async function stopRecording(): Promise { + return new Promise((resolve, reject) => { + socket.once("transcription-final", (result) => { + resolve(result.text); + }); + + socket.once("transcription-error", ({ message }) => { + reject(new Error(message)); + }); + + if (mediaRecorder) { + mediaRecorder.stop(); + mediaRecorder.stream.getTracks().forEach((track) => track.stop()); + mediaRecorder = null; + } + + socket.emit("stop-transcription"); + }); + } + + return { socket, startRecording, stopRecording }; +} +``` + +### Check Speech Availability + +Before showing speech UI elements, check provider availability: + +```typescript +async function checkSpeechHealth(token: string, workspaceId: string) { + const response = await fetch("/api/speech/health", { + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + }); + + const { data } = await 
response.json(); + return { + canTranscribe: data.stt.available, + canSynthesize: data.tts.available, + }; +} +```