chore: upgrade Node.js runtime to v24 across codebase #419
39  .env.example
@@ -366,6 +366,45 @@ OLLAMA_MODEL=llama3.1:latest
# Get your API key from: https://platform.openai.com/api-keys
# OPENAI_API_KEY=sk-...

# ======================
# Speech Services (STT / TTS)
# ======================
# Speech-to-Text (STT) - Whisper via Speaches
# Set STT_ENABLED=true to enable speech-to-text transcription
# STT_BASE_URL is required when STT_ENABLED=true
STT_ENABLED=true
STT_BASE_URL=http://speaches:8000/v1
STT_MODEL=Systran/faster-whisper-large-v3-turbo
STT_LANGUAGE=en

# Text-to-Speech (TTS) - Default Engine (Kokoro)
# Set TTS_ENABLED=true to enable text-to-speech synthesis
# TTS_DEFAULT_URL is required when TTS_ENABLED=true
TTS_ENABLED=true
TTS_DEFAULT_URL=http://kokoro-tts:8880/v1
TTS_DEFAULT_VOICE=af_heart
TTS_DEFAULT_FORMAT=mp3

# Text-to-Speech (TTS) - Premium Engine (Chatterbox) - Optional
# Higher quality voice cloning engine, disabled by default
# TTS_PREMIUM_URL is required when TTS_PREMIUM_ENABLED=true
TTS_PREMIUM_ENABLED=false
TTS_PREMIUM_URL=http://chatterbox-tts:8881/v1

# Text-to-Speech (TTS) - Fallback Engine (Piper/OpenedAI) - Optional
# Lightweight fallback engine, disabled by default
# TTS_FALLBACK_URL is required when TTS_FALLBACK_ENABLED=true
TTS_FALLBACK_ENABLED=false
TTS_FALLBACK_URL=http://openedai-speech:8000/v1

# Speech Service Limits
# Maximum upload file size in bytes (default: 25MB)
SPEECH_MAX_UPLOAD_SIZE=25000000
# Maximum audio duration in seconds (default: 600 = 10 minutes)
SPEECH_MAX_DURATION_SECONDS=600
# Maximum text length for TTS in characters (default: 4096)
SPEECH_MAX_TEXT_LENGTH=4096

# ======================
# Mosaic Telemetry (Task Completion Tracking & Predictions)
# ======================
17  Makefile
@@ -1,4 +1,4 @@
.PHONY: help install dev build test docker-up docker-down docker-logs docker-ps docker-build docker-restart docker-test clean matrix-up matrix-down matrix-logs matrix-setup-bot
.PHONY: help install dev build test docker-up docker-down docker-logs docker-ps docker-build docker-restart docker-test speech-up speech-down speech-logs clean matrix-up matrix-down matrix-logs matrix-setup-bot

# Default target
help:
@@ -24,6 +24,11 @@ help:
	@echo "  make docker-test          Run Docker smoke test"
	@echo "  make docker-test-traefik  Run Traefik integration tests"
	@echo ""
	@echo "Speech Services:"
	@echo "  make speech-up            Start speech services (STT + TTS)"
	@echo "  make speech-down          Stop speech services"
	@echo "  make speech-logs          View speech service logs"
	@echo ""
	@echo "Matrix Dev Environment:"
	@echo "  make matrix-up            Start Matrix services (Synapse + Element)"
	@echo "  make matrix-down          Stop Matrix services"
@@ -91,6 +96,16 @@ docker-test:
docker-test-traefik:
	./tests/integration/docker/traefik.test.sh all

# Speech services
speech-up:
	docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d speaches kokoro-tts

speech-down:
	docker compose -f docker-compose.yml -f docker-compose.speech.yml down --remove-orphans

speech-logs:
	docker compose -f docker-compose.yml -f docker-compose.speech.yml logs -f speaches kokoro-tts

# Matrix Dev Environment
matrix-up:
	docker compose -f docker/docker-compose.yml -f docker/docker-compose.matrix.yml up -d
50  README.md
@@ -19,19 +19,20 @@ Mosaic Stack is a modern, PDA-friendly platform designed to help users manage th

## Technology Stack

| Layer          | Technology                                   |
| -------------- | -------------------------------------------- |
| **Frontend**   | Next.js 16 + React + TailwindCSS + Shadcn/ui |
| **Backend**    | NestJS + Prisma ORM                          |
| **Database**   | PostgreSQL 17 + pgvector                     |
| **Cache**      | Valkey (Redis-compatible)                    |
| **Auth**       | Authentik (OIDC) via BetterAuth              |
| **AI**         | Ollama (local or remote)                     |
| **Messaging**  | MoltBot (stock + plugins)                    |
| **Real-time**  | WebSockets (Socket.io)                       |
| **Monorepo**   | pnpm workspaces + TurboRepo                  |
| **Testing**    | Vitest + Playwright                          |
| **Deployment** | Docker + docker-compose                      |
| Layer          | Technology                                     |
| -------------- | ---------------------------------------------- |
| **Frontend**   | Next.js 16 + React + TailwindCSS + Shadcn/ui   |
| **Backend**    | NestJS + Prisma ORM                            |
| **Database**   | PostgreSQL 17 + pgvector                       |
| **Cache**      | Valkey (Redis-compatible)                      |
| **Auth**       | Authentik (OIDC) via BetterAuth                |
| **AI**         | Ollama (local or remote)                       |
| **Messaging**  | MoltBot (stock + plugins)                      |
| **Real-time**  | WebSockets (Socket.io)                         |
| **Speech**     | Speaches (STT) + Kokoro/Chatterbox/Piper (TTS) |
| **Monorepo**   | pnpm workspaces + TurboRepo                    |
| **Testing**    | Vitest + Playwright                            |
| **Deployment** | Docker + docker-compose                        |

## Quick Start
@@ -356,6 +357,29 @@ Mosaic Stack includes a sophisticated agent orchestration system for autonomous

See [Agent Orchestration Design](docs/design/agent-orchestration.md) for architecture details.

## Speech Services

Mosaic Stack includes integrated speech-to-text (STT) and text-to-speech (TTS) capabilities through a modular provider architecture. Each component is optional and independently configurable.

- **Speech-to-Text** - Transcribe audio files and real-time audio streams using Whisper (via Speaches)
- **Text-to-Speech** - Synthesize speech with 54+ voices across 8 languages (via Kokoro, CPU-based)
- **Premium Voice Cloning** - Clone voices from audio samples with emotion control (via Chatterbox, GPU)
- **Fallback TTS** - Ultra-lightweight CPU fallback for low-resource environments (via Piper/OpenedAI Speech)
- **WebSocket Streaming** - Real-time streaming transcription via Socket.IO `/speech` namespace
- **Automatic Fallback** - TTS tier system with graceful degradation (premium -> default -> fallback)

**Quick Start:**

```bash
# Start speech services alongside core stack
make speech-up

# Or with Docker Compose directly
docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d
```

See [Speech Services Documentation](docs/SPEECH.md) for architecture details, API reference, provider configuration, and deployment options.
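Once the services are up, a client call might look like this sketch (the endpoint path, port, and request fields are illustrative assumptions; docs/SPEECH.md has the real API reference):

```typescript
// Illustrative only -- endpoint path, port, and fields are assumptions; see docs/SPEECH.md.
const res = await fetch("http://localhost:3001/speech/synthesize", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({ text: "Hello from Mosaic", voice: "af_heart", format: "mp3" }),
});
const audio = await res.arrayBuffer(); // encoded audio bytes (mp3 here)
```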
## Current Implementation Status

### ✅ Completed (v0.0.1-0.0.6)
@@ -4,15 +4,22 @@

## Patterns

<!-- Add module-specific patterns as you discover them -->

- **Config validation pattern**: Config files use exported validation functions + typed getter functions (not class-validator). See `auth.config.ts`, `federation.config.ts`, `speech/speech.config.ts`. Pattern: export `isXEnabled()`, `validateXConfig()`, and `getXConfig()` functions.
- **Config registerAs**: `speech.config.ts` also exports a `registerAs("speech", ...)` factory for NestJS ConfigModule namespaced injection. Use `ConfigModule.forFeature(speechConfig)` in module imports and access via `this.config.get<string>('speech.stt.baseUrl')`.
- **Conditional config validation**: When a service has an enabled flag (e.g., `STT_ENABLED`), URL/connection vars are only required when enabled. Validation throws with a helpful message suggesting how to disable.
- **Boolean env parsing**: Use `value === "true" || value === "1"` pattern. No default-true -- all services default to disabled when env var is unset.
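A minimal sketch of the trio this pattern describes (names and defaults are illustrative; the real `speech.config.ts` covers STT, three TTS tiers, and limits):

```typescript
// Hypothetical excerpt showing the isXEnabled / validateXConfig / getXConfig convention.
export function isSttEnabled(): boolean {
  const value = process.env.STT_ENABLED;
  return value === "true" || value === "1"; // no default-true
}

export function validateSttConfig(): void {
  // URL is only required when the service is enabled; the error suggests how to disable.
  if (isSttEnabled() && !process.env.STT_BASE_URL) {
    throw new Error("STT_BASE_URL is required when STT_ENABLED=true (set STT_ENABLED=false to disable)");
  }
}

export function getSttConfig(): { enabled: boolean; baseUrl: string; model: string } {
  return {
    enabled: isSttEnabled(),
    baseUrl: process.env.STT_BASE_URL ?? "",
    model: process.env.STT_MODEL ?? "Systran/faster-whisper-large-v3-turbo",
  };
}
```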
## Gotchas

<!-- Add things that trip up agents in this module -->

- **Prisma client must be generated** before `tsc --noEmit` will pass. Run `pnpm prisma:generate` first. Pre-existing type errors from Prisma are expected in worktrees without generated client.
- **Pre-commit hooks**: lint-staged runs on staged files. If other packages' files are staged, their lint must pass too. Only stage files you intend to commit.
- **vitest runs all test files**: Even when targeting a specific test file, vitest loads all spec files. Many will fail if Prisma client isn't generated -- this is expected. Check only your target file's pass/fail status.

## Key Files

| File | Purpose |
| ---- | ------- |

<!-- Add important files in this directory -->

| File                                  | Purpose                                                                 |
| ------------------------------------- | ----------------------------------------------------------------------- |
| `src/speech/speech.config.ts`         | Speech services env var validation and typed config (STT, TTS, limits)  |
| `src/speech/speech.config.spec.ts`    | Unit tests for speech config validation (51 tests)                      |
| `src/auth/auth.config.ts`             | Auth/OIDC config validation (reference pattern)                         |
| `src/federation/federation.config.ts` | Federation config validation (reference pattern)                        |
@@ -38,6 +38,7 @@ import { CoordinatorIntegrationModule } from "./coordinator-integration/coordina
import { FederationModule } from "./federation/federation.module";
import { CredentialsModule } from "./credentials/credentials.module";
import { MosaicTelemetryModule } from "./mosaic-telemetry";
import { SpeechModule } from "./speech/speech.module";
import { RlsContextInterceptor } from "./common/interceptors/rls-context.interceptor";

@Module({
@@ -99,6 +100,7 @@ import { RlsContextInterceptor } from "./common/interceptors/rls-context.interce
    FederationModule,
    CredentialsModule,
    MosaicTelemetryModule,
    SpeechModule,
  ],
  controllers: [AppController, CsrfController],
  providers: [
247  apps/api/src/speech/AGENTS.md  (new file)
@@ -0,0 +1,247 @@
# speech — Agent Context

> Part of the `apps/api/src` layer. Speech-to-text (STT) and text-to-speech (TTS) services.

## Module Structure

```
speech/
├── speech.module.ts               # NestJS module (conditional provider registration)
├── speech.config.ts               # Environment validation + typed config (registerAs)
├── speech.config.spec.ts          # 51 config validation tests
├── speech.constants.ts            # NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS)
├── speech.controller.ts           # REST endpoints (transcribe, synthesize, voices, health)
├── speech.controller.spec.ts      # Controller tests
├── speech.service.ts              # High-level service with fallback orchestration
├── speech.service.spec.ts         # Service tests
├── speech.gateway.ts              # WebSocket gateway (/speech namespace)
├── speech.gateway.spec.ts         # Gateway tests
├── dto/
│   ├── transcribe.dto.ts          # Transcription request DTO (class-validator)
│   ├── synthesize.dto.ts          # Synthesis request DTO (class-validator)
│   └── index.ts                   # Barrel export
├── interfaces/
│   ├── speech-types.ts            # Shared types (SpeechTier, AudioFormat, options, results)
│   ├── stt-provider.interface.ts  # ISTTProvider contract
│   ├── tts-provider.interface.ts  # ITTSProvider contract
│   └── index.ts                   # Barrel export
├── pipes/
│   ├── audio-validation.pipe.ts   # Validates uploaded audio (MIME type, size)
│   ├── audio-validation.pipe.spec.ts
│   ├── text-validation.pipe.ts    # Validates TTS text input (non-empty, max length)
│   ├── text-validation.pipe.spec.ts
│   └── index.ts                   # Barrel export
└── providers/
    ├── base-tts.provider.ts       # Abstract base class (OpenAI SDK + common logic)
    ├── base-tts.provider.spec.ts
    ├── kokoro-tts.provider.ts     # Default tier (CPU, 53 voices, 8 languages)
    ├── kokoro-tts.provider.spec.ts
    ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control)
    ├── chatterbox-tts.provider.spec.ts
    ├── piper-tts.provider.ts      # Fallback tier (CPU, lightweight, Raspberry Pi)
    ├── piper-tts.provider.spec.ts
    ├── speaches-stt.provider.ts   # STT provider (Whisper via Speaches)
    ├── speaches-stt.provider.spec.ts
    ├── tts-provider.factory.ts    # Factory: creates providers from config
    └── tts-provider.factory.spec.ts
```

## Codebase Patterns

### Provider Pattern (BaseTTSProvider + Factory)

All TTS providers extend `BaseTTSProvider`:

```typescript
export class MyNewProvider extends BaseTTSProvider {
  readonly name = "my-provider";
  readonly tier: SpeechTier = "default"; // or "premium" or "fallback"

  constructor(baseURL: string) {
    super(baseURL, "default-voice-id", "mp3");
  }

  // Override listVoices() for custom voice catalog
  override listVoices(): Promise<VoiceInfo[]> { ... }

  // Override synthesize() only if non-standard API behavior is needed
  // (see ChatterboxTTSProvider for example with extra body params)
}
```

The base class handles:

- OpenAI SDK client creation with custom `baseURL` and `apiKey: "not-needed"`
- Standard `synthesize()` via `client.audio.speech.create()`
- Default `listVoices()` returning just the default voice
- `isHealthy()` via GET to the `/v1/models` endpoint
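A condensed sketch of what that base class could look like (simplified; the real `base-tts.provider.ts` adds logging and richer error handling, and the model name below is an assumption):

```typescript
import OpenAI from "openai";
import type { ITTSProvider } from "../interfaces";
import type { AudioFormat, SpeechTier, SynthesizeOptions, SynthesisResult, VoiceInfo } from "../interfaces/speech-types";

export abstract class BaseTTSProvider implements ITTSProvider {
  abstract readonly name: string;
  abstract readonly tier: SpeechTier;
  protected readonly client: OpenAI;

  constructor(
    protected readonly baseURL: string,
    protected readonly defaultVoice: string = "alloy",
    protected readonly defaultFormat: AudioFormat = "mp3"
  ) {
    // Self-hosted OpenAI-compatible endpoints don't check the key.
    this.client = new OpenAI({ baseURL, apiKey: "not-needed" });
  }

  async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const response = await this.client.audio.speech.create({
      model: "tts-1", // model name is ignored by most self-hosted engines (assumption)
      input: text,
      // The SDK types voice as a literal union; self-hosted servers accept any ID.
      voice: voice as "alloy",
      response_format: format,
      speed: options?.speed,
    });
    return { audio: Buffer.from(await response.arrayBuffer()), format, voice, tier: this.tier };
  }

  async listVoices(): Promise<VoiceInfo[]> {
    // Default: just the default voice; providers override with a real catalog.
    return [{ id: this.defaultVoice, name: this.defaultVoice, tier: this.tier, isDefault: true }];
  }

  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list(); // GET {baseURL}/models, i.e. /v1/models
      return true;
    } catch {
      return false;
    }
  }
}
```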
### Config Pattern

Config follows the existing pattern (`auth.config.ts`, `federation.config.ts`):

- Export `isSttEnabled()`, `isTtsEnabled()`, etc. (boolean checks from env)
- Export `validateSpeechConfig()` (called at module init, throws on missing required vars)
- Export `getSpeechConfig()` (typed config object with defaults)
- Export `speechConfig = registerAs("speech", ...)` for NestJS ConfigModule

Boolean env parsing: `value === "true" || value === "1"`. No default-true.
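A sketch of how the `registerAs` factory composes with the getter (the real file may inline the object instead):

```typescript
import { registerAs } from "@nestjs/config";

// getSpeechConfig() is defined in the same file (see pattern above).
// Namespaced factory: ConfigModule.forFeature(speechConfig) exposes values
// under the "speech" prefix, e.g. this.config.get<string>("speech.stt.baseUrl").
export const speechConfig = registerAs("speech", () => getSpeechConfig());
```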
### Conditional Provider Registration

In `speech.module.ts`:

- STT provider uses `isSttEnabled()` at module definition time to decide whether to register
- TTS providers use a factory function injected with `ConfigService`
- `@Optional()` decorator on `SpeechService`'s `sttProvider` handles the case where STT is disabled
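A sketch of what that wiring plausibly looks like (the factory function name and constructor arguments are assumptions; only the tokens and the `@Optional()` usage are confirmed above):

```typescript
import { Inject, Injectable, Module, Optional } from "@nestjs/common";
import { ConfigModule, ConfigService } from "@nestjs/config";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { isSttEnabled, getSpeechConfig, speechConfig } from "./speech.config";
import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; // constructor args below are assumptions
import { createTtsProviders } from "./providers/tts-provider.factory"; // function name is an assumption
import type { ISTTProvider } from "./interfaces";

@Module({
  imports: [ConfigModule.forFeature(speechConfig)],
  providers: [
    // STT is registered only when enabled -- decided at module definition time.
    ...(isSttEnabled()
      ? [{ provide: STT_PROVIDER, useValue: new SpeachesSttProvider(getSpeechConfig().stt.baseUrl) }]
      : []),
    // TTS providers come from a factory that reads the namespaced config.
    {
      provide: TTS_PROVIDERS,
      inject: [ConfigService],
      useFactory: (config: ConfigService) => createTtsProviders(config),
    },
  ],
})
export class SpeechModuleSketch {}

// Consumer side: @Optional() lets SpeechService construct when STT is disabled.
@Injectable()
class SpeechServiceSketch {
  constructor(@Optional() @Inject(STT_PROVIDER) private readonly sttProvider?: ISTTProvider) {}
}
```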
### Injection Tokens

```typescript
// speech.constants.ts
export const STT_PROVIDER = Symbol("STT_PROVIDER"); // ISTTProvider
export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS"); // Map<SpeechTier, ITTSProvider>
```

### Fallback Chain

TTS fallback order: `premium` -> `default` -> `fallback`

- Chain starts at the requested tier and goes downward
- Only tiers that are both enabled AND have a registered provider are attempted
- `ServiceUnavailableException` if all providers fail
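A sketch of the fallback loop this implies (illustrative; the real `speech.service.ts` may structure it differently):

```typescript
import { ServiceUnavailableException } from "@nestjs/common";
import type { ITTSProvider } from "./interfaces";
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "./interfaces/speech-types";

const TIER_ORDER: readonly SpeechTier[] = ["premium", "default", "fallback"];

async function synthesizeWithFallback(
  providers: Map<SpeechTier, ITTSProvider>,
  text: string,
  options?: SynthesizeOptions
): Promise<SynthesisResult> {
  // Start at the requested tier and walk downward through the chain.
  const start = TIER_ORDER.indexOf(options?.tier ?? "default");
  for (const tier of TIER_ORDER.slice(start)) {
    const provider = providers.get(tier); // only enabled tiers are registered
    if (!provider) continue;
    try {
      return await provider.synthesize(text, options);
    } catch {
      // Fall through to the next tier.
    }
  }
  throw new ServiceUnavailableException("All TTS providers failed");
}
```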
### WebSocket Gateway

- Separate `/speech` namespace (not on the main gateway)
- Authentication mirrors the main WS gateway pattern (token extraction from handshake)
- One session per client, accumulates audio chunks in memory
- Chunks concatenated and transcribed on `stop-transcription`
- Session cleanup on disconnect
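A sketch of that session lifecycle (event and class names other than `stop-transcription` are illustrative, and auth is omitted):

```typescript
import { ConnectedSocket, MessageBody, OnGatewayDisconnect, SubscribeMessage, WebSocketGateway } from "@nestjs/websockets";
import type { Socket } from "socket.io";

@WebSocketGateway({ namespace: "/speech" })
export class SpeechGatewaySketch implements OnGatewayDisconnect {
  // One in-memory session (accumulated chunks) per connected client.
  private readonly sessions = new Map<string, Buffer[]>();

  @SubscribeMessage("audio-chunk") // event name is an assumption
  handleChunk(@ConnectedSocket() client: Socket, @MessageBody() chunk: Buffer): void {
    const chunks = this.sessions.get(client.id) ?? [];
    chunks.push(chunk);
    this.sessions.set(client.id, chunks);
  }

  @SubscribeMessage("stop-transcription")
  handleStop(@ConnectedSocket() client: Socket): void {
    const audio = Buffer.concat(this.sessions.get(client.id) ?? []);
    this.sessions.delete(client.id);
    // Hand the concatenated audio to the STT provider (via SpeechService), then
    // emit the result back, e.g. client.emit("transcription", result).
    void audio;
  }

  handleDisconnect(client: Socket): void {
    this.sessions.delete(client.id); // session cleanup on disconnect
  }
}
```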
## How to Add a New TTS Provider

1. **Create the provider class** in `providers/`:

   ```typescript
   // providers/my-tts.provider.ts
   import { BaseTTSProvider } from "./base-tts.provider";
   import type { SpeechTier, VoiceInfo } from "../interfaces/speech-types";

   export class MyTtsProvider extends BaseTTSProvider {
     readonly name = "my-provider";
     readonly tier: SpeechTier = "default"; // Choose tier

     constructor(baseURL: string) {
       super(baseURL, "default-voice", "mp3");
     }

     override listVoices(): Promise<VoiceInfo[]> {
       // Return your voice catalog
     }
   }
   ```

2. **Add env vars** to `speech.config.ts`:
   - Add enabled check function
   - Add URL to validation in `validateSpeechConfig()`
   - Add config section in `getSpeechConfig()`

3. **Register in factory** (`tts-provider.factory.ts`):

   ```typescript
   if (config.tts.myTier.enabled) {
     const provider = new MyTtsProvider(config.tts.myTier.url);
     providers.set("myTier", provider);
   }
   ```

4. **Add env vars** to `.env.example`

5. **Write tests** following existing patterns (mock OpenAI SDK, test synthesis + listVoices + isHealthy)

## How to Add a New STT Provider

1. **Implement `ISTTProvider`** (does not use a base class -- STT has only one implementation currently); see the sketch after this list
2. **Add config section** similar to `stt` in `speech.config.ts`
3. **Register** in `speech.module.ts` providers array with `STT_PROVIDER` token
4. **Write tests** following `speaches-stt.provider.spec.ts` pattern
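For orientation, a minimal sketch of such a provider (modeled loosely on the Speaches provider; the class name and exact request fields are assumptions):

```typescript
// providers/my-stt.provider.ts -- hypothetical example, not a file in this PR
import OpenAI, { toFile } from "openai";
import type { ISTTProvider } from "../interfaces";
import type { TranscribeOptions, TranscriptionResult } from "../interfaces/speech-types";

export class MySttProvider implements ISTTProvider {
  readonly name = "my-stt";
  private readonly client: OpenAI;

  constructor(baseURL: string, private readonly defaultModel: string) {
    this.client = new OpenAI({ baseURL, apiKey: "not-needed" });
  }

  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    // toFile (imported from "openai") wraps the Buffer for the multipart upload.
    const file = await toFile(audio, "audio", { type: options?.mimeType ?? "audio/wav" });
    const response = await this.client.audio.transcriptions.create({
      file,
      model: options?.model ?? this.defaultModel,
      language: options?.language,
      prompt: options?.prompt,
      temperature: options?.temperature,
    });
    return { text: response.text, language: options?.language ?? "unknown" };
  }

  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
      return true;
    } catch {
      return false;
    }
  }
}
```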
## Common Gotchas

- **OpenAI SDK `apiKey`**: Self-hosted services do not require an API key. Use `apiKey: "not-needed"` when creating the OpenAI client.
- **`toFile()` import**: The `toFile` helper is imported from `"openai"` (not from a subpath). Used in the STT provider to convert a Buffer to a File-like object for multipart upload.
- **Health check URL**: `BaseTTSProvider.isHealthy()` calls `GET /v1/models`. The base URL is expected to end with `/v1`.
- **Voice ID prefix parsing**: Kokoro voice IDs encode language + gender in the first two characters. See `parseVoicePrefix()` in `kokoro-tts.provider.ts`.
- **Chatterbox extra body params**: The `reference_audio` (base64) and `exaggeration` fields are passed via the OpenAI SDK by casting the request body. This works because the SDK passes through unknown fields.
- **WebSocket auth**: The gateway checks `auth.token`, then `query.token`, then the `Authorization` header (in that order). Match this in test setup.
- **Config validation timing**: `validateSpeechConfig()` runs at module init (`onModuleInit`), not at provider construction. This means a misconfigured provider will fail at startup, not at first request.
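To make the Chatterbox gotcha concrete, a sketch of the extra-body cast (field names come from the gotcha above; the model name and surrounding code are illustrative):

```typescript
import OpenAI from "openai";

// Hypothetical fragment: synthesizing with Chatterbox's extra body params.
async function chatterboxSynthesize(
  client: OpenAI,
  text: string,
  referenceAudio?: Buffer,
  exaggeration?: number
): Promise<Buffer> {
  type SpeechParams = Parameters<typeof client.audio.speech.create>[0];
  const response = await client.audio.speech.create({
    model: "chatterbox", // model name is an assumption
    input: text,
    voice: "default" as "alloy", // SDK types voice as a literal union; the server accepts any ID
    // The cast lets non-standard fields ride along; the SDK passes unknown fields through.
    reference_audio: referenceAudio?.toString("base64"),
    exaggeration,
  } as SpeechParams);
  return Buffer.from(await response.arrayBuffer());
}
```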
## Test Patterns

### Mocking OpenAI SDK

All provider tests mock the OpenAI SDK. Pattern:

```typescript
vi.mock("openai", () => ({
  default: vi.fn().mockImplementation(() => ({
    audio: {
      speech: {
        create: vi.fn().mockResolvedValue({
          arrayBuffer: () => Promise.resolve(new ArrayBuffer(10)),
        }),
      },
      transcriptions: {
        create: vi.fn().mockResolvedValue({
          text: "transcribed text",
          language: "en",
          duration: 3.5,
        }),
      },
    },
    models: { list: vi.fn().mockResolvedValue({ data: [] }) },
  })),
}));
```

### Mocking Config Injection

```typescript
const mockConfig: SpeechConfig = {
  stt: { enabled: true, baseUrl: "http://test:8000/v1", model: "test-model", language: "en" },
  tts: {
    default: { enabled: true, url: "http://test:8880/v1", voice: "af_heart", format: "mp3" },
    premium: { enabled: false, url: "" },
    fallback: { enabled: false, url: "" },
  },
  limits: { maxUploadSize: 25000000, maxDurationSeconds: 600, maxTextLength: 4096 },
};
```

### Config Test Pattern

`speech.config.spec.ts` saves and restores `process.env` around each test:

```typescript
let savedEnv: NodeJS.ProcessEnv;
beforeEach(() => {
  savedEnv = { ...process.env };
});
afterEach(() => {
  process.env = savedEnv;
});
```

## Key Files

| File                                | Purpose                                                                  |
| ----------------------------------- | ------------------------------------------------------------------------ |
| `speech.module.ts`                  | Module registration with conditional providers                           |
| `speech.config.ts`                  | All speech env vars + validation (51 tests)                              |
| `speech.service.ts`                 | Core service: transcribe, synthesize (with fallback), listVoices         |
| `speech.controller.ts`              | REST endpoints: POST transcribe, POST synthesize, GET voices, GET health |
| `speech.gateway.ts`                 | WebSocket streaming transcription (/speech namespace)                    |
| `providers/base-tts.provider.ts`    | Abstract base for all TTS providers (OpenAI SDK wrapper)                 |
| `providers/tts-provider.factory.ts` | Creates provider instances from config                                   |
| `interfaces/speech-types.ts`        | All shared types: SpeechTier, AudioFormat, options, results              |
8  apps/api/src/speech/dto/index.ts  (new file)
@@ -0,0 +1,8 @@
/**
 * Speech DTOs barrel export
 *
 * Issue #398
 */

export { TranscribeDto } from "./transcribe.dto";
export { SynthesizeDto } from "./synthesize.dto";
69  apps/api/src/speech/dto/synthesize.dto.ts  (new file)
@@ -0,0 +1,69 @@
/**
 * SynthesizeDto
 *
 * DTO for text-to-speech synthesis requests.
 * Text and option fields are validated by class-validator decorators.
 * Additional options control voice, speed, format, and tier selection.
 *
 * Issue #398
 */

import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
import { Type } from "class-transformer";
import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";

export class SynthesizeDto {
  /**
   * Text to convert to speech.
   * Validated by class-validator decorators for type and maximum length.
   */
  @IsString({ message: "text must be a string" })
  @MaxLength(4096, { message: "text must not exceed 4096 characters" })
  text!: string;

  /**
   * Voice ID to use for synthesis.
   * Available voices depend on the selected tier and provider.
   * If omitted, the default voice from speech config is used.
   */
  @IsOptional()
  @IsString({ message: "voice must be a string" })
  @MaxLength(100, { message: "voice must not exceed 100 characters" })
  voice?: string;

  /**
   * Speech speed multiplier (0.5 to 2.0).
   * 1.0 is normal speed, <1.0 is slower, >1.0 is faster.
   */
  @IsOptional()
  @Type(() => Number)
  @IsNumber({}, { message: "speed must be a number" })
  @Min(0.5, { message: "speed must be at least 0.5" })
  @Max(2.0, { message: "speed must not exceed 2.0" })
  speed?: number;

  /**
   * Desired audio output format.
   * Supported: mp3, wav, opus, flac, aac, pcm.
   * If omitted, the default format from speech config is used.
   */
  @IsOptional()
  @IsString({ message: "format must be a string" })
  @IsIn(AUDIO_FORMATS, {
    message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`,
  })
  format?: AudioFormat;

  /**
   * TTS tier to use for synthesis.
   * Controls which provider is used: default (Kokoro), premium (Chatterbox), or fallback (Piper).
   * If the selected tier is unavailable, the service falls back to the next available tier.
   */
  @IsOptional()
  @IsString({ message: "tier must be a string" })
  @IsIn(SPEECH_TIERS, {
    message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`,
  })
  tier?: SpeechTier;
}
54  apps/api/src/speech/dto/transcribe.dto.ts  (new file)
@@ -0,0 +1,54 @@
/**
 * TranscribeDto
 *
 * DTO for speech-to-text transcription requests.
 * Supports optional language and model overrides.
 *
 * The audio file itself is handled by Multer (FileInterceptor)
 * and validated by AudioValidationPipe.
 *
 * Issue #398
 */

import { IsString, IsOptional, IsNumber, Min, Max, MaxLength } from "class-validator";
import { Type } from "class-transformer";

export class TranscribeDto {
  /**
   * Language code for transcription (e.g., "en", "fr", "de").
   * If omitted, the default from speech config is used.
   */
  @IsOptional()
  @IsString({ message: "language must be a string" })
  @MaxLength(10, { message: "language must not exceed 10 characters" })
  language?: string;

  /**
   * Model override for transcription.
   * If omitted, the default model from speech config is used.
   */
  @IsOptional()
  @IsString({ message: "model must be a string" })
  @MaxLength(200, { message: "model must not exceed 200 characters" })
  model?: string;

  /**
   * Optional prompt to guide the transcription model.
   * Useful for providing context or expected vocabulary.
   */
  @IsOptional()
  @IsString({ message: "prompt must be a string" })
  @MaxLength(1000, { message: "prompt must not exceed 1000 characters" })
  prompt?: string;

  /**
   * Temperature for transcription (0.0 to 1.0).
   * Lower values produce more deterministic results.
   */
  @IsOptional()
  @Type(() => Number)
  @IsNumber({}, { message: "temperature must be a number" })
  @Min(0, { message: "temperature must be at least 0" })
  @Max(1, { message: "temperature must not exceed 1" })
  temperature?: number;
}
19  apps/api/src/speech/interfaces/index.ts  (new file)
@@ -0,0 +1,19 @@
/**
 * Speech interfaces barrel export.
 *
 * Issue #389
 */

export type { ISTTProvider } from "./stt-provider.interface";
export type { ITTSProvider } from "./tts-provider.interface";
export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types";
export type {
  SpeechTier,
  AudioFormat,
  TranscribeOptions,
  TranscriptionResult,
  TranscriptionSegment,
  SynthesizeOptions,
  SynthesisResult,
  VoiceInfo,
} from "./speech-types";
178  apps/api/src/speech/interfaces/speech-types.ts  (new file)
@@ -0,0 +1,178 @@
/**
 * Speech Types
 *
 * Shared types for speech-to-text (STT) and text-to-speech (TTS) services.
 * Used by provider interfaces and the SpeechService.
 *
 * Issue #389
 */

// ==========================================
// Enums / Discriminators
// ==========================================

/**
 * Canonical array of TTS provider tiers.
 * Determines which TTS engine is used for synthesis.
 *
 * - default: Primary TTS engine (e.g., Kokoro)
 * - premium: Higher quality TTS engine (e.g., Chatterbox)
 * - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
 */
export const SPEECH_TIERS = ["default", "premium", "fallback"] as const;
export type SpeechTier = (typeof SPEECH_TIERS)[number];

/**
 * Canonical array of audio output formats for TTS synthesis.
 */
export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const;
export type AudioFormat = (typeof AUDIO_FORMATS)[number];

// ==========================================
// STT Types
// ==========================================

/**
 * Options for speech-to-text transcription.
 */
export interface TranscribeOptions {
  /** Language code (e.g., "en", "fr", "de") */
  language?: string;

  /** Model to use for transcription */
  model?: string;

  /** MIME type of the audio (e.g., "audio/mp3", "audio/wav") */
  mimeType?: string;

  /** Optional prompt to guide transcription */
  prompt?: string;

  /** Temperature for transcription (0.0 - 1.0) */
  temperature?: number;
}

/**
 * Result of a speech-to-text transcription.
 */
export interface TranscriptionResult {
  /** Transcribed text */
  text: string;

  /** Language detected or used */
  language: string;

  /** Duration of the audio in seconds */
  durationSeconds?: number;

  /** Confidence score (0.0 - 1.0, if available) */
  confidence?: number;

  /** Individual word or segment timings (if available) */
  segments?: TranscriptionSegment[];
}

/**
 * A segment within a transcription result.
 */
export interface TranscriptionSegment {
  /** Segment text */
  text: string;

  /** Start time in seconds */
  start: number;

  /** End time in seconds */
  end: number;

  /** Confidence for this segment */
  confidence?: number;
}

// ==========================================
// TTS Types
// ==========================================

/**
 * Options for text-to-speech synthesis.
 */
export interface SynthesizeOptions {
  /** Voice ID to use */
  voice?: string;

  /** Desired audio format */
  format?: AudioFormat;

  /** Speech speed multiplier (0.5 - 2.0) */
  speed?: number;

  /** Preferred TTS tier */
  tier?: SpeechTier;
}

/**
 * Result of a text-to-speech synthesis.
 */
export interface SynthesisResult {
  /** Synthesized audio data */
  audio: Buffer;

  /** Audio format of the result */
  format: AudioFormat;

  /** Voice used for synthesis */
  voice: string;

  /** Tier that produced the synthesis */
  tier: SpeechTier;

  /** Duration of the generated audio in seconds (if available) */
  durationSeconds?: number;
}

/**
 * Extended options for Chatterbox TTS synthesis.
 *
 * Chatterbox supports voice cloning via a reference audio buffer and
 * emotion exaggeration control. These are passed as extra body parameters
 * to the OpenAI-compatible API.
 *
 * Issue #394
 */
export interface ChatterboxSynthesizeOptions extends SynthesizeOptions {
  /**
   * Reference audio buffer for voice cloning.
   * When provided, Chatterbox will clone the voice from this audio sample.
   * Should be a WAV or MP3 file of 5-30 seconds for best results.
   */
  referenceAudio?: Buffer;

  /**
   * Emotion exaggeration factor (0.0 to 1.0).
   * Controls how much emotional expression is applied to the synthesized speech.
   * - 0.0: Neutral, minimal emotion
   * - 0.5: Moderate emotion (default when not specified)
   * - 1.0: Maximum emotion exaggeration
   */
  emotionExaggeration?: number;
}

/**
 * Information about an available TTS voice.
 */
export interface VoiceInfo {
  /** Voice identifier */
  id: string;

  /** Human-readable voice name */
  name: string;

  /** Language code */
  language?: string;

  /** Tier this voice belongs to */
  tier: SpeechTier;

  /** Whether this is the default voice for its tier */
  isDefault?: boolean;
}
52  apps/api/src/speech/interfaces/stt-provider.interface.ts  (new file)
@@ -0,0 +1,52 @@
/**
 * STT Provider Interface
 *
 * Defines the contract for speech-to-text provider implementations.
 * All STT providers (e.g., Speaches/faster-whisper) must implement this interface.
 *
 * Issue #389
 */

import type { TranscribeOptions, TranscriptionResult } from "./speech-types";

/**
 * Interface for speech-to-text providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for transcription.
 *
 * @example
 * ```typescript
 * class SpeachesSttProvider implements ISTTProvider {
 *   readonly name = "speaches";
 *
 *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
 *     // Call speaches API via OpenAI SDK
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ISTTProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /**
   * Transcribe audio data to text.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters
   * @returns Transcription result with text and metadata
   * @throws {Error} If transcription fails
   */
  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;

  /**
   * Check if the provider is healthy and available.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}
68  apps/api/src/speech/interfaces/tts-provider.interface.ts  (new file)
@@ -0,0 +1,68 @@
/**
 * TTS Provider Interface
 *
 * Defines the contract for text-to-speech provider implementations.
 * All TTS providers (e.g., Kokoro, Chatterbox, Piper/OpenedAI) must implement this interface.
 *
 * Issue #389
 */

import type { SynthesizeOptions, SynthesisResult, VoiceInfo, SpeechTier } from "./speech-types";

/**
 * Interface for text-to-speech providers.
 *
 * Implementations wrap an OpenAI-compatible API endpoint for speech synthesis.
 * Each provider is associated with a SpeechTier (default, premium, fallback).
 *
 * @example
 * ```typescript
 * class KokoroProvider implements ITTSProvider {
 *   readonly name = "kokoro";
 *   readonly tier = "default";
 *
 *   async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
 *     // Call Kokoro API via OpenAI SDK
 *   }
 *
 *   async listVoices(): Promise<VoiceInfo[]> {
 *     // Return available voices
 *   }
 *
 *   async isHealthy(): Promise<boolean> {
 *     // Check endpoint health
 *   }
 * }
 * ```
 */
export interface ITTSProvider {
  /** Provider name for logging and identification */
  readonly name: string;

  /** Tier this provider serves (default, premium, fallback) */
  readonly tier: SpeechTier;

  /**
   * Synthesize text to audio.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, speed)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails
   */
  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;

  /**
   * List available voices for this provider.
   *
   * @returns Array of voice information objects
   */
  listVoices(): Promise<VoiceInfo[]>;

  /**
   * Check if the provider is healthy and available.
   *
   * @returns true if the provider endpoint is reachable and ready
   */
  isHealthy(): Promise<boolean>;
}
205  apps/api/src/speech/pipes/audio-validation.pipe.spec.ts  (new file)
@@ -0,0 +1,205 @@
/**
 * AudioValidationPipe Tests
 *
 * Issue #398: Validates uploaded audio files for MIME type and file size.
 * Tests cover valid types, invalid types, size limits, and edge cases.
 */

import { describe, it, expect, beforeEach } from "vitest";
import { BadRequestException } from "@nestjs/common";
import { AudioValidationPipe } from "./audio-validation.pipe";

/**
 * Helper to create a mock Express.Multer.File object.
 */
function createMockFile(overrides: Partial<Express.Multer.File> = {}): Express.Multer.File {
  return {
    fieldname: "file",
    originalname: "test.mp3",
    encoding: "7bit",
    mimetype: "audio/mpeg",
    size: 1024,
    destination: "",
    filename: "",
    path: "",
    buffer: Buffer.from("fake-audio-data"),
    stream: undefined as never,
    ...overrides,
  };
}

describe("AudioValidationPipe", () => {
  // ==========================================
  // Default config (25MB max)
  // ==========================================
  describe("with default config", () => {
    let pipe: AudioValidationPipe;

    beforeEach(() => {
      pipe = new AudioValidationPipe();
    });

    // ==========================================
    // MIME type validation
    // ==========================================
    describe("MIME type validation", () => {
      it("should accept audio/wav", () => {
        const file = createMockFile({ mimetype: "audio/wav" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept audio/mp3", () => {
        const file = createMockFile({ mimetype: "audio/mp3" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept audio/mpeg", () => {
        const file = createMockFile({ mimetype: "audio/mpeg" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept audio/webm", () => {
        const file = createMockFile({ mimetype: "audio/webm" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept audio/ogg", () => {
        const file = createMockFile({ mimetype: "audio/ogg" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept audio/flac", () => {
        const file = createMockFile({ mimetype: "audio/flac" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept audio/x-m4a", () => {
        const file = createMockFile({ mimetype: "audio/x-m4a" });
        expect(pipe.transform(file)).toBe(file);
      });

      it("should reject unsupported MIME types with descriptive error", () => {
        const file = createMockFile({ mimetype: "video/mp4" });
        expect(() => pipe.transform(file)).toThrow(BadRequestException);
        expect(() => pipe.transform(file)).toThrow(/Unsupported audio format.*video\/mp4/);
      });

      it("should reject application/octet-stream", () => {
        const file = createMockFile({ mimetype: "application/octet-stream" });
        expect(() => pipe.transform(file)).toThrow(BadRequestException);
      });

      it("should reject text/plain", () => {
        const file = createMockFile({ mimetype: "text/plain" });
        expect(() => pipe.transform(file)).toThrow(BadRequestException);
      });

      it("should reject image/png", () => {
        const file = createMockFile({ mimetype: "image/png" });
        expect(() => pipe.transform(file)).toThrow(BadRequestException);
      });

      it("should include supported formats in error message", () => {
        const file = createMockFile({ mimetype: "video/mp4" });
        try {
          pipe.transform(file);
          expect.fail("Expected BadRequestException");
        } catch (error) {
          expect(error).toBeInstanceOf(BadRequestException);
          const response = (error as BadRequestException).getResponse();
          const message =
            typeof response === "string" ? response : (response as Record<string, unknown>).message;
          expect(message).toContain("audio/wav");
          expect(message).toContain("audio/mpeg");
        }
      });
    });

    // ==========================================
    // File size validation
    // ==========================================
    describe("file size validation", () => {
      it("should accept files under the size limit", () => {
        const file = createMockFile({ size: 1024 * 1024 }); // 1MB
        expect(pipe.transform(file)).toBe(file);
      });

      it("should accept files exactly at the size limit", () => {
        const file = createMockFile({ size: 25_000_000 }); // 25MB (default)
        expect(pipe.transform(file)).toBe(file);
      });

      it("should reject files exceeding the size limit", () => {
        const file = createMockFile({ size: 25_000_001 }); // 1 byte over
        expect(() => pipe.transform(file)).toThrow(BadRequestException);
        expect(() => pipe.transform(file)).toThrow(/exceeds maximum/);
      });

      it("should include human-readable sizes in error message", () => {
        const file = createMockFile({ size: 30_000_000 }); // 30MB
        try {
          pipe.transform(file);
          expect.fail("Expected BadRequestException");
        } catch (error) {
          expect(error).toBeInstanceOf(BadRequestException);
          const response = (error as BadRequestException).getResponse();
          const message =
            typeof response === "string" ? response : (response as Record<string, unknown>).message;
          // Should show something like "28.6 MB" and "23.8 MB"
          expect(message).toContain("MB");
        }
      });

      it("should accept zero-size files (MIME check still applies)", () => {
        const file = createMockFile({ size: 0 });
        expect(pipe.transform(file)).toBe(file);
      });
    });

    // ==========================================
    // Edge cases
    // ==========================================
    describe("edge cases", () => {
      it("should throw if no file is provided (null)", () => {
        expect(() => pipe.transform(null as unknown as Express.Multer.File)).toThrow(
          BadRequestException
        );
        expect(() => pipe.transform(null as unknown as Express.Multer.File)).toThrow(
          /No audio file provided/
        );
      });

      it("should throw if no file is provided (undefined)", () => {
        expect(() => pipe.transform(undefined as unknown as Express.Multer.File)).toThrow(
          BadRequestException
        );
      });
    });
  });

  // ==========================================
  // Custom config
  // ==========================================
  describe("with custom config", () => {
    it("should use custom max file size", () => {
      const pipe = new AudioValidationPipe({ maxFileSize: 1_000_000 }); // 1MB
      const smallFile = createMockFile({ size: 500_000 });
      expect(pipe.transform(smallFile)).toBe(smallFile);

      const largeFile = createMockFile({ size: 1_000_001 });
      expect(() => pipe.transform(largeFile)).toThrow(BadRequestException);
    });

    it("should allow overriding accepted MIME types", () => {
      const pipe = new AudioValidationPipe({
        allowedMimeTypes: ["audio/wav"],
      });

      const wavFile = createMockFile({ mimetype: "audio/wav" });
      expect(pipe.transform(wavFile)).toBe(wavFile);

      const mp3File = createMockFile({ mimetype: "audio/mpeg" });
      expect(() => pipe.transform(mp3File)).toThrow(BadRequestException);
    });
  });
});
102  apps/api/src/speech/pipes/audio-validation.pipe.ts  (new file)
@@ -0,0 +1,102 @@
/**
 * AudioValidationPipe
 *
 * NestJS PipeTransform that validates uploaded audio files.
 * Checks MIME type against an allow-list and file size against a configurable maximum.
 *
 * Usage:
 * ```typescript
 * @Post('transcribe')
 * @UseInterceptors(FileInterceptor('file'))
 * async transcribe(
 *   @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
 * ) { ... }
 * ```
 *
 * Issue #398
 */

import { BadRequestException } from "@nestjs/common";
import type { PipeTransform } from "@nestjs/common";

/**
 * Default accepted MIME types for audio uploads.
 */
const DEFAULT_ALLOWED_MIME_TYPES: readonly string[] = [
  "audio/wav",
  "audio/mp3",
  "audio/mpeg",
  "audio/webm",
  "audio/ogg",
  "audio/flac",
  "audio/x-m4a",
] as const;

/**
 * Default maximum upload size in bytes (25 MB).
 */
const DEFAULT_MAX_FILE_SIZE = 25_000_000;

/**
 * Options for customizing AudioValidationPipe behavior.
 */
export interface AudioValidationPipeOptions {
  /** Maximum file size in bytes. Defaults to 25 MB. */
  maxFileSize?: number;

  /** List of accepted MIME types. Defaults to common audio formats. */
  allowedMimeTypes?: string[];
}

/**
 * Format bytes into a human-readable string (e.g., "25.0 MB").
 */
function formatBytes(bytes: number): string {
  if (bytes < 1024) {
    return `${String(bytes)} B`;
  }
  if (bytes < 1024 * 1024) {
    return `${(bytes / 1024).toFixed(1)} KB`;
  }
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}

export class AudioValidationPipe implements PipeTransform<Express.Multer.File | undefined> {
  private readonly maxFileSize: number;
  private readonly allowedMimeTypes: readonly string[];

  constructor(options?: AudioValidationPipeOptions) {
    this.maxFileSize = options?.maxFileSize ?? DEFAULT_MAX_FILE_SIZE;
    this.allowedMimeTypes = options?.allowedMimeTypes ?? DEFAULT_ALLOWED_MIME_TYPES;
  }

  /**
   * Validate the uploaded file's MIME type and size.
   *
   * @param file - The uploaded file from Multer
   * @returns The validated file, unchanged
   * @throws {BadRequestException} If the file is missing, has an unsupported MIME type, or exceeds the size limit
   */
  transform(file: Express.Multer.File | undefined): Express.Multer.File {
    if (!file) {
      throw new BadRequestException("No audio file provided");
    }

    // Validate MIME type
    if (!this.allowedMimeTypes.includes(file.mimetype)) {
      throw new BadRequestException(
        `Unsupported audio format: ${file.mimetype}. ` +
          `Supported formats: ${this.allowedMimeTypes.join(", ")}`
      );
    }

    // Validate file size
    if (file.size > this.maxFileSize) {
      throw new BadRequestException(
        `File size ${formatBytes(file.size)} exceeds maximum allowed size of ${formatBytes(this.maxFileSize)}`
      );
    }

    return file;
  }
}
10  apps/api/src/speech/pipes/index.ts  (new file)
@@ -0,0 +1,10 @@
/**
 * Speech Pipes barrel export
 *
 * Issue #398
 */

export { AudioValidationPipe } from "./audio-validation.pipe";
export type { AudioValidationPipeOptions } from "./audio-validation.pipe";
export { TextValidationPipe } from "./text-validation.pipe";
export type { TextValidationPipeOptions } from "./text-validation.pipe";
136  apps/api/src/speech/pipes/text-validation.pipe.spec.ts  (new file)
@@ -0,0 +1,136 @@
/**
 * TextValidationPipe Tests
 *
 * Issue #398: Validates text input for TTS synthesis.
 * Tests cover text length, empty text, whitespace, and configurable limits.
 */

import { describe, it, expect, beforeEach } from "vitest";
import { BadRequestException } from "@nestjs/common";
import { TextValidationPipe } from "./text-validation.pipe";

describe("TextValidationPipe", () => {
  // ==========================================
  // Default config (4096 max length)
  // ==========================================
  describe("with default config", () => {
    let pipe: TextValidationPipe;

    beforeEach(() => {
      pipe = new TextValidationPipe();
    });

    // ==========================================
    // Valid text
    // ==========================================
    describe("valid text", () => {
      it("should accept normal text", () => {
        const text = "Hello, world!";
        expect(pipe.transform(text)).toBe(text);
      });

      it("should accept text at exactly the max length", () => {
        const text = "a".repeat(4096);
        expect(pipe.transform(text)).toBe(text);
      });

      it("should accept single character text", () => {
        expect(pipe.transform("a")).toBe("a");
      });

      it("should accept text with unicode characters", () => {
        const text = "Hello, world! 你好世界";
        expect(pipe.transform(text)).toBe(text);
      });

      it("should accept multi-line text", () => {
        const text = "Line one.\nLine two.\nLine three.";
        expect(pipe.transform(text)).toBe(text);
      });
    });

    // ==========================================
    // Text length validation
    // ==========================================
    describe("text length validation", () => {
      it("should reject text exceeding max length", () => {
        const text = "a".repeat(4097);
        expect(() => pipe.transform(text)).toThrow(BadRequestException);
        expect(() => pipe.transform(text)).toThrow(/exceeds maximum/);
      });

      it("should include length details in error message", () => {
        const text = "a".repeat(5000);
        try {
          pipe.transform(text);
          expect.fail("Expected BadRequestException");
        } catch (error) {
          expect(error).toBeInstanceOf(BadRequestException);
          const response = (error as BadRequestException).getResponse();
          const message =
            typeof response === "string" ? response : (response as Record<string, unknown>).message;
          expect(message).toContain("5000");
          expect(message).toContain("4096");
        }
      });
    });

    // ==========================================
    // Empty text validation
    // ==========================================
    describe("empty text validation", () => {
      it("should reject empty string", () => {
        expect(() => pipe.transform("")).toThrow(BadRequestException);
        expect(() => pipe.transform("")).toThrow(/Text cannot be empty/);
      });

      it("should reject whitespace-only string", () => {
        expect(() => pipe.transform("   ")).toThrow(BadRequestException);
        expect(() => pipe.transform("   ")).toThrow(/Text cannot be empty/);
      });

      it("should reject tabs and newlines only", () => {
        expect(() => pipe.transform("\t\n\r")).toThrow(BadRequestException);
      });

      it("should reject null", () => {
        expect(() => pipe.transform(null as unknown as string)).toThrow(BadRequestException);
      });

      it("should reject undefined", () => {
        expect(() => pipe.transform(undefined as unknown as string)).toThrow(BadRequestException);
      });
    });

    // ==========================================
    // Text with leading/trailing whitespace
    // ==========================================
    describe("whitespace handling", () => {
      it("should accept text with leading/trailing whitespace (preserves it)", () => {
        const text = "  Hello, world!  ";
        expect(pipe.transform(text)).toBe(text);
      });
    });
  });

  // ==========================================
  // Custom config
  // ==========================================
  describe("with custom config", () => {
    it("should use custom max text length", () => {
      const pipe = new TextValidationPipe({ maxTextLength: 100 });

      const shortText = "Hello";
      expect(pipe.transform(shortText)).toBe(shortText);

      const longText = "a".repeat(101);
      expect(() => pipe.transform(longText)).toThrow(BadRequestException);
    });

    it("should accept text at exact custom limit", () => {
      const pipe = new TextValidationPipe({ maxTextLength: 50 });
      const text = "a".repeat(50);
      expect(pipe.transform(text)).toBe(text);
    });
  });
});
65  apps/api/src/speech/pipes/text-validation.pipe.ts  (new file)
@@ -0,0 +1,65 @@
/**
 * TextValidationPipe
 *
 * NestJS PipeTransform that validates text input for TTS synthesis.
 * Checks that text is non-empty and within the configurable maximum length.
 *
 * Usage:
 * ```typescript
 * @Post('synthesize')
 * async synthesize(
 *   @Body('text', new TextValidationPipe()) text: string,
 * ) { ... }
 * ```
 *
 * Issue #398
 */

import { BadRequestException } from "@nestjs/common";
import type { PipeTransform } from "@nestjs/common";

/**
 * Default maximum text length for TTS input (4096 characters).
 */
const DEFAULT_MAX_TEXT_LENGTH = 4096;

/**
 * Options for customizing TextValidationPipe behavior.
 */
export interface TextValidationPipeOptions {
  /** Maximum text length in characters. Defaults to 4096. */
  maxTextLength?: number;
}

export class TextValidationPipe implements PipeTransform<string | null | undefined> {
  private readonly maxTextLength: number;

  constructor(options?: TextValidationPipeOptions) {
    this.maxTextLength = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;
  }

  /**
   * Validate the text input for TTS synthesis.
   *
   * @param text - The text to validate
   * @returns The validated text, unchanged
   * @throws {BadRequestException} If text is empty, whitespace-only, or exceeds the max length
   */
  transform(text: string | null | undefined): string {
    if (text === null || text === undefined) {
      throw new BadRequestException("Text cannot be empty");
    }

    if (text.trim().length === 0) {
      throw new BadRequestException("Text cannot be empty");
    }

    if (text.length > this.maxTextLength) {
      throw new BadRequestException(
        `Text length ${String(text.length)} exceeds maximum allowed length of ${String(this.maxTextLength)} characters`
      );
    }

    return text;
  }
}
329  apps/api/src/speech/providers/base-tts.provider.spec.ts  (new file)
@@ -0,0 +1,329 @@
/**
 * BaseTTSProvider Unit Tests
 *
 * Tests the abstract base class for OpenAI-compatible TTS providers.
 * Uses a concrete test implementation to exercise the base class logic.
 *
 * Issue #391
 */

import { describe, it, expect, beforeEach, vi, type Mock } from "vitest";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, AudioFormat } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

const mockCreate = vi.fn();

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: mockCreate,
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Concrete test implementation
// ==========================================

class TestTTSProvider extends BaseTTSProvider {
  readonly name = "test-provider";
  readonly tier: SpeechTier = "default";

  constructor(baseURL: string, defaultVoice?: string, defaultFormat?: AudioFormat) {
    super(baseURL, defaultVoice, defaultFormat);
  }
}

// ==========================================
// Test helpers
// ==========================================

/**
 * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return.
 * The OpenAI SDK returns a Response object with an arrayBuffer() method.
 */
function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } {
  return {
    arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer),
  };
}

describe("BaseTTSProvider", () => {
  let provider: TestTTSProvider;

  const testBaseURL = "http://localhost:8880/v1";
  const testVoice = "af_heart";
  const testFormat: AudioFormat = "mp3";

  beforeEach(() => {
    vi.clearAllMocks();
    provider = new TestTTSProvider(testBaseURL, testVoice, testFormat);
  });

  // ==========================================
  // Constructor
  // ==========================================

  describe("constructor", () => {
    it("should create an instance with provided configuration", () => {
      expect(provider).toBeDefined();
      expect(provider.name).toBe("test-provider");
      expect(provider.tier).toBe("default");
    });

    it("should use default voice 'alloy' when none provided", () => {
      const defaultProvider = new TestTTSProvider(testBaseURL);
      expect(defaultProvider).toBeDefined();
    });

    it("should use default format 'mp3' when none provided", () => {
      const defaultProvider = new TestTTSProvider(testBaseURL, "voice-1");
      expect(defaultProvider).toBeDefined();
    });
  });

  // ==========================================
  // synthesize()
  // ==========================================

  describe("synthesize", () => {
    it("should synthesize text and return a SynthesisResult with audio buffer", async () => {
      const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello, world!");

      expect(result).toBeDefined();
      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.audio.length).toBe(audioBytes.length);
      expect(result.format).toBe("mp3");
      expect(result.voice).toBe("af_heart");
      expect(result.tier).toBe("default");
    });

    it("should pass correct parameters to OpenAI SDK", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Test text");

      expect(mockCreate).toHaveBeenCalledWith({
        model: "tts-1",
        input: "Test text",
        voice: "af_heart",
        response_format: "mp3",
        speed: 1.0,
      });
    });

    it("should use custom voice from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: SynthesizeOptions = { voice: "custom_voice" };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "custom_voice" }));
      expect(result.voice).toBe("custom_voice");
    });

    it("should use custom format from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: SynthesizeOptions = { format: "wav" };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "wav" }));
      expect(result.format).toBe("wav");
    });

    it("should use custom speed from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: SynthesizeOptions = { speed: 1.5 };
      await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ speed: 1.5 }));
    });

    it("should throw an error when synthesis fails", async () => {
      mockCreate.mockRejectedValue(new Error("Connection refused"));

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for test-provider: Connection refused"
      );
    });

    it("should throw an error when response arrayBuffer fails", async () => {
      const mockResponse = {
        arrayBuffer: vi.fn().mockRejectedValue(new Error("Read error")),
      };
      mockCreate.mockResolvedValue(mockResponse);

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for test-provider: Read error"
      );
    });

    it("should handle empty text input gracefully", async () => {
      const audioBytes = new Uint8Array([]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("");

      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.audio.length).toBe(0);
    });

    it("should handle non-Error exceptions", async () => {
      mockCreate.mockRejectedValue("string error");

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for test-provider: string error"
      );
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    it("should return default voice list with the configured default voice", async () => {
      const voices = await provider.listVoices();

      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);

      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("af_heart");
      expect(defaultVoice?.tier).toBe("default");
    });

    it("should set tier correctly on all returned voices", async () => {
      const voices = await provider.listVoices();

      for (const voice of voices) {
        expect(voice.tier).toBe("default");
      }
    });
  });

  // ==========================================
  // isHealthy()
  // ==========================================

  describe("isHealthy", () => {
    it("should return true when the TTS server is reachable", async () => {
      // Mock global fetch for health check
      const mockFetch = vi.fn().mockResolvedValue({
        ok: true,
        status: 200,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(true);
      expect(mockFetch).toHaveBeenCalled();

      vi.unstubAllGlobals();
    });

    it("should return false when the TTS server is unreachable", async () => {
      const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED"));
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should return false when the TTS server returns an error status", async () => {
      const mockFetch = vi.fn().mockResolvedValue({
        ok: false,
        status: 503,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should use the base URL for the health check", async () => {
      const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200 });
      vi.stubGlobal("fetch", mockFetch);

      await provider.isHealthy();

      // Should call a health-related endpoint at the base URL
      const calledUrl = mockFetch.mock.calls[0][0] as string;
      expect(calledUrl).toContain("localhost:8880");

      vi.unstubAllGlobals();
    });

    it("should set a timeout for the health check", async () => {
      const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200 });
      vi.stubGlobal("fetch", mockFetch);

      await provider.isHealthy();

      // Should pass an AbortSignal for timeout
      const fetchOptions = mockFetch.mock.calls[0][1] as RequestInit;
      expect(fetchOptions.signal).toBeDefined();

      vi.unstubAllGlobals();
    });
  });

  // ==========================================
  // Default values
  // ==========================================

  describe("default values", () => {
    it("should use 'alloy' as default voice when none specified", async () => {
      const defaultProvider = new TestTTSProvider(testBaseURL);
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await defaultProvider.synthesize("Hello");

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "alloy" }));
    });

    it("should use 'mp3' as default format when none specified", async () => {
      const defaultProvider = new TestTTSProvider(testBaseURL);
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await defaultProvider.synthesize("Hello");

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" }));
    });

    it("should use speed 1.0 as default speed", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Hello");

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ speed: 1.0 }));
    });
  });
});
189
apps/api/src/speech/providers/base-tts.provider.ts
Normal file
@@ -0,0 +1,189 @@
/**
 * Base TTS Provider
 *
 * Abstract base class implementing common OpenAI-compatible TTS logic.
 * All concrete TTS providers (Kokoro, Chatterbox, Piper) extend this class.
 *
 * Uses the OpenAI SDK with a configurable baseURL to communicate with
 * OpenAI-compatible speech synthesis endpoints.
 *
 * Issue #391
 */

import { Logger } from "@nestjs/common";
import OpenAI from "openai";
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type {
  SpeechTier,
  SynthesizeOptions,
  SynthesisResult,
  VoiceInfo,
  AudioFormat,
} from "../interfaces/speech-types";

/** Default TTS model identifier used for OpenAI-compatible APIs */
const DEFAULT_MODEL = "tts-1";

/** Default voice when none is configured */
const DEFAULT_VOICE = "alloy";

/** Default audio format */
const DEFAULT_FORMAT: AudioFormat = "mp3";

/** Default speech speed multiplier */
const DEFAULT_SPEED = 1.0;

/** Health check timeout in milliseconds */
const HEALTH_CHECK_TIMEOUT_MS = 5000;

/**
 * Abstract base class for OpenAI-compatible TTS providers.
 *
 * Provides common logic for:
 * - Synthesizing text to audio via OpenAI SDK's audio.speech.create()
 * - Listing available voices (with a default implementation)
 * - Health checking the TTS endpoint
 *
 * Subclasses must set `name` and `tier` properties and may override
 * `listVoices()` to provide provider-specific voice lists.
 *
 * @example
 * ```typescript
 * class KokoroProvider extends BaseTTSProvider {
 *   readonly name = "kokoro";
 *   readonly tier: SpeechTier = "default";
 *
 *   constructor(baseURL: string) {
 *     super(baseURL, "af_heart", "mp3");
 *   }
 * }
 * ```
 */
export abstract class BaseTTSProvider implements ITTSProvider {
  abstract readonly name: string;
  abstract readonly tier: SpeechTier;

  protected readonly logger: Logger;
  protected readonly client: OpenAI;
  protected readonly baseURL: string;
  protected readonly defaultVoice: string;
  protected readonly defaultFormat: AudioFormat;

  /**
   * Create a new BaseTTSProvider.
   *
   * @param baseURL - The base URL for the OpenAI-compatible TTS endpoint
   * @param defaultVoice - Default voice ID to use when none is specified in options
   * @param defaultFormat - Default audio format to use when none is specified in options
   */
  constructor(
    baseURL: string,
    defaultVoice: string = DEFAULT_VOICE,
    defaultFormat: AudioFormat = DEFAULT_FORMAT
  ) {
    this.baseURL = baseURL;
    this.defaultVoice = defaultVoice;
    this.defaultFormat = defaultFormat;
    this.logger = new Logger(this.constructor.name);

    this.client = new OpenAI({
      baseURL,
      apiKey: "not-needed", // Self-hosted services don't require an API key
    });
  }

  /**
   * Synthesize text to audio using the OpenAI-compatible TTS endpoint.
   *
   * Calls `client.audio.speech.create()` with the provided text and options,
   * then converts the response to a Buffer.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, speed)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails
   */
  async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    try {
      const response = await this.client.audio.speech.create({
        model: DEFAULT_MODEL,
        input: text,
        voice,
        response_format: format,
        speed,
      });

      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);

      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }

  /**
   * List available voices for this provider.
   *
   * Default implementation returns the configured default voice.
   * Subclasses should override this to provide a full voice list
   * from their specific TTS engine.
   *
   * @returns Array of voice information objects
   */
  listVoices(): Promise<VoiceInfo[]> {
    return Promise.resolve([
      {
        id: this.defaultVoice,
        name: this.defaultVoice,
        tier: this.tier,
        isDefault: true,
      },
    ]);
  }

  /**
   * Check if the TTS server is reachable and healthy.
   *
   * Performs a simple HTTP request to the base URL's models endpoint
   * to verify the server is running and responding.
   *
   * @returns true if the server is reachable, false otherwise
   */
  async isHealthy(): Promise<boolean> {
    try {
      // Rewrite the trailing /v1 to /v1/models so the check hits a lightweight endpoint
      const healthUrl = this.baseURL.replace(/\/v1\/?$/, "/v1/models");
      const controller = new AbortController();
      const timeoutId = setTimeout(() => {
        controller.abort();
      }, HEALTH_CHECK_TIMEOUT_MS);

      try {
        const response = await fetch(healthUrl, {
          method: "GET",
          signal: controller.signal,
        });

        return response.ok;
      } finally {
        clearTimeout(timeoutId);
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Health check failed for ${this.name}: ${message}`);
      return false;
    }
  }
}
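A minimal sketch of how a caller could layer providers on top of this base class, using `isHealthy()` to skip unreachable engines before calling `synthesize()`. The fallback ordering and helper name are assumptions for illustration, not the routing logic shipped in this PR.

```typescript
import type { SynthesisResult } from "../interfaces/speech-types";
import { BaseTTSProvider } from "./base-tts.provider";

// Hypothetical fallback helper: tries each provider in order (e.g. premium,
// default, fallback) and uses the first one whose health check passes.
async function synthesizeWithFallback(
  providers: BaseTTSProvider[],
  text: string,
): Promise<SynthesisResult> {
  for (const provider of providers) {
    if (await provider.isHealthy()) {
      return provider.synthesize(text);
    }
  }
  throw new Error("No healthy TTS provider available");
}
```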
436
apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts
Normal file
@@ -0,0 +1,436 @@
/**
 * ChatterboxTTSProvider Unit Tests
 *
 * Tests the premium-tier TTS provider with voice cloning and
 * emotion exaggeration support for Chatterbox.
 *
 * Issue #394
 */

import { describe, it, expect, beforeEach, vi, type Mock } from "vitest";
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
import type { ChatterboxSynthesizeOptions, AudioFormat } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

const mockCreate = vi.fn();

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: mockCreate,
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Test helpers
// ==========================================

/**
 * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return.
 */
function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } {
  return {
    arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer),
  };
}

describe("ChatterboxTTSProvider", () => {
  let provider: ChatterboxTTSProvider;

  const testBaseURL = "http://chatterbox-tts:8881/v1";

  beforeEach(() => {
    vi.clearAllMocks();
    provider = new ChatterboxTTSProvider(testBaseURL);
  });

  // ==========================================
  // Provider identity
  // ==========================================

  describe("provider identity", () => {
    it("should have name 'chatterbox'", () => {
      expect(provider.name).toBe("chatterbox");
    });

    it("should have tier 'premium'", () => {
      expect(provider.tier).toBe("premium");
    });
  });

  // ==========================================
  // Constructor
  // ==========================================

  describe("constructor", () => {
    it("should create an instance with the provided baseURL", () => {
      expect(provider).toBeDefined();
    });

    it("should use 'default' as the default voice", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello");

      expect(result.voice).toBe("default");
    });

    it("should use 'wav' as the default format", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello");

      expect(result.format).toBe("wav");
    });
  });

  // ==========================================
  // synthesize() — basic (no Chatterbox-specific options)
  // ==========================================

  describe("synthesize (basic)", () => {
    it("should synthesize text and return a SynthesisResult", async () => {
      const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello, world!");

      expect(result).toBeDefined();
      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.audio.length).toBe(audioBytes.length);
      expect(result.format).toBe("wav");
      expect(result.voice).toBe("default");
      expect(result.tier).toBe("premium");
    });

    it("should pass correct base parameters to OpenAI SDK when no extra options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Test text");

      expect(mockCreate).toHaveBeenCalledWith({
        model: "tts-1",
        input: "Test text",
        voice: "default",
        response_format: "wav",
        speed: 1.0,
      });
    });

    it("should use custom voice from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = { voice: "cloned_voice_1" };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "cloned_voice_1" }));
      expect(result.voice).toBe("cloned_voice_1");
    });

    it("should use custom format from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = { format: "mp3" as AudioFormat };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" }));
      expect(result.format).toBe("mp3");
    });

    it("should throw on synthesis failure", async () => {
      mockCreate.mockRejectedValue(new Error("GPU out of memory"));

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for chatterbox: GPU out of memory"
      );
    });
  });

  // ==========================================
  // synthesize() — voice cloning (referenceAudio)
  // ==========================================

  describe("synthesize (voice cloning)", () => {
    it("should pass referenceAudio as base64 in extra body params", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const referenceAudio = Buffer.from("fake-audio-data-for-cloning");
      const options: ChatterboxSynthesizeOptions = {
        referenceAudio,
      };

      await provider.synthesize("Clone my voice", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          input: "Clone my voice",
          reference_audio: referenceAudio.toString("base64"),
        })
      );
    });

    it("should not include reference_audio when referenceAudio is not provided", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("No cloning");

      const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>;
      expect(callArgs).not.toHaveProperty("reference_audio");
    });
  });

  // ==========================================
  // synthesize() — emotion exaggeration
  // ==========================================

  describe("synthesize (emotion exaggeration)", () => {
    it("should pass emotionExaggeration as exaggeration in extra body params", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 0.7,
      };

      await provider.synthesize("Very emotional text", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 0.7,
        })
      );
    });

    it("should not include exaggeration when emotionExaggeration is not provided", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Neutral text");

      const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>;
      expect(callArgs).not.toHaveProperty("exaggeration");
    });

    it("should accept emotionExaggeration of 0.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 0.0,
      };

      await provider.synthesize("Minimal emotion", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 0.0,
        })
      );
    });

    it("should accept emotionExaggeration of 1.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 1.0,
      };

      await provider.synthesize("Maximum emotion", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 1.0,
        })
      );
    });

    it("should clamp emotionExaggeration above 1.0 to 1.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: 1.5,
      };

      await provider.synthesize("Over the top", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 1.0,
        })
      );
    });

    it("should clamp emotionExaggeration below 0.0 to 0.0", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: ChatterboxSynthesizeOptions = {
        emotionExaggeration: -0.5,
      };

      await provider.synthesize("Negative emotion", options);

      expect(mockCreate).toHaveBeenCalledWith(
        expect.objectContaining({
          exaggeration: 0.0,
        })
      );
    });
  });

  // ==========================================
  // synthesize() — combined options
  // ==========================================

  describe("synthesize (combined options)", () => {
    it("should handle referenceAudio and emotionExaggeration together", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02, 0x03]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const referenceAudio = Buffer.from("reference-audio-sample");
      const options: ChatterboxSynthesizeOptions = {
        voice: "custom_voice",
        format: "mp3",
        speed: 0.9,
        referenceAudio,
        emotionExaggeration: 0.6,
      };

      const result = await provider.synthesize("Full options test", options);

      expect(mockCreate).toHaveBeenCalledWith({
        model: "tts-1",
        input: "Full options test",
        voice: "custom_voice",
        response_format: "mp3",
        speed: 0.9,
        reference_audio: referenceAudio.toString("base64"),
        exaggeration: 0.6,
      });

      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.voice).toBe("custom_voice");
      expect(result.format).toBe("mp3");
      expect(result.tier).toBe("premium");
    });
  });

  // ==========================================
  // isHealthy() — graceful degradation
  // ==========================================

  describe("isHealthy (graceful degradation)", () => {
    it("should return true when the Chatterbox server is reachable", async () => {
      const mockFetch = vi.fn().mockResolvedValue({
        ok: true,
        status: 200,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(true);

      vi.unstubAllGlobals();
    });

    it("should return false when GPU is unavailable (server unreachable)", async () => {
      const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED"));
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should return false when the server returns 503 (GPU overloaded)", async () => {
      const mockFetch = vi.fn().mockResolvedValue({
        ok: false,
        status: 503,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should return false on timeout (slow GPU response)", async () => {
      const mockFetch = vi
        .fn()
        .mockRejectedValue(new Error("AbortError: The operation was aborted"));
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    it("should return the default voice in the premium tier", async () => {
      const voices = await provider.listVoices();

      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);

      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("default");
      expect(defaultVoice?.tier).toBe("premium");
    });

    it("should set tier to 'premium' on all voices", async () => {
      const voices = await provider.listVoices();

      for (const voice of voices) {
        expect(voice.tier).toBe("premium");
      }
    });
  });

  // ==========================================
  // supportedLanguages
  // ==========================================

  describe("supportedLanguages", () => {
    it("should expose a list of supported languages for cross-language transfer", () => {
      const languages = provider.supportedLanguages;

      expect(languages).toBeInstanceOf(Array);
      expect(languages.length).toBe(23);
      expect(languages).toContain("en");
      expect(languages).toContain("fr");
      expect(languages).toContain("de");
      expect(languages).toContain("es");
      expect(languages).toContain("ja");
      expect(languages).toContain("zh");
    });
  });
});
169
apps/api/src/speech/providers/chatterbox-tts.provider.ts
Normal file
@@ -0,0 +1,169 @@
/**
 * Chatterbox TTS Provider
 *
 * Premium-tier TTS provider with voice cloning and emotion exaggeration support.
 * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body
 * parameters for voice cloning (reference_audio) and emotion control (exaggeration).
 *
 * Key capabilities:
 * - Voice cloning via reference audio sample
 * - Emotion exaggeration control (0.0 - 1.0)
 * - Cross-language voice transfer (23 languages)
 * - Graceful degradation when GPU is unavailable (isHealthy returns false)
 *
 * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true.
 *
 * Issue #394
 */

import type { SpeechCreateParams } from "openai/resources/audio/speech";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types";
import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types";

/** Default voice for Chatterbox */
const CHATTERBOX_DEFAULT_VOICE = "default";

/** Default audio format for Chatterbox (WAV for highest quality) */
const CHATTERBOX_DEFAULT_FORMAT = "wav" as const;

/** Default TTS model identifier */
const DEFAULT_MODEL = "tts-1";

/** Default speech speed multiplier */
const DEFAULT_SPEED = 1.0;

/**
 * Languages supported by Chatterbox for cross-language voice transfer.
 * Chatterbox supports 23 languages for voice cloning and synthesis.
 */
const SUPPORTED_LANGUAGES: readonly string[] = [
  "en", // English
  "fr", // French
  "de", // German
  "es", // Spanish
  "it", // Italian
  "pt", // Portuguese
  "nl", // Dutch
  "pl", // Polish
  "ru", // Russian
  "uk", // Ukrainian
  "ja", // Japanese
  "zh", // Chinese
  "ko", // Korean
  "ar", // Arabic
  "hi", // Hindi
  "tr", // Turkish
  "sv", // Swedish
  "da", // Danish
  "fi", // Finnish
  "no", // Norwegian
  "cs", // Czech
  "el", // Greek
  "ro", // Romanian
] as const;

/**
 * Chatterbox TTS provider (premium tier).
 *
 * Extends BaseTTSProvider with voice cloning and emotion exaggeration support.
 * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional
 * body parameters for its advanced features.
 *
 * @example
 * ```typescript
 * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1");
 *
 * // Basic synthesis
 * const result = await provider.synthesize("Hello!");
 *
 * // Voice cloning with emotion
 * const clonedResult = await provider.synthesize("Hello!", {
 *   referenceAudio: myAudioBuffer,
 *   emotionExaggeration: 0.7,
 * });
 * ```
 */
export class ChatterboxTTSProvider extends BaseTTSProvider {
  readonly name = "chatterbox";
  readonly tier: SpeechTier = "premium";

  /**
   * Languages supported for cross-language voice transfer.
   */
  readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES;

  constructor(baseURL: string) {
    super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT);
  }

  /**
   * Synthesize text to audio with optional voice cloning and emotion control.
   *
   * Overrides the base synthesize() to support Chatterbox-specific options:
   * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64)
   * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped)
   *
   * These are passed as extra body parameters to the OpenAI-compatible endpoint,
   * which Chatterbox's API accepts alongside the standard parameters.
   *
   * @param text - Text to convert to speech
   * @param options - Synthesis options, optionally including Chatterbox-specific params
   * @returns Synthesis result with audio buffer and metadata
   * @throws {Error} If synthesis fails (e.g., GPU unavailable)
   */
  async synthesize(
    text: string,
    options?: SynthesizeOptions | ChatterboxSynthesizeOptions
  ): Promise<SynthesisResult> {
    const voice = options?.voice ?? this.defaultVoice;
    const format = options?.format ?? this.defaultFormat;
    const speed = options?.speed ?? DEFAULT_SPEED;

    // Build the request body with standard OpenAI-compatible params
    const requestBody: Record<string, unknown> = {
      model: DEFAULT_MODEL,
      input: text,
      voice,
      response_format: format,
      speed,
    };

    // Add Chatterbox-specific params if provided
    const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined;

    if (chatterboxOptions?.referenceAudio) {
      requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64");
    }

    if (chatterboxOptions?.emotionExaggeration !== undefined) {
      // Clamp to valid range [0.0, 1.0]
      requestBody.exaggeration = Math.max(
        0.0,
        Math.min(1.0, chatterboxOptions.emotionExaggeration)
      );
    }

    try {
      // Use the OpenAI SDK's create method, passing extra params
      // The OpenAI SDK allows additional body params to be passed through
      const response = await this.client.audio.speech.create(
        requestBody as unknown as SpeechCreateParams
      );

      const arrayBuffer = await response.arrayBuffer();
      const audio = Buffer.from(arrayBuffer);

      return {
        audio,
        format,
        voice,
        tier: this.tier,
      };
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`TTS synthesis failed: ${message}`);
      throw new Error(`TTS synthesis failed for ${this.name}: ${message}`);
    }
  }
}
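A short usage sketch for the provider above, combining voice cloning with the clamped emotion control. The endpoint URL and the sample file path are placeholders, not values from this PR.

```typescript
import { readFile } from "node:fs/promises";
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";

// Hypothetical usage: clone a voice from a local reference sample and push
// the emotion control past the limit to show the provider-side clamping.
async function cloneDemo(): Promise<void> {
  const provider = new ChatterboxTTSProvider("http://chatterbox-tts:8881/v1");
  const referenceAudio = await readFile("./samples/reference.wav");

  const result = await provider.synthesize("Nice to meet you!", {
    referenceAudio,
    emotionExaggeration: 1.2, // clamped to 1.0 before the request is sent
  });

  console.log(`Got ${result.audio.length} bytes of ${result.format} audio`);
}
```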
316
apps/api/src/speech/providers/kokoro-tts.provider.spec.ts
Normal file
@@ -0,0 +1,316 @@
/**
 * KokoroTtsProvider Unit Tests
 *
 * Tests the Kokoro-FastAPI TTS provider with full voice catalog,
 * voice metadata parsing, and Kokoro-specific feature constants.
 *
 * Issue #393
 */

import { describe, it, expect, vi, beforeEach } from "vitest";
import {
  KokoroTtsProvider,
  KOKORO_SUPPORTED_FORMATS,
  KOKORO_SPEED_RANGE,
  KOKORO_VOICES,
  parseVoicePrefix,
} from "./kokoro-tts.provider";
import type { VoiceInfo } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: vi.fn(),
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Provider identity
// ==========================================

describe("KokoroTtsProvider", () => {
  const testBaseURL = "http://kokoro-tts:8880/v1";
  let provider: KokoroTtsProvider;

  beforeEach(() => {
    provider = new KokoroTtsProvider(testBaseURL);
  });

  describe("provider identity", () => {
    it("should have name 'kokoro'", () => {
      expect(provider.name).toBe("kokoro");
    });

    it("should have tier 'default'", () => {
      expect(provider.tier).toBe("default");
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    let voices: VoiceInfo[];

    beforeEach(async () => {
      voices = await provider.listVoices();
    });

    it("should return an array of VoiceInfo objects", () => {
      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);
    });

    it("should return at least 10 voices", () => {
      // The issue specifies at least: af_heart, af_bella, af_nicole, af_sarah, af_sky,
      // am_adam, am_michael, bf_emma, bf_isabella, bm_george, bm_lewis
      expect(voices.length).toBeGreaterThanOrEqual(10);
    });

    it("should set tier to 'default' on all voices", () => {
      for (const voice of voices) {
        expect(voice.tier).toBe("default");
      }
    });

    it("should have exactly one default voice", () => {
      const defaults = voices.filter((v) => v.isDefault === true);
      expect(defaults.length).toBe(1);
    });

    it("should mark af_heart as the default voice", () => {
      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("af_heart");
    });

    it("should have an id and name for every voice", () => {
      for (const voice of voices) {
        expect(voice.id).toBeTruthy();
        expect(voice.name).toBeTruthy();
      }
    });

    it("should set language on every voice", () => {
      for (const voice of voices) {
        expect(voice.language).toBeTruthy();
      }
    });

    // ==========================================
    // Required voices from the issue
    // ==========================================

    describe("required voices", () => {
      const requiredVoiceIds = [
        "af_heart",
        "af_bella",
        "af_nicole",
        "af_sarah",
        "af_sky",
        "am_adam",
        "am_michael",
        "bf_emma",
        "bf_isabella",
        "bm_george",
        "bm_lewis",
      ];

      it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => {
        const voice = voices.find((v) => v.id === voiceId);
        expect(voice).toBeDefined();
      });
    });

    // ==========================================
    // Voice metadata from prefix
    // ==========================================

    describe("voice metadata from prefix", () => {
      it("should set language to 'en-US' for af_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "af_heart");
        expect(voice?.language).toBe("en-US");
      });

      it("should set language to 'en-US' for am_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "am_adam");
        expect(voice?.language).toBe("en-US");
      });

      it("should set language to 'en-GB' for bf_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "bf_emma");
        expect(voice?.language).toBe("en-GB");
      });

      it("should set language to 'en-GB' for bm_ prefix voices", () => {
        const voice = voices.find((v) => v.id === "bm_george");
        expect(voice?.language).toBe("en-GB");
      });

      it("should include gender in voice name for af_ prefix", () => {
        const voice = voices.find((v) => v.id === "af_heart");
        expect(voice?.name).toContain("Female");
      });

      it("should include gender in voice name for am_ prefix", () => {
        const voice = voices.find((v) => v.id === "am_adam");
        expect(voice?.name).toContain("Male");
      });

      it("should include gender in voice name for bf_ prefix", () => {
        const voice = voices.find((v) => v.id === "bf_emma");
        expect(voice?.name).toContain("Female");
      });

      it("should include gender in voice name for bm_ prefix", () => {
        const voice = voices.find((v) => v.id === "bm_george");
        expect(voice?.name).toContain("Male");
      });
    });

    // ==========================================
    // Voice name formatting
    // ==========================================

    describe("voice name formatting", () => {
      it("should capitalize the voice name portion", () => {
        const voice = voices.find((v) => v.id === "af_heart");
        expect(voice?.name).toContain("Heart");
      });

      it("should include the accent/language label in the name", () => {
        const afVoice = voices.find((v) => v.id === "af_heart");
        expect(afVoice?.name).toContain("American");

        const bfVoice = voices.find((v) => v.id === "bf_emma");
        expect(bfVoice?.name).toContain("British");
      });
    });
  });

  // ==========================================
  // Custom constructor
  // ==========================================

  describe("constructor", () => {
    it("should accept custom default voice", () => {
      const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella");
      expect(customProvider).toBeDefined();
    });

    it("should accept custom default format", () => {
      const customProvider = new KokoroTtsProvider(testBaseURL, "af_heart", "wav");
      expect(customProvider).toBeDefined();
    });

    it("should use af_heart as default voice when none specified", () => {
      const defaultProvider = new KokoroTtsProvider(testBaseURL);
      expect(defaultProvider).toBeDefined();
    });
  });
});

// ==========================================
// parseVoicePrefix utility
// ==========================================

describe("parseVoicePrefix", () => {
  it("should parse af_ as American English Female", () => {
    const result = parseVoicePrefix("af_heart");
    expect(result.language).toBe("en-US");
    expect(result.gender).toBe("female");
    expect(result.accent).toBe("American");
  });

  it("should parse am_ as American English Male", () => {
    const result = parseVoicePrefix("am_adam");
    expect(result.language).toBe("en-US");
    expect(result.gender).toBe("male");
    expect(result.accent).toBe("American");
  });

  it("should parse bf_ as British English Female", () => {
    const result = parseVoicePrefix("bf_emma");
    expect(result.language).toBe("en-GB");
    expect(result.gender).toBe("female");
    expect(result.accent).toBe("British");
  });

  it("should parse bm_ as British English Male", () => {
    const result = parseVoicePrefix("bm_george");
    expect(result.language).toBe("en-GB");
    expect(result.gender).toBe("male");
    expect(result.accent).toBe("British");
  });

  it("should return unknown for unrecognized prefix", () => {
    const result = parseVoicePrefix("xx_unknown");
    expect(result.language).toBe("unknown");
    expect(result.gender).toBe("unknown");
    expect(result.accent).toBe("Unknown");
  });
});

// ==========================================
// Exported constants
// ==========================================

describe("KOKORO_SUPPORTED_FORMATS", () => {
  it("should include mp3", () => {
    expect(KOKORO_SUPPORTED_FORMATS).toContain("mp3");
  });

  it("should include wav", () => {
    expect(KOKORO_SUPPORTED_FORMATS).toContain("wav");
  });

  it("should include opus", () => {
    expect(KOKORO_SUPPORTED_FORMATS).toContain("opus");
  });

  it("should include flac", () => {
    expect(KOKORO_SUPPORTED_FORMATS).toContain("flac");
  });

  it("should be a readonly array", () => {
    expect(Array.isArray(KOKORO_SUPPORTED_FORMATS)).toBe(true);
  });
});

describe("KOKORO_SPEED_RANGE", () => {
  it("should have min speed of 0.25", () => {
    expect(KOKORO_SPEED_RANGE.min).toBe(0.25);
  });

  it("should have max speed of 4.0", () => {
    expect(KOKORO_SPEED_RANGE.max).toBe(4.0);
  });
});

describe("KOKORO_VOICES", () => {
  it("should be a non-empty array", () => {
    expect(Array.isArray(KOKORO_VOICES)).toBe(true);
    expect(KOKORO_VOICES.length).toBeGreaterThan(0);
  });

  it("should contain voice entries with id and label", () => {
    for (const voice of KOKORO_VOICES) {
      expect(voice.id).toBeTruthy();
      expect(voice.label).toBeTruthy();
    }
  });

  it("should include voices from multiple language prefixes", () => {
    const prefixes = new Set(KOKORO_VOICES.map((v) => v.id.substring(0, 2)));
    expect(prefixes.size).toBeGreaterThanOrEqual(4);
  });
});
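The exported constants tested above also lend themselves to validating requests before they ever reach the engine. A minimal sketch follows; the guard function itself is hypothetical and not part of this PR.

```typescript
import {
  KOKORO_SPEED_RANGE,
  KOKORO_SUPPORTED_FORMATS,
} from "./kokoro-tts.provider";
import type { AudioFormat } from "../interfaces/speech-types";

// Hypothetical request guard: clamp speed into Kokoro's 0.25x-4.0x range
// and reject formats the engine does not produce.
function normalizeKokoroRequest(
  speed: number,
  format: AudioFormat,
): { speed: number; format: AudioFormat } {
  if (!KOKORO_SUPPORTED_FORMATS.includes(format)) {
    throw new Error(`Unsupported Kokoro format: ${format}`);
  }
  const clamped = Math.min(
    KOKORO_SPEED_RANGE.max,
    Math.max(KOKORO_SPEED_RANGE.min, speed),
  );
  return { speed: clamped, format };
}
```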
278
apps/api/src/speech/providers/kokoro-tts.provider.ts
Normal file
@@ -0,0 +1,278 @@
/**
 * Kokoro-FastAPI TTS Provider
 *
 * Default-tier TTS provider backed by Kokoro-FastAPI.
 * CPU-based, always available, Apache 2.0 license.
 *
 * Features:
 * - 53 built-in voices across 8 languages
 * - Speed control: 0.25x to 4.0x
 * - Output formats: mp3, wav, opus, flac
 * - Voice metadata derived from ID prefix (language, gender, accent)
 *
 * Voice ID format: {prefix}_{name}
 * - First character: language/accent code (a=American, b=British, etc.)
 * - Second character: gender code (f=Female, m=Male)
 *
 * Issue #393
 */

import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";

// ==========================================
// Constants
// ==========================================

/** Audio formats supported by Kokoro-FastAPI */
export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = [
  "mp3",
  "wav",
  "opus",
  "flac",
] as const;

/** Speed range supported by Kokoro-FastAPI */
export const KOKORO_SPEED_RANGE = {
  min: 0.25,
  max: 4.0,
} as const;

/** Default voice for Kokoro */
const KOKORO_DEFAULT_VOICE = "af_heart";

/** Default audio format for Kokoro */
const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3";

// ==========================================
// Voice prefix mapping
// ==========================================

/**
 * Mapping of voice ID prefix (first two characters) to language/accent/gender metadata.
 *
 * Kokoro voice IDs follow the pattern: {lang}{gender}_{name}
 * - lang: a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese
 * - gender: f=Female, m=Male
 */
const VOICE_PREFIX_MAP: Record<string, { language: string; gender: string; accent: string }> = {
  af: { language: "en-US", gender: "female", accent: "American" },
  am: { language: "en-US", gender: "male", accent: "American" },
  bf: { language: "en-GB", gender: "female", accent: "British" },
  bm: { language: "en-GB", gender: "male", accent: "British" },
  ef: { language: "es", gender: "female", accent: "Spanish" },
  em: { language: "es", gender: "male", accent: "Spanish" },
  ff: { language: "fr", gender: "female", accent: "French" },
  fm: { language: "fr", gender: "male", accent: "French" },
  hf: { language: "hi", gender: "female", accent: "Hindi" },
  hm: { language: "hi", gender: "male", accent: "Hindi" },
  jf: { language: "ja", gender: "female", accent: "Japanese" },
  jm: { language: "ja", gender: "male", accent: "Japanese" },
  pf: { language: "pt-BR", gender: "female", accent: "Portuguese" },
  pm: { language: "pt-BR", gender: "male", accent: "Portuguese" },
  zf: { language: "zh", gender: "female", accent: "Chinese" },
  zm: { language: "zh", gender: "male", accent: "Chinese" },
};

// ==========================================
// Voice catalog
// ==========================================

/** Raw voice catalog entry */
interface KokoroVoiceEntry {
  /** Voice ID (e.g. "af_heart") */
  id: string;
  /** Human-readable label (e.g. "Heart") */
  label: string;
}

/**
 * Complete catalog of Kokoro built-in voices.
 *
 * Organized by language/accent prefix:
 * - af_: American English Female
 * - am_: American English Male
 * - bf_: British English Female
 * - bm_: British English Male
 * - ef_: Spanish Female
 * - em_: Spanish Male
 * - ff_: French Female
 * - hf_: Hindi Female
 * - jf_: Japanese Female
 * - jm_: Japanese Male
 * - pf_: Portuguese Female
 * - zf_: Chinese Female
 * - zm_: Chinese Male
 */
export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = [
  // American English Female (af_)
  { id: "af_heart", label: "Heart" },
  { id: "af_alloy", label: "Alloy" },
  { id: "af_aoede", label: "Aoede" },
  { id: "af_bella", label: "Bella" },
  { id: "af_jessica", label: "Jessica" },
  { id: "af_kore", label: "Kore" },
  { id: "af_nicole", label: "Nicole" },
  { id: "af_nova", label: "Nova" },
  { id: "af_river", label: "River" },
  { id: "af_sarah", label: "Sarah" },
  { id: "af_sky", label: "Sky" },
  // American English Male (am_)
  { id: "am_adam", label: "Adam" },
  { id: "am_echo", label: "Echo" },
  { id: "am_eric", label: "Eric" },
  { id: "am_fenrir", label: "Fenrir" },
  { id: "am_liam", label: "Liam" },
  { id: "am_michael", label: "Michael" },
  { id: "am_onyx", label: "Onyx" },
  { id: "am_puck", label: "Puck" },
  { id: "am_santa", label: "Santa" },
  // British English Female (bf_)
  { id: "bf_alice", label: "Alice" },
  { id: "bf_emma", label: "Emma" },
  { id: "bf_isabella", label: "Isabella" },
  { id: "bf_lily", label: "Lily" },
  // British English Male (bm_)
  { id: "bm_daniel", label: "Daniel" },
  { id: "bm_fable", label: "Fable" },
  { id: "bm_george", label: "George" },
  { id: "bm_lewis", label: "Lewis" },
  { id: "bm_oscar", label: "Oscar" },
  // Spanish Female (ef_)
  { id: "ef_dora", label: "Dora" },
  { id: "ef_elena", label: "Elena" },
  { id: "ef_maria", label: "Maria" },
  // Spanish Male (em_)
  { id: "em_alex", label: "Alex" },
  { id: "em_carlos", label: "Carlos" },
  { id: "em_santa", label: "Santa" },
  // French Female (ff_)
  { id: "ff_camille", label: "Camille" },
  { id: "ff_siwis", label: "Siwis" },
  // Hindi Female (hf_)
  { id: "hf_alpha", label: "Alpha" },
  { id: "hf_beta", label: "Beta" },
  // Japanese Female (jf_)
  { id: "jf_alpha", label: "Alpha" },
  { id: "jf_gongitsune", label: "Gongitsune" },
  { id: "jf_nezumi", label: "Nezumi" },
  { id: "jf_tebukuro", label: "Tebukuro" },
  // Japanese Male (jm_)
  { id: "jm_kumo", label: "Kumo" },
  // Portuguese Female (pf_)
  { id: "pf_dora", label: "Dora" },
  // Chinese Female (zf_)
  { id: "zf_xiaobei", label: "Xiaobei" },
  { id: "zf_xiaoni", label: "Xiaoni" },
  { id: "zf_xiaoxiao", label: "Xiaoxiao" },
  { id: "zf_xiaoyi", label: "Xiaoyi" },
  // Chinese Male (zm_)
  { id: "zm_yunjian", label: "Yunjian" },
  { id: "zm_yunxi", label: "Yunxi" },
  { id: "zm_yunxia", label: "Yunxia" },
  { id: "zm_yunyang", label: "Yunyang" },
] as const;

// ==========================================
// Prefix parser
// ==========================================

/** Parsed voice prefix metadata */
export interface VoicePrefixMetadata {
  /** BCP 47 language code (e.g. "en-US", "en-GB", "ja") */
  language: string;
  /** Gender: "female", "male", or "unknown" */
  gender: string;
  /** Human-readable accent label (e.g. "American", "British") */
  accent: string;
}

/**
 * Parse a Kokoro voice ID to extract language, gender, and accent metadata.
 *
 * Voice IDs follow the pattern: {lang}{gender}_{name}
 * The first two characters encode language/accent and gender.
 *
 * @param voiceId - Kokoro voice ID (e.g. "af_heart")
 * @returns Parsed metadata with language, gender, and accent
 */
export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
  const prefix = voiceId.substring(0, 2);
  const mapping = VOICE_PREFIX_MAP[prefix];

  if (mapping) {
    return {
      language: mapping.language,
      gender: mapping.gender,
      accent: mapping.accent,
    };
  }

  return {
    language: "unknown",
    gender: "unknown",
    accent: "Unknown",
  };
}

// ==========================================
// Provider class
// ==========================================

/**
 * Kokoro-FastAPI TTS provider (default tier).
 *
 * CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
 * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
 *
 * @example
 * ```typescript
 * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1");
 * const voices = await kokoro.listVoices();
 * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" });
 * ```
 */
export class KokoroTtsProvider extends BaseTTSProvider {
  readonly name = "kokoro";
  readonly tier: SpeechTier = "default";

  /**
   * Create a new Kokoro TTS provider.
   *
   * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1")
   * @param defaultVoice - Default voice ID (defaults to "af_heart")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = KOKORO_DEFAULT_VOICE,
    defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * List all available Kokoro voices with metadata.
   *
   * Returns the full catalog of 53 built-in voices with language, gender,
|
||||
* and accent information derived from voice ID prefixes.
|
||||
*
|
||||
* @returns Array of VoiceInfo objects for all Kokoro voices
|
||||
*/
|
||||
override listVoices(): Promise<VoiceInfo[]> {
|
||||
const voices: VoiceInfo[] = KOKORO_VOICES.map((entry) => {
|
||||
const metadata = parseVoicePrefix(entry.id);
|
||||
const genderLabel = metadata.gender === "female" ? "Female" : "Male";
|
||||
|
||||
return {
|
||||
id: entry.id,
|
||||
name: `${entry.label} (${metadata.accent} ${genderLabel})`,
|
||||
language: metadata.language,
|
||||
tier: this.tier,
|
||||
isDefault: entry.id === this.defaultVoice,
|
||||
};
|
||||
});
|
||||
|
||||
return Promise.resolve(voices);
|
||||
}
|
||||
}
|
||||
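For quick reference while reviewing, this is how the prefix parser behaves on a catalog ID versus an unrecognized one (the `jf` entry is taken from the prefix map above; `xx_mystery` is just a hypothetical input):

```typescript
import { parseVoicePrefix } from "./kokoro-tts.provider";

// The first two characters select the VOICE_PREFIX_MAP entry.
parseVoicePrefix("jf_alpha");
// => { language: "ja", gender: "female", accent: "Japanese" }

// Unrecognized prefixes fall through to the "unknown" metadata.
parseVoicePrefix("xx_mystery");
// => { language: "unknown", gender: "unknown", accent: "Unknown" }
```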
266
apps/api/src/speech/providers/piper-tts.provider.spec.ts
Normal file
@@ -0,0 +1,266 @@
/**
 * PiperTtsProvider Unit Tests
 *
 * Tests the Piper TTS provider via OpenedAI Speech (fallback tier).
 * Validates provider identity, OpenAI voice name mapping, voice listing,
 * and ultra-lightweight CPU-only design characteristics.
 *
 * Issue #395
 */

import { describe, it, expect, vi, beforeEach } from "vitest";
import {
  PiperTtsProvider,
  PIPER_VOICE_MAP,
  PIPER_SUPPORTED_FORMATS,
  OPENAI_STANDARD_VOICES,
} from "./piper-tts.provider";
import type { VoiceInfo } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: vi.fn(),
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Provider identity
// ==========================================

describe("PiperTtsProvider", () => {
  const testBaseURL = "http://openedai-speech:8000/v1";
  let provider: PiperTtsProvider;

  beforeEach(() => {
    provider = new PiperTtsProvider(testBaseURL);
  });

  describe("provider identity", () => {
    it("should have name 'piper'", () => {
      expect(provider.name).toBe("piper");
    });

    it("should have tier 'fallback'", () => {
      expect(provider.tier).toBe("fallback");
    });
  });

  // ==========================================
  // Constructor
  // ==========================================

  describe("constructor", () => {
    it("should use 'alloy' as default voice", () => {
      const newProvider = new PiperTtsProvider(testBaseURL);
      expect(newProvider).toBeDefined();
    });

    it("should accept a custom default voice", () => {
      const customProvider = new PiperTtsProvider(testBaseURL, "nova");
      expect(customProvider).toBeDefined();
    });

    it("should accept a custom default format", () => {
      const customProvider = new PiperTtsProvider(testBaseURL, "alloy", "wav");
      expect(customProvider).toBeDefined();
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    let voices: VoiceInfo[];

    beforeEach(async () => {
      voices = await provider.listVoices();
    });

    it("should return an array of VoiceInfo objects", () => {
      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);
    });

    it("should return exactly 6 voices (OpenAI standard set)", () => {
      expect(voices.length).toBe(6);
    });

    it("should set tier to 'fallback' on all voices", () => {
      for (const voice of voices) {
        expect(voice.tier).toBe("fallback");
      }
    });

    it("should have exactly one default voice", () => {
      const defaults = voices.filter((v) => v.isDefault === true);
      expect(defaults.length).toBe(1);
    });

    it("should mark 'alloy' as the default voice", () => {
      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("alloy");
    });

    it("should have an id and name for every voice", () => {
      for (const voice of voices) {
        expect(voice.id).toBeTruthy();
        expect(voice.name).toBeTruthy();
      }
    });

    it("should set language on every voice", () => {
      for (const voice of voices) {
        expect(voice.language).toBeTruthy();
      }
    });

    // ==========================================
    // All 6 OpenAI standard voices present
    // ==========================================

    describe("OpenAI standard voices", () => {
      const standardVoiceIds = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];

      it.each(standardVoiceIds)("should include voice '%s'", (voiceId) => {
        const voice = voices.find((v) => v.id === voiceId);
        expect(voice).toBeDefined();
      });
    });

    // ==========================================
    // Voice metadata
    // ==========================================

    describe("voice metadata", () => {
      it("should include gender info in voice names", () => {
        const alloy = voices.find((v) => v.id === "alloy");
        expect(alloy?.name).toMatch(/Female|Male/);
      });

      it("should map alloy to a female voice", () => {
        const alloy = voices.find((v) => v.id === "alloy");
        expect(alloy?.name).toContain("Female");
      });

      it("should map echo to a male voice", () => {
        const echo = voices.find((v) => v.id === "echo");
        expect(echo?.name).toContain("Male");
      });

      it("should map fable to a British voice", () => {
        const fable = voices.find((v) => v.id === "fable");
        expect(fable?.language).toBe("en-GB");
      });

      it("should map onyx to a male voice", () => {
        const onyx = voices.find((v) => v.id === "onyx");
        expect(onyx?.name).toContain("Male");
      });

      it("should map nova to a female voice", () => {
        const nova = voices.find((v) => v.id === "nova");
        expect(nova?.name).toContain("Female");
      });

      it("should map shimmer to a female voice", () => {
        const shimmer = voices.find((v) => v.id === "shimmer");
        expect(shimmer?.name).toContain("Female");
      });
    });
  });
});

// ==========================================
// PIPER_VOICE_MAP
// ==========================================

describe("PIPER_VOICE_MAP", () => {
  it("should contain all 6 OpenAI standard voice names", () => {
    const expectedKeys = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];
    for (const key of expectedKeys) {
      expect(PIPER_VOICE_MAP).toHaveProperty(key);
    }
  });

  it("should map each voice to a Piper voice ID", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.piperVoice).toBeTruthy();
      expect(typeof entry.piperVoice).toBe("string");
    }
  });

  it("should have gender for each voice entry", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.gender).toMatch(/^(female|male)$/);
    }
  });

  it("should have a language for each voice entry", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.language).toBeTruthy();
    }
  });

  it("should have a description for each voice entry", () => {
    for (const entry of Object.values(PIPER_VOICE_MAP)) {
      expect(entry.description).toBeTruthy();
    }
  });
});

// ==========================================
// OPENAI_STANDARD_VOICES
// ==========================================

describe("OPENAI_STANDARD_VOICES", () => {
  it("should be an array of 6 voice IDs", () => {
    expect(Array.isArray(OPENAI_STANDARD_VOICES)).toBe(true);
    expect(OPENAI_STANDARD_VOICES.length).toBe(6);
  });

  it("should contain all standard OpenAI voice names", () => {
    expect(OPENAI_STANDARD_VOICES).toContain("alloy");
    expect(OPENAI_STANDARD_VOICES).toContain("echo");
    expect(OPENAI_STANDARD_VOICES).toContain("fable");
    expect(OPENAI_STANDARD_VOICES).toContain("onyx");
    expect(OPENAI_STANDARD_VOICES).toContain("nova");
    expect(OPENAI_STANDARD_VOICES).toContain("shimmer");
  });
});

// ==========================================
// PIPER_SUPPORTED_FORMATS
// ==========================================

describe("PIPER_SUPPORTED_FORMATS", () => {
  it("should include mp3", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("mp3");
  });

  it("should include wav", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("wav");
  });

  it("should include opus", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("opus");
  });

  it("should include flac", () => {
    expect(PIPER_SUPPORTED_FORMATS).toContain("flac");
  });

  it("should be a readonly array", () => {
    expect(Array.isArray(PIPER_SUPPORTED_FORMATS)).toBe(true);
  });
});
212
apps/api/src/speech/providers/piper-tts.provider.ts
Normal file
@@ -0,0 +1,212 @@
/**
 * Piper TTS Provider via OpenedAI Speech
 *
 * Fallback-tier TTS provider using Piper via OpenedAI Speech for
 * ultra-lightweight CPU-only synthesis. Designed for low-resource
 * environments including Raspberry Pi.
 *
 * Features:
 * - OpenAI-compatible API via OpenedAI Speech server
 * - 100+ Piper voices across 40+ languages
 * - 6 standard OpenAI voice names mapped to Piper voices
 * - Output formats: mp3, wav, opus, flac
 * - CPU-only, no GPU required
 * - GPL license (via OpenedAI Speech)
 *
 * Voice names use the OpenAI standard set (alloy, echo, fable, onyx,
 * nova, shimmer) which OpenedAI Speech maps to configured Piper voices.
 *
 * Issue #395
 */

import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types";

// ==========================================
// Constants
// ==========================================

/** Audio formats supported by OpenedAI Speech with Piper backend */
export const PIPER_SUPPORTED_FORMATS: readonly AudioFormat[] = [
  "mp3",
  "wav",
  "opus",
  "flac",
] as const;

/** Default voice for Piper (via OpenedAI Speech) */
const PIPER_DEFAULT_VOICE = "alloy";

/** Default audio format for Piper */
const PIPER_DEFAULT_FORMAT: AudioFormat = "mp3";

// ==========================================
// OpenAI standard voice names
// ==========================================

/**
 * The 6 standard OpenAI TTS voice names.
 * OpenedAI Speech accepts these names and routes them to configured Piper voices.
 */
export const OPENAI_STANDARD_VOICES: readonly string[] = [
  "alloy",
  "echo",
  "fable",
  "onyx",
  "nova",
  "shimmer",
] as const;

// ==========================================
// Voice mapping
// ==========================================

/** Metadata for a Piper voice mapped from an OpenAI voice name */
export interface PiperVoiceMapping {
  /** The underlying Piper voice ID configured in OpenedAI Speech */
  piperVoice: string;
  /** Human-readable description of the voice character */
  description: string;
  /** Gender of the voice */
  gender: "female" | "male";
  /** BCP 47 language code */
  language: string;
}

/** Fallback mapping used when a voice ID is not found in PIPER_VOICE_MAP */
const DEFAULT_MAPPING: PiperVoiceMapping = {
  piperVoice: "en_US-amy-medium",
  description: "Default voice",
  gender: "female",
  language: "en-US",
};

/**
 * Mapping of OpenAI standard voice names to their default Piper voice
 * configuration in OpenedAI Speech.
 *
 * These are the default mappings that OpenedAI Speech uses when configured
 * with Piper as the TTS backend. The actual Piper voice used can be
 * customized in the OpenedAI Speech configuration file.
 *
 * Default Piper voice assignments:
 * - alloy: en_US-amy-medium (warm, balanced female)
 * - echo: en_US-ryan-medium (clear, articulate male)
 * - fable: en_GB-alan-medium (British male narrator)
 * - onyx: en_US-danny-low (deep, resonant male)
 * - nova: en_US-lessac-medium (expressive female)
 * - shimmer: en_US-kristin-medium (bright, energetic female)
 */
export const PIPER_VOICE_MAP: Record<string, PiperVoiceMapping> = {
  alloy: {
    piperVoice: "en_US-amy-medium",
    description: "Warm, balanced voice",
    gender: "female",
    language: "en-US",
  },
  echo: {
    piperVoice: "en_US-ryan-medium",
    description: "Clear, articulate voice",
    gender: "male",
    language: "en-US",
  },
  fable: {
    piperVoice: "en_GB-alan-medium",
    description: "British narrator voice",
    gender: "male",
    language: "en-GB",
  },
  onyx: {
    piperVoice: "en_US-danny-low",
    description: "Deep, resonant voice",
    gender: "male",
    language: "en-US",
  },
  nova: {
    piperVoice: "en_US-lessac-medium",
    description: "Expressive, versatile voice",
    gender: "female",
    language: "en-US",
  },
  shimmer: {
    piperVoice: "en_US-kristin-medium",
    description: "Bright, energetic voice",
    gender: "female",
    language: "en-US",
  },
};

// ==========================================
// Provider class
// ==========================================

/**
 * Piper TTS provider via OpenedAI Speech (fallback tier).
 *
 * Ultra-lightweight CPU-only text-to-speech engine using Piper voices
 * through the OpenedAI Speech server's OpenAI-compatible API.
 *
 * Designed for:
 * - CPU-only environments (no GPU required)
 * - Low-resource devices (Raspberry Pi, ARM SBCs)
 * - Fallback when primary TTS engines are unavailable
 * - High-volume, low-latency synthesis needs
 *
 * The provider exposes the 6 standard OpenAI voice names (alloy, echo,
 * fable, onyx, nova, shimmer) which OpenedAI Speech maps to configured
 * Piper voices. Additional Piper voices (100+ across 40+ languages)
 * can be accessed by passing the Piper voice ID directly.
 *
 * @example
 * ```typescript
 * const piper = new PiperTtsProvider("http://openedai-speech:8000/v1");
 * const voices = await piper.listVoices();
 * const result = await piper.synthesize("Hello!", { voice: "alloy" });
 * ```
 */
export class PiperTtsProvider extends BaseTTSProvider {
  readonly name = "piper";
  readonly tier: SpeechTier = "fallback";

  /**
   * Create a new Piper TTS provider.
   *
   * @param baseURL - Base URL for the OpenedAI Speech endpoint (e.g. "http://openedai-speech:8000/v1")
   * @param defaultVoice - Default OpenAI voice name (defaults to "alloy")
   * @param defaultFormat - Default audio format (defaults to "mp3")
   */
  constructor(
    baseURL: string,
    defaultVoice: string = PIPER_DEFAULT_VOICE,
    defaultFormat: AudioFormat = PIPER_DEFAULT_FORMAT
  ) {
    super(baseURL, defaultVoice, defaultFormat);
  }

  /**
   * List available voices with OpenAI-to-Piper mapping metadata.
   *
   * Returns the 6 standard OpenAI voice names with information about
   * the underlying Piper voice, gender, and language. These are the
   * voices that can be specified in the `voice` parameter of synthesize().
   *
   * @returns Array of VoiceInfo objects for all mapped Piper voices
   */
  override listVoices(): Promise<VoiceInfo[]> {
    const voices: VoiceInfo[] = OPENAI_STANDARD_VOICES.map((voiceId) => {
      const mapping = PIPER_VOICE_MAP[voiceId] ?? DEFAULT_MAPPING;
      const genderLabel = mapping.gender === "female" ? "Female" : "Male";
      const label = voiceId.charAt(0).toUpperCase() + voiceId.slice(1);

      return {
        id: voiceId,
        name: `${label} (${genderLabel} - ${mapping.description})`,
        language: mapping.language,
        tier: this.tier,
        isDefault: voiceId === this.defaultVoice,
      };
    });

    return Promise.resolve(voices);
  }
}
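To make the name template concrete, here is a small sketch of what `listVoices()` resolves to for the first mapped voice, derived from PIPER_VOICE_MAP and the `${label} (${genderLabel} - ${description})` template in this file (not output captured from a running server):

```typescript
const piper = new PiperTtsProvider("http://openedai-speech:8000/v1");
const voices = await piper.listVoices();

// voices[0], per the alloy entry in PIPER_VOICE_MAP:
// {
//   id: "alloy",
//   name: "Alloy (Female - Warm, balanced voice)",
//   language: "en-US",
//   tier: "fallback",
//   isDefault: true,  // alloy is PIPER_DEFAULT_VOICE
// }
```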
468
apps/api/src/speech/providers/speaches-stt.provider.spec.ts
Normal file
@@ -0,0 +1,468 @@
/**
 * SpeachesSttProvider Tests
 *
 * TDD tests for the Speaches/faster-whisper STT provider.
 * Tests cover transcription, error handling, health checks, and config injection.
 *
 * Issue #390
 */

import { describe, it, expect, beforeEach, vi } from "vitest";
import { SpeachesSttProvider } from "./speaches-stt.provider";
import type { SpeechConfig } from "../speech.config";
import type { TranscribeOptions } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => {
  const mockCreate = vi.fn();
  const mockModelsList = vi.fn();
  const mockToFile = vi.fn().mockImplementation(async (buffer: Buffer, name: string) => {
    return new File([buffer], name);
  });
  const mockOpenAIConstructorCalls: Array<Record<string, unknown>> = [];
  return { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls };
});

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      transcriptions: {
        create: mockCreate,
      },
    };
    models = {
      list: mockModelsList,
    };
    constructor(config: Record<string, unknown>) {
      mockOpenAIConstructorCalls.push(config);
    }
  }
  return {
    default: MockOpenAI,
    toFile: mockToFile,
  };
});

// ==========================================
// Test helpers
// ==========================================

function createTestConfig(overrides?: Partial<SpeechConfig["stt"]>): SpeechConfig {
  return {
    stt: {
      enabled: true,
      baseUrl: "http://speaches:8000/v1",
      model: "Systran/faster-whisper-large-v3-turbo",
      language: "en",
      ...overrides,
    },
    tts: {
      default: { enabled: false, url: "", voice: "", format: "" },
      premium: { enabled: false, url: "" },
      fallback: { enabled: false, url: "" },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
  };
}

function createMockVerboseResponse(overrides?: Record<string, unknown>): Record<string, unknown> {
  return {
    text: "Hello, world!",
    language: "en",
    duration: 3.5,
    segments: [
      {
        id: 0,
        text: "Hello, world!",
        start: 0.0,
        end: 3.5,
        avg_logprob: -0.25,
        compression_ratio: 1.2,
        no_speech_prob: 0.01,
        seek: 0,
        temperature: 0.0,
        tokens: [1, 2, 3],
      },
    ],
    ...overrides,
  };
}

describe("SpeachesSttProvider", () => {
  let provider: SpeachesSttProvider;
  let config: SpeechConfig;

  beforeEach(() => {
    vi.clearAllMocks();
    mockOpenAIConstructorCalls.length = 0;
    config = createTestConfig();
    provider = new SpeachesSttProvider(config);
  });

  // ==========================================
  // Provider identity
  // ==========================================
  describe("name", () => {
    it("should have the name 'speaches'", () => {
      expect(provider.name).toBe("speaches");
    });
  });

  // ==========================================
  // transcribe
  // ==========================================
  describe("transcribe", () => {
    it("should call OpenAI audio.transcriptions.create with correct parameters", async () => {
      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await provider.transcribe(audio);

      expect(mockCreate).toHaveBeenCalledOnce();
      const callArgs = mockCreate.mock.calls[0][0];
      expect(callArgs.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(callArgs.language).toBe("en");
      expect(callArgs.response_format).toBe("verbose_json");
    });

    it("should convert Buffer to File using toFile", async () => {
      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await provider.transcribe(audio);

      expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", {
        type: "audio/wav",
      });
    });

    it("should return TranscriptionResult with text and language", async () => {
      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.text).toBe("Hello, world!");
      expect(result.language).toBe("en");
    });

    it("should return durationSeconds from verbose response", async () => {
      const mockResponse = createMockVerboseResponse({ duration: 5.25 });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.durationSeconds).toBe(5.25);
    });

    it("should map segments from verbose response", async () => {
      const mockResponse = createMockVerboseResponse({
        segments: [
          {
            id: 0,
            text: "Hello,",
            start: 0.0,
            end: 1.5,
            avg_logprob: -0.2,
            compression_ratio: 1.1,
            no_speech_prob: 0.01,
            seek: 0,
            temperature: 0.0,
            tokens: [1, 2],
          },
          {
            id: 1,
            text: " world!",
            start: 1.5,
            end: 3.5,
            avg_logprob: -0.3,
            compression_ratio: 1.3,
            no_speech_prob: 0.02,
            seek: 0,
            temperature: 0.0,
            tokens: [3, 4],
          },
        ],
      });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.segments).toHaveLength(2);
      expect(result.segments?.[0]).toEqual({
        text: "Hello,",
        start: 0.0,
        end: 1.5,
      });
      expect(result.segments?.[1]).toEqual({
        text: " world!",
        start: 1.5,
        end: 3.5,
      });
    });

    it("should handle response without segments gracefully", async () => {
      const mockResponse = createMockVerboseResponse({ segments: undefined });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.text).toBe("Hello, world!");
      expect(result.segments).toBeUndefined();
    });

    it("should handle response without duration gracefully", async () => {
      const mockResponse = createMockVerboseResponse({ duration: undefined });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      const result = await provider.transcribe(audio);

      expect(result.text).toBe("Hello, world!");
      expect(result.durationSeconds).toBeUndefined();
    });

    // ------------------------------------------
    // Options override
    // ------------------------------------------
    describe("options override", () => {
      it("should use custom model from options when provided", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { model: "custom-whisper-model" };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.model).toBe("custom-whisper-model");
      });

      it("should use custom language from options when provided", async () => {
        const mockResponse = createMockVerboseResponse({ language: "fr" });
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { language: "fr" };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.language).toBe("fr");
      });

      it("should pass through prompt option", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { prompt: "This is a meeting about project planning." };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.prompt).toBe("This is a meeting about project planning.");
      });

      it("should pass through temperature option", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { temperature: 0.3 };
        await provider.transcribe(audio, options);

        const callArgs = mockCreate.mock.calls[0][0];
        expect(callArgs.temperature).toBe(0.3);
      });

      it("should use custom mimeType for file conversion when provided", async () => {
        const mockResponse = createMockVerboseResponse();
        mockCreate.mockResolvedValueOnce(mockResponse);

        const audio = Buffer.from("fake-audio-data");
        const options: TranscribeOptions = { mimeType: "audio/mp3" };
        await provider.transcribe(audio, options);

        expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", {
          type: "audio/mp3",
        });
      });
    });

    // ------------------------------------------
    // Simple response fallback
    // ------------------------------------------
    describe("simple response fallback", () => {
      it("should handle simple Transcription response (text only, no verbose fields)", async () => {
        // Some configurations may return just { text: "..." } without verbose fields
        const simpleResponse = { text: "Simple transcription result." };
        mockCreate.mockResolvedValueOnce(simpleResponse);

        const audio = Buffer.from("fake-audio-data");
        const result = await provider.transcribe(audio);

        expect(result.text).toBe("Simple transcription result.");
        expect(result.language).toBe("en"); // Falls back to config language
        expect(result.durationSeconds).toBeUndefined();
        expect(result.segments).toBeUndefined();
      });
    });
  });

  // ==========================================
  // Error handling
  // ==========================================
  describe("error handling", () => {
    it("should throw a descriptive error on connection refused", async () => {
      const connectionError = new Error("connect ECONNREFUSED 127.0.0.1:8000");
      mockCreate.mockRejectedValueOnce(connectionError);

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000"
      );
    });

    it("should throw a descriptive error on timeout", async () => {
      const timeoutError = new Error("Request timed out");
      mockCreate.mockRejectedValueOnce(timeoutError);

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: Request timed out"
      );
    });

    it("should throw a descriptive error on API error", async () => {
      const apiError = new Error("Invalid model: nonexistent-model");
      mockCreate.mockRejectedValueOnce(apiError);

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: Invalid model: nonexistent-model"
      );
    });

    it("should handle non-Error thrown values", async () => {
      mockCreate.mockRejectedValueOnce("unexpected string error");

      const audio = Buffer.from("fake-audio-data");
      await expect(provider.transcribe(audio)).rejects.toThrow(
        "STT transcription failed: unexpected string error"
      );
    });
  });

  // ==========================================
  // isHealthy
  // ==========================================
  describe("isHealthy", () => {
    it("should return true when the server is reachable", async () => {
      mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] });

      const healthy = await provider.isHealthy();
      expect(healthy).toBe(true);
    });

    it("should return false when the server is unreachable", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED"));

      const healthy = await provider.isHealthy();
      expect(healthy).toBe(false);
    });

    it("should not throw on health check failure", async () => {
      mockModelsList.mockRejectedValueOnce(new Error("Network error"));

      await expect(provider.isHealthy()).resolves.toBe(false);
    });

    it("should return false on unexpected error types", async () => {
      mockModelsList.mockRejectedValueOnce("string error");

      const healthy = await provider.isHealthy();
      expect(healthy).toBe(false);
    });
  });

  // ==========================================
  // Config injection
  // ==========================================
  describe("config injection", () => {
    it("should create OpenAI client with baseURL from config", () => {
      // The constructor was called in beforeEach
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      expect(mockOpenAIConstructorCalls[0]).toEqual(
        expect.objectContaining({
          baseURL: "http://speaches:8000/v1",
        })
      );
    });

    it("should use custom baseURL from config", () => {
      mockOpenAIConstructorCalls.length = 0;
      const customConfig = createTestConfig({
        baseUrl: "http://custom-speaches:9000/v1",
      });
      new SpeachesSttProvider(customConfig);

      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      expect(mockOpenAIConstructorCalls[0]).toEqual(
        expect.objectContaining({
          baseURL: "http://custom-speaches:9000/v1",
        })
      );
    });

    it("should use default model from config for transcription", async () => {
      const customConfig = createTestConfig({
        model: "Systran/faster-whisper-small",
      });
      const customProvider = new SpeachesSttProvider(customConfig);

      const mockResponse = createMockVerboseResponse();
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await customProvider.transcribe(audio);

      const callArgs = mockCreate.mock.calls[0][0];
      expect(callArgs.model).toBe("Systran/faster-whisper-small");
    });

    it("should use default language from config for transcription", async () => {
      const customConfig = createTestConfig({ language: "de" });
      const customProvider = new SpeachesSttProvider(customConfig);

      const mockResponse = createMockVerboseResponse({ language: "de" });
      mockCreate.mockResolvedValueOnce(mockResponse);

      const audio = Buffer.from("fake-audio-data");
      await customProvider.transcribe(audio);

      const callArgs = mockCreate.mock.calls[0][0];
      expect(callArgs.language).toBe("de");
    });

    it("should set a dummy API key for local Speaches server", () => {
      expect(mockOpenAIConstructorCalls).toHaveLength(1);
      expect(mockOpenAIConstructorCalls[0]).toEqual(
        expect.objectContaining({
          apiKey: "not-needed",
        })
      );
    });
  });
});
180
apps/api/src/speech/providers/speaches-stt.provider.ts
Normal file
@@ -0,0 +1,180 @@
/**
 * SpeachesSttProvider
 *
 * Speech-to-text provider using Speaches (faster-whisper backend).
 * Connects to the Speaches server via its OpenAI-compatible
 * `/v1/audio/transcriptions` endpoint using the OpenAI SDK.
 *
 * Issue #390
 */

import { Injectable, Inject, Logger } from "@nestjs/common";
import OpenAI, { toFile } from "openai";
import { speechConfig, type SpeechConfig } from "../speech.config";
import type { ISTTProvider } from "../interfaces/stt-provider.interface";
import type {
  TranscribeOptions,
  TranscriptionResult,
  TranscriptionSegment,
} from "../interfaces/speech-types";

/**
 * Derive file extension from a MIME type for use in the uploaded file name.
 */
function extensionFromMimeType(mimeType: string): string {
  const mapping: Record<string, string> = {
    "audio/wav": "wav",
    "audio/wave": "wav",
    "audio/x-wav": "wav",
    "audio/mp3": "mp3",
    "audio/mpeg": "mp3",
    "audio/mp4": "mp4",
    "audio/m4a": "m4a",
    "audio/ogg": "ogg",
    "audio/flac": "flac",
    "audio/webm": "webm",
    "audio/mpga": "mpga",
  };
  return mapping[mimeType] ?? "wav";
}

/**
 * STT provider backed by a Speaches (faster-whisper) server.
 *
 * Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint,
 * so we re-use the official OpenAI SDK with a custom `baseURL`.
 *
 * @example
 * ```typescript
 * const provider = new SpeachesSttProvider(speechConfig);
 * const result = await provider.transcribe(audioBuffer, { language: "en" });
 * console.log(result.text);
 * ```
 */
@Injectable()
export class SpeachesSttProvider implements ISTTProvider {
  readonly name = "speaches";

  private readonly logger = new Logger(SpeachesSttProvider.name);
  private readonly client: OpenAI;
  private readonly config: SpeechConfig;

  constructor(
    @Inject(speechConfig.KEY)
    config: SpeechConfig
  ) {
    this.config = config;

    this.client = new OpenAI({
      baseURL: config.stt.baseUrl,
      apiKey: "not-needed", // Speaches does not require an API key
    });

    this.logger.log(
      `Speaches STT provider initialized (endpoint: ${config.stt.baseUrl}, model: ${config.stt.model})`
    );
  }

  /**
   * Transcribe audio data to text using the Speaches server.
   *
   * Sends the audio buffer to the `/v1/audio/transcriptions` endpoint
   * with `response_format=verbose_json` to get segments and duration data.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters (model, language, prompt, temperature)
   * @returns Transcription result with text, language, duration, and optional segments
   * @throws {Error} If transcription fails (connection error, API error, etc.)
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    const model = options?.model ?? this.config.stt.model;
    const language = options?.language ?? this.config.stt.language;
    const mimeType = options?.mimeType ?? "audio/wav";
    const extension = extensionFromMimeType(mimeType);

    try {
      const file = await toFile(audio, `audio.${extension}`, {
        type: mimeType,
      });

      const response = await this.client.audio.transcriptions.create({
        file,
        model,
        language,
        response_format: "verbose_json",
        ...(options?.prompt !== undefined ? { prompt: options.prompt } : {}),
        ...(options?.temperature !== undefined ? { temperature: options.temperature } : {}),
      });

      return this.mapResponse(response, language);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`Transcription failed: ${message}`);
      throw new Error(`STT transcription failed: ${message}`);
    }
  }

  /**
   * Check if the Speaches server is healthy and reachable.
   *
   * Attempts to list models from the server. Returns true if the request
   * succeeds, false otherwise.
   *
   * @returns true if the Speaches server is reachable and ready
   */
  async isHealthy(): Promise<boolean> {
    try {
      await this.client.models.list();
      return true;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.warn(`Speaches health check failed: ${message}`);
      return false;
    }
  }

  /**
   * Map the OpenAI SDK transcription response to our TranscriptionResult type.
   *
   * Handles both verbose responses (with duration, segments) and simple
   * responses (text only).
   */
  private mapResponse(
    response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record<string, unknown>,
    fallbackLanguage: string
  ): TranscriptionResult {
    const text = (response as { text: string }).text;
    const verboseResponse = response as {
      text: string;
      language?: string;
      duration?: number;
      segments?: {
        text: string;
        start: number;
        end: number;
      }[];
    };

    const result: TranscriptionResult = {
      text,
      language: verboseResponse.language ?? fallbackLanguage,
    };

    if (verboseResponse.duration !== undefined) {
      result.durationSeconds = verboseResponse.duration;
    }

    if (verboseResponse.segments !== undefined && Array.isArray(verboseResponse.segments)) {
      result.segments = verboseResponse.segments.map(
        (segment): TranscriptionSegment => ({
          text: segment.text,
          start: segment.start,
          end: segment.end,
        })
      );
    }

    return result;
  }
}
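To visualize the verbose_json mapping, here is the result shape `transcribe()` produces for the mock fixture used in the spec above (a sketch, not output captured from a running server):

```typescript
const result = await provider.transcribe(audioBuffer);
// {
//   text: "Hello, world!",
//   language: "en",
//   durationSeconds: 3.5,
//   segments: [{ text: "Hello, world!", start: 0.0, end: 3.5 }],
// }

// A bare { text } response maps to { text, language: <config language> },
// with durationSeconds and segments left undefined.
```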
279
apps/api/src/speech/providers/tts-provider.factory.spec.ts
Normal file
@@ -0,0 +1,279 @@
/**
 * TTS Provider Factory Unit Tests
 *
 * Tests the factory that creates and registers TTS providers based on config.
 *
 * Issue #391
 */

import { describe, it, expect, vi } from "vitest";
import { createTTSProviders } from "./tts-provider.factory";
import type { SpeechConfig } from "../speech.config";
import type { SpeechTier } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: vi.fn(),
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Test helpers
// ==========================================

function createTestConfig(overrides?: Partial<SpeechConfig>): SpeechConfig {
  return {
    stt: {
      enabled: false,
      baseUrl: "http://speaches:8000/v1",
      model: "whisper",
      language: "en",
    },
    tts: {
      default: {
        enabled: false,
        url: "http://kokoro-tts:8880/v1",
        voice: "af_heart",
        format: "mp3",
      },
      premium: {
        enabled: false,
        url: "http://chatterbox-tts:8881/v1",
      },
      fallback: {
        enabled: false,
        url: "http://openedai-speech:8000/v1",
      },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
    ...overrides,
  };
}

describe("createTTSProviders", () => {
  // ==========================================
  // Empty map when nothing enabled
  // ==========================================

  describe("when no TTS tiers are enabled", () => {
    it("should return an empty map", () => {
      const config = createTestConfig();
      const providers = createTTSProviders(config);

      expect(providers).toBeInstanceOf(Map);
      expect(providers.size).toBe(0);
    });
  });

  // ==========================================
  // Default tier
  // ==========================================

  describe("when default tier is enabled", () => {
    it("should create a provider for the default tier", () => {
      const config = createTestConfig({
        tts: {
          default: {
            enabled: true,
            url: "http://kokoro-tts:8880/v1",
            voice: "af_heart",
            format: "mp3",
          },
          premium: { enabled: false, url: "" },
          fallback: { enabled: false, url: "" },
        },
      });

      const providers = createTTSProviders(config);

      expect(providers.size).toBe(1);
      expect(providers.has("default")).toBe(true);

      const provider = providers.get("default");
      expect(provider).toBeDefined();
      expect(provider?.tier).toBe("default");
      expect(provider?.name).toBe("kokoro");
    });
  });

  // ==========================================
  // Premium tier
  // ==========================================

  describe("when premium tier is enabled", () => {
    it("should create a provider for the premium tier", () => {
      const config = createTestConfig({
        tts: {
          default: { enabled: false, url: "", voice: "", format: "" },
          premium: {
            enabled: true,
            url: "http://chatterbox-tts:8881/v1",
          },
          fallback: { enabled: false, url: "" },
        },
      });

      const providers = createTTSProviders(config);

      expect(providers.size).toBe(1);
      expect(providers.has("premium")).toBe(true);

      const provider = providers.get("premium");
      expect(provider).toBeDefined();
      expect(provider?.tier).toBe("premium");
      expect(provider?.name).toBe("chatterbox");
    });
  });

  // ==========================================
  // Fallback tier
  // ==========================================

  describe("when fallback tier is enabled", () => {
    it("should create a provider for the fallback tier", () => {
      const config = createTestConfig({
        tts: {
          default: { enabled: false, url: "", voice: "", format: "" },
          premium: { enabled: false, url: "" },
          fallback: {
            enabled: true,
            url: "http://openedai-speech:8000/v1",
          },
        },
      });

      const providers = createTTSProviders(config);

      expect(providers.size).toBe(1);
      expect(providers.has("fallback")).toBe(true);

      const provider = providers.get("fallback");
      expect(provider).toBeDefined();
      expect(provider?.tier).toBe("fallback");
      expect(provider?.name).toBe("piper");
    });
  });

  // ==========================================
  // Multiple tiers
  // ==========================================

  describe("when multiple tiers are enabled", () => {
    it("should create providers for all enabled tiers", () => {
      const config = createTestConfig({
        tts: {
          default: {
            enabled: true,
            url: "http://kokoro-tts:8880/v1",
            voice: "af_heart",
            format: "mp3",
          },
          premium: {
            enabled: true,
            url: "http://chatterbox-tts:8881/v1",
          },
          fallback: {
            enabled: true,
            url: "http://openedai-speech:8000/v1",
          },
        },
      });

      const providers = createTTSProviders(config);

      expect(providers.size).toBe(3);
      expect(providers.has("default")).toBe(true);
      expect(providers.has("premium")).toBe(true);
      expect(providers.has("fallback")).toBe(true);
    });

    it("should create providers only for enabled tiers", () => {
      const config = createTestConfig({
        tts: {
          default: {
            enabled: true,
            url: "http://kokoro-tts:8880/v1",
            voice: "af_heart",
            format: "mp3",
          },
          premium: { enabled: false, url: "" },
          fallback: {
            enabled: true,
            url: "http://openedai-speech:8000/v1",
          },
        },
      });

      const providers = createTTSProviders(config);

      expect(providers.size).toBe(2);
      expect(providers.has("default")).toBe(true);
      expect(providers.has("premium")).toBe(false);
      expect(providers.has("fallback")).toBe(true);
    });
  });

  // ==========================================
  // Provider properties
  // ==========================================

  describe("provider properties", () => {
    it("should implement ITTSProvider interface methods", () => {
      const config = createTestConfig({
        tts: {
          default: {
            enabled: true,
            url: "http://kokoro-tts:8880/v1",
            voice: "af_heart",
            format: "mp3",
          },
          premium: { enabled: false, url: "" },
          fallback: { enabled: false, url: "" },
        },
      });

      const providers = createTTSProviders(config);
      const provider = providers.get("default");

      expect(provider).toBeDefined();
      expect(typeof provider?.synthesize).toBe("function");
      expect(typeof provider?.listVoices).toBe("function");
      expect(typeof provider?.isHealthy).toBe("function");
    });

    it("should return providers as a Map<SpeechTier, ITTSProvider>", () => {
      const config = createTestConfig({
        tts: {
          default: {
            enabled: true,
            url: "http://kokoro-tts:8880/v1",
            voice: "af_heart",
            format: "mp3",
          },
          premium: { enabled: false, url: "" },
          fallback: { enabled: false, url: "" },
        },
      });

      const providers = createTTSProviders(config);

      // Verify the map keys are valid SpeechTier values
      for (const [tier] of providers) {
        expect(["default", "premium", "fallback"]).toContain(tier as SpeechTier);
      }
    });
  });
});
75
apps/api/src/speech/providers/tts-provider.factory.ts
Normal file
@@ -0,0 +1,75 @@
/**
 * TTS Provider Factory
 *
 * Creates and registers TTS providers based on speech configuration.
 * Reads enabled flags and URLs from config and instantiates the appropriate
 * provider for each tier.
 *
 * Each tier maps to a specific TTS engine:
 * - default: Kokoro-FastAPI (CPU, always available)
 * - premium: Chatterbox (GPU, voice cloning)
 * - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
 *
 * Issue #391
 */

import { Logger } from "@nestjs/common";
import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
import { KokoroTtsProvider } from "./kokoro-tts.provider";
import { PiperTtsProvider } from "./piper-tts.provider";
import type { ITTSProvider } from "../interfaces/tts-provider.interface";
import type { SpeechTier } from "../interfaces/speech-types";
import type { SpeechConfig } from "../speech.config";

// ==========================================
// Factory function
// ==========================================

const logger = new Logger("TTSProviderFactory");

/**
 * Create and register TTS providers based on the speech configuration.
 *
 * Only creates providers for tiers that are enabled in the config.
 * Returns a Map keyed by SpeechTier for use with the TTS_PROVIDERS injection token.
 *
 * @param config - Speech configuration with TTS tier settings
 * @returns Map of enabled TTS providers keyed by tier
 */
export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSProvider> {
  const providers = new Map<SpeechTier, ITTSProvider>();

  // Default tier: Kokoro
  if (config.tts.default.enabled) {
    const provider = new KokoroTtsProvider(
      config.tts.default.url,
      config.tts.default.voice,
      config.tts.default.format
    );
    providers.set("default", provider);
    logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`);
  }

  // Premium tier: Chatterbox
  if (config.tts.premium.enabled) {
    const provider = new ChatterboxTTSProvider(config.tts.premium.url);
    providers.set("premium", provider);
    logger.log(`Registered premium TTS provider: chatterbox at ${config.tts.premium.url}`);
  }

  // Fallback tier: Piper
  if (config.tts.fallback.enabled) {
    const provider = new PiperTtsProvider(config.tts.fallback.url);
    providers.set("fallback", provider);
    logger.log(`Registered fallback TTS provider: piper at ${config.tts.fallback.url}`);
  }

  if (providers.size === 0) {
    logger.warn("No TTS providers are enabled. TTS synthesis will not be available.");
  } else {
    const tierNames = Array.from(providers.keys()).join(", ");
    logger.log(`TTS providers ready: ${tierNames} (${String(providers.size)} total)`);
  }

  return providers;
}
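A minimal usage sketch of the factory, assuming `getSpeechConfig()` (exported from speech.config and tested below) takes no arguments; the actual NestJS wiring behind the TTS_PROVIDERS injection token may differ:

```typescript
import { getSpeechConfig } from "../speech.config";
import { createTTSProviders } from "./tts-provider.factory";

const providers = createTTSProviders(getSpeechConfig());

// e.g. with TTS_ENABLED=true and TTS_FALLBACK_ENABLED=true:
// Map(2) { "default" => KokoroTtsProvider, "fallback" => PiperTtsProvider }
const tts = providers.get("default") ?? providers.get("fallback");
```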
458
apps/api/src/speech/speech.config.spec.ts
Normal file
@@ -0,0 +1,458 @@
|
||||
/**
 * Speech Configuration Tests
 *
 * Issue #401: Tests for speech services environment variable validation
 * Tests cover STT, TTS (default, premium, fallback), and speech limits configuration.
 */

import { describe, it, expect, beforeEach, afterEach } from "vitest";
import {
  isSttEnabled,
  isTtsEnabled,
  isTtsPremiumEnabled,
  isTtsFallbackEnabled,
  validateSpeechConfig,
  getSpeechConfig,
  type SpeechConfig,
} from "./speech.config";

describe("speech.config", () => {
  const originalEnv = { ...process.env };

  beforeEach(() => {
    // Clear all speech-related env vars before each test
    delete process.env.STT_ENABLED;
    delete process.env.STT_BASE_URL;
    delete process.env.STT_MODEL;
    delete process.env.STT_LANGUAGE;
    delete process.env.TTS_ENABLED;
    delete process.env.TTS_DEFAULT_URL;
    delete process.env.TTS_DEFAULT_VOICE;
    delete process.env.TTS_DEFAULT_FORMAT;
    delete process.env.TTS_PREMIUM_ENABLED;
    delete process.env.TTS_PREMIUM_URL;
    delete process.env.TTS_FALLBACK_ENABLED;
    delete process.env.TTS_FALLBACK_URL;
    delete process.env.SPEECH_MAX_UPLOAD_SIZE;
    delete process.env.SPEECH_MAX_DURATION_SECONDS;
    delete process.env.SPEECH_MAX_TEXT_LENGTH;
  });

  afterEach(() => {
    process.env = { ...originalEnv };
  });

  // ==========================================
  // STT enabled check
  // ==========================================
  describe("isSttEnabled", () => {
    it("should return false when STT_ENABLED is not set", () => {
      expect(isSttEnabled()).toBe(false);
    });

    it("should return false when STT_ENABLED is 'false'", () => {
      process.env.STT_ENABLED = "false";
      expect(isSttEnabled()).toBe(false);
    });

    it("should return false when STT_ENABLED is '0'", () => {
      process.env.STT_ENABLED = "0";
      expect(isSttEnabled()).toBe(false);
    });

    it("should return false when STT_ENABLED is empty string", () => {
      process.env.STT_ENABLED = "";
      expect(isSttEnabled()).toBe(false);
    });

    it("should return true when STT_ENABLED is 'true'", () => {
      process.env.STT_ENABLED = "true";
      expect(isSttEnabled()).toBe(true);
    });

    it("should return true when STT_ENABLED is '1'", () => {
      process.env.STT_ENABLED = "1";
      expect(isSttEnabled()).toBe(true);
    });
  });

  // ==========================================
  // TTS enabled check
  // ==========================================
  describe("isTtsEnabled", () => {
    it("should return false when TTS_ENABLED is not set", () => {
      expect(isTtsEnabled()).toBe(false);
    });

    it("should return false when TTS_ENABLED is 'false'", () => {
      process.env.TTS_ENABLED = "false";
      expect(isTtsEnabled()).toBe(false);
    });

    it("should return true when TTS_ENABLED is 'true'", () => {
      process.env.TTS_ENABLED = "true";
      expect(isTtsEnabled()).toBe(true);
    });

    it("should return true when TTS_ENABLED is '1'", () => {
      process.env.TTS_ENABLED = "1";
      expect(isTtsEnabled()).toBe(true);
    });
  });

  // ==========================================
  // TTS premium enabled check
  // ==========================================
  describe("isTtsPremiumEnabled", () => {
    it("should return false when TTS_PREMIUM_ENABLED is not set", () => {
      expect(isTtsPremiumEnabled()).toBe(false);
    });

    it("should return false when TTS_PREMIUM_ENABLED is 'false'", () => {
      process.env.TTS_PREMIUM_ENABLED = "false";
      expect(isTtsPremiumEnabled()).toBe(false);
    });

    it("should return true when TTS_PREMIUM_ENABLED is 'true'", () => {
      process.env.TTS_PREMIUM_ENABLED = "true";
      expect(isTtsPremiumEnabled()).toBe(true);
    });
  });

  // ==========================================
  // TTS fallback enabled check
  // ==========================================
  describe("isTtsFallbackEnabled", () => {
    it("should return false when TTS_FALLBACK_ENABLED is not set", () => {
      expect(isTtsFallbackEnabled()).toBe(false);
    });

    it("should return false when TTS_FALLBACK_ENABLED is 'false'", () => {
      process.env.TTS_FALLBACK_ENABLED = "false";
      expect(isTtsFallbackEnabled()).toBe(false);
    });

    it("should return true when TTS_FALLBACK_ENABLED is 'true'", () => {
      process.env.TTS_FALLBACK_ENABLED = "true";
      expect(isTtsFallbackEnabled()).toBe(true);
    });
  });

  // ==========================================
  // validateSpeechConfig
  // ==========================================
  describe("validateSpeechConfig", () => {
    describe("when all services are disabled", () => {
      it("should not throw when no speech services are enabled", () => {
        expect(() => validateSpeechConfig()).not.toThrow();
      });

      it("should not throw when services are explicitly disabled", () => {
        process.env.STT_ENABLED = "false";
        process.env.TTS_ENABLED = "false";
        process.env.TTS_PREMIUM_ENABLED = "false";
        process.env.TTS_FALLBACK_ENABLED = "false";
        expect(() => validateSpeechConfig()).not.toThrow();
      });
    });

    describe("STT validation", () => {
      beforeEach(() => {
        process.env.STT_ENABLED = "true";
      });

      it("should throw when STT is enabled but STT_BASE_URL is missing", () => {
        expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL");
        expect(() => validateSpeechConfig()).toThrow(
          "STT is enabled (STT_ENABLED=true) but required environment variables are missing"
        );
      });

      it("should throw when STT_BASE_URL is empty string", () => {
        process.env.STT_BASE_URL = "";
        expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL");
      });

      it("should throw when STT_BASE_URL is whitespace only", () => {
        process.env.STT_BASE_URL = "   ";
        expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL");
      });

      it("should not throw when STT is enabled and STT_BASE_URL is set", () => {
        process.env.STT_BASE_URL = "http://speaches:8000/v1";
        expect(() => validateSpeechConfig()).not.toThrow();
      });

      it("should suggest disabling STT in error message", () => {
        expect(() => validateSpeechConfig()).toThrow("STT_ENABLED=false");
      });
    });

    describe("TTS default validation", () => {
      beforeEach(() => {
        process.env.TTS_ENABLED = "true";
      });

      it("should throw when TTS is enabled but TTS_DEFAULT_URL is missing", () => {
        expect(() => validateSpeechConfig()).toThrow("TTS_DEFAULT_URL");
        expect(() => validateSpeechConfig()).toThrow(
          "TTS is enabled (TTS_ENABLED=true) but required environment variables are missing"
        );
      });

      it("should throw when TTS_DEFAULT_URL is empty string", () => {
        process.env.TTS_DEFAULT_URL = "";
        expect(() => validateSpeechConfig()).toThrow("TTS_DEFAULT_URL");
      });

      it("should not throw when TTS is enabled and TTS_DEFAULT_URL is set", () => {
        process.env.TTS_DEFAULT_URL = "http://kokoro-tts:8880/v1";
        expect(() => validateSpeechConfig()).not.toThrow();
      });

      it("should suggest disabling TTS in error message", () => {
        expect(() => validateSpeechConfig()).toThrow("TTS_ENABLED=false");
      });
    });

    describe("TTS premium validation", () => {
      beforeEach(() => {
        process.env.TTS_PREMIUM_ENABLED = "true";
      });

      it("should throw when TTS premium is enabled but TTS_PREMIUM_URL is missing", () => {
        expect(() => validateSpeechConfig()).toThrow("TTS_PREMIUM_URL");
        expect(() => validateSpeechConfig()).toThrow(
          "TTS premium is enabled (TTS_PREMIUM_ENABLED=true) but required environment variables are missing"
        );
      });

      it("should throw when TTS_PREMIUM_URL is empty string", () => {
        process.env.TTS_PREMIUM_URL = "";
        expect(() => validateSpeechConfig()).toThrow("TTS_PREMIUM_URL");
      });

      it("should not throw when TTS premium is enabled and TTS_PREMIUM_URL is set", () => {
        process.env.TTS_PREMIUM_URL = "http://chatterbox-tts:8881/v1";
        expect(() => validateSpeechConfig()).not.toThrow();
      });

      it("should suggest disabling TTS premium in error message", () => {
        expect(() => validateSpeechConfig()).toThrow("TTS_PREMIUM_ENABLED=false");
      });
    });

    describe("TTS fallback validation", () => {
      beforeEach(() => {
        process.env.TTS_FALLBACK_ENABLED = "true";
      });

      it("should throw when TTS fallback is enabled but TTS_FALLBACK_URL is missing", () => {
        expect(() => validateSpeechConfig()).toThrow("TTS_FALLBACK_URL");
        expect(() => validateSpeechConfig()).toThrow(
          "TTS fallback is enabled (TTS_FALLBACK_ENABLED=true) but required environment variables are missing"
        );
      });

      it("should throw when TTS_FALLBACK_URL is empty string", () => {
        process.env.TTS_FALLBACK_URL = "";
        expect(() => validateSpeechConfig()).toThrow("TTS_FALLBACK_URL");
      });

      it("should not throw when TTS fallback is enabled and TTS_FALLBACK_URL is set", () => {
        process.env.TTS_FALLBACK_URL = "http://openedai-speech:8000/v1";
        expect(() => validateSpeechConfig()).not.toThrow();
      });

      it("should suggest disabling TTS fallback in error message", () => {
        expect(() => validateSpeechConfig()).toThrow("TTS_FALLBACK_ENABLED=false");
      });
    });

    describe("multiple services enabled simultaneously", () => {
      it("should validate all enabled services", () => {
        process.env.STT_ENABLED = "true";
        process.env.TTS_ENABLED = "true";
        // Missing both STT_BASE_URL and TTS_DEFAULT_URL

        expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL");
      });

      it("should pass when all enabled services are properly configured", () => {
        process.env.STT_ENABLED = "true";
        process.env.STT_BASE_URL = "http://speaches:8000/v1";
        process.env.TTS_ENABLED = "true";
        process.env.TTS_DEFAULT_URL = "http://kokoro-tts:8880/v1";
        process.env.TTS_PREMIUM_ENABLED = "true";
        process.env.TTS_PREMIUM_URL = "http://chatterbox-tts:8881/v1";
        process.env.TTS_FALLBACK_ENABLED = "true";
        process.env.TTS_FALLBACK_URL = "http://openedai-speech:8000/v1";

        expect(() => validateSpeechConfig()).not.toThrow();
      });
    });

    describe("limits validation", () => {
      it("should throw when SPEECH_MAX_UPLOAD_SIZE is not a valid number", () => {
        process.env.SPEECH_MAX_UPLOAD_SIZE = "not-a-number";
        expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_UPLOAD_SIZE");
        expect(() => validateSpeechConfig()).toThrow("must be a positive integer");
      });

      it("should throw when SPEECH_MAX_UPLOAD_SIZE is negative", () => {
        process.env.SPEECH_MAX_UPLOAD_SIZE = "-100";
        expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_UPLOAD_SIZE");
      });

      it("should throw when SPEECH_MAX_UPLOAD_SIZE is zero", () => {
        process.env.SPEECH_MAX_UPLOAD_SIZE = "0";
        expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_UPLOAD_SIZE");
      });

      it("should throw when SPEECH_MAX_DURATION_SECONDS is not a valid number", () => {
        process.env.SPEECH_MAX_DURATION_SECONDS = "abc";
        expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_DURATION_SECONDS");
      });

      it("should throw when SPEECH_MAX_TEXT_LENGTH is not a valid number", () => {
        process.env.SPEECH_MAX_TEXT_LENGTH = "xyz";
        expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_TEXT_LENGTH");
      });

      it("should not throw when limits are valid positive integers", () => {
        process.env.SPEECH_MAX_UPLOAD_SIZE = "50000000";
        process.env.SPEECH_MAX_DURATION_SECONDS = "1200";
        process.env.SPEECH_MAX_TEXT_LENGTH = "8192";
        expect(() => validateSpeechConfig()).not.toThrow();
      });

      it("should not throw when limits are not set (uses defaults)", () => {
        expect(() => validateSpeechConfig()).not.toThrow();
      });
    });
  });

  // ==========================================
  // getSpeechConfig
  // ==========================================
  describe("getSpeechConfig", () => {
    it("should return default values when no env vars are set", () => {
      const config = getSpeechConfig();

      expect(config.stt.enabled).toBe(false);
      expect(config.stt.baseUrl).toBe("http://speaches:8000/v1");
      expect(config.stt.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(config.stt.language).toBe("en");

      expect(config.tts.default.enabled).toBe(false);
      expect(config.tts.default.url).toBe("http://kokoro-tts:8880/v1");
      expect(config.tts.default.voice).toBe("af_heart");
      expect(config.tts.default.format).toBe("mp3");

      expect(config.tts.premium.enabled).toBe(false);
      expect(config.tts.premium.url).toBe("http://chatterbox-tts:8881/v1");

      expect(config.tts.fallback.enabled).toBe(false);
      expect(config.tts.fallback.url).toBe("http://openedai-speech:8000/v1");

      expect(config.limits.maxUploadSize).toBe(25000000);
      expect(config.limits.maxDurationSeconds).toBe(600);
      expect(config.limits.maxTextLength).toBe(4096);
    });

    it("should use custom env var values when set", () => {
      process.env.STT_ENABLED = "true";
      process.env.STT_BASE_URL = "http://custom-stt:9000/v1";
      process.env.STT_MODEL = "custom-model";
      process.env.STT_LANGUAGE = "fr";

      process.env.TTS_ENABLED = "true";
      process.env.TTS_DEFAULT_URL = "http://custom-tts:9001/v1";
      process.env.TTS_DEFAULT_VOICE = "custom_voice";
      process.env.TTS_DEFAULT_FORMAT = "wav";

      process.env.TTS_PREMIUM_ENABLED = "true";
      process.env.TTS_PREMIUM_URL = "http://custom-premium:9002/v1";

      process.env.TTS_FALLBACK_ENABLED = "true";
      process.env.TTS_FALLBACK_URL = "http://custom-fallback:9003/v1";

      process.env.SPEECH_MAX_UPLOAD_SIZE = "50000000";
      process.env.SPEECH_MAX_DURATION_SECONDS = "1200";
      process.env.SPEECH_MAX_TEXT_LENGTH = "8192";

      const config = getSpeechConfig();

      expect(config.stt.enabled).toBe(true);
      expect(config.stt.baseUrl).toBe("http://custom-stt:9000/v1");
      expect(config.stt.model).toBe("custom-model");
      expect(config.stt.language).toBe("fr");

      expect(config.tts.default.enabled).toBe(true);
      expect(config.tts.default.url).toBe("http://custom-tts:9001/v1");
      expect(config.tts.default.voice).toBe("custom_voice");
      expect(config.tts.default.format).toBe("wav");

      expect(config.tts.premium.enabled).toBe(true);
      expect(config.tts.premium.url).toBe("http://custom-premium:9002/v1");

      expect(config.tts.fallback.enabled).toBe(true);
      expect(config.tts.fallback.url).toBe("http://custom-fallback:9003/v1");

      expect(config.limits.maxUploadSize).toBe(50000000);
      expect(config.limits.maxDurationSeconds).toBe(1200);
      expect(config.limits.maxTextLength).toBe(8192);
    });

    it("should return typed SpeechConfig object", () => {
      const config: SpeechConfig = getSpeechConfig();

      // Verify structure matches the SpeechConfig type
      expect(config).toHaveProperty("stt");
      expect(config).toHaveProperty("tts");
      expect(config).toHaveProperty("limits");
      expect(config.tts).toHaveProperty("default");
      expect(config.tts).toHaveProperty("premium");
      expect(config.tts).toHaveProperty("fallback");
    });

    it("should handle partial env var overrides", () => {
      process.env.STT_ENABLED = "true";
      process.env.STT_BASE_URL = "http://custom-stt:9000/v1";
      // STT_MODEL and STT_LANGUAGE not set, should use defaults

      const config = getSpeechConfig();

      expect(config.stt.enabled).toBe(true);
      expect(config.stt.baseUrl).toBe("http://custom-stt:9000/v1");
      expect(config.stt.model).toBe("Systran/faster-whisper-large-v3-turbo");
      expect(config.stt.language).toBe("en");
    });

    it("should parse numeric limits correctly", () => {
      process.env.SPEECH_MAX_UPLOAD_SIZE = "10000000";
      const config = getSpeechConfig();
      expect(typeof config.limits.maxUploadSize).toBe("number");
      expect(config.limits.maxUploadSize).toBe(10000000);
    });
  });

  // ==========================================
  // registerAs integration
  // ==========================================
  describe("speechConfig (registerAs factory)", () => {
    it("should be importable as a config namespace factory", async () => {
      const { speechConfig } = await import("./speech.config");
      expect(speechConfig).toBeDefined();
      expect(speechConfig.KEY).toBe("CONFIGURATION(speech)");
    });

    it("should return config object when called", async () => {
      const { speechConfig } = await import("./speech.config");
      const config = speechConfig() as SpeechConfig;
      expect(config).toHaveProperty("stt");
      expect(config).toHaveProperty("tts");
      expect(config).toHaveProperty("limits");
    });
  });
});
305
apps/api/src/speech/speech.config.ts
Normal file
@@ -0,0 +1,305 @@
/**
 * Speech Services Configuration
 *
 * Issue #401: Environment variables and validation for STT (speech-to-text),
 * TTS (text-to-speech), and speech service limits.
 *
 * Validates conditional requirements at startup:
 * - STT_BASE_URL is required when STT_ENABLED=true
 * - TTS_DEFAULT_URL is required when TTS_ENABLED=true
 * - TTS_PREMIUM_URL is required when TTS_PREMIUM_ENABLED=true
 * - TTS_FALLBACK_URL is required when TTS_FALLBACK_ENABLED=true
 */

import { registerAs } from "@nestjs/config";
import type { AudioFormat } from "./interfaces/speech-types";

// ==========================================
// Default values
// ==========================================

const STT_DEFAULTS = {
  baseUrl: "http://speaches:8000/v1",
  model: "Systran/faster-whisper-large-v3-turbo",
  language: "en",
} as const;

const TTS_DEFAULT_DEFAULTS = {
  url: "http://kokoro-tts:8880/v1",
  voice: "af_heart",
  format: "mp3",
} as const;

const TTS_PREMIUM_DEFAULTS = {
  url: "http://chatterbox-tts:8881/v1",
} as const;

const TTS_FALLBACK_DEFAULTS = {
  url: "http://openedai-speech:8000/v1",
} as const;

const LIMITS_DEFAULTS = {
  maxUploadSize: 25_000_000,
  maxDurationSeconds: 600,
  maxTextLength: 4096,
} as const;

// ==========================================
// Types
// ==========================================

export interface SttConfig {
  enabled: boolean;
  baseUrl: string;
  model: string;
  language: string;
}

export interface TtsDefaultConfig {
  enabled: boolean;
  url: string;
  voice: string;
  format: AudioFormat;
}

export interface TtsPremiumConfig {
  enabled: boolean;
  url: string;
}

export interface TtsFallbackConfig {
  enabled: boolean;
  url: string;
}

export interface TtsConfig {
  default: TtsDefaultConfig;
  premium: TtsPremiumConfig;
  fallback: TtsFallbackConfig;
}

export interface SpeechLimitsConfig {
  maxUploadSize: number;
  maxDurationSeconds: number;
  maxTextLength: number;
}

export interface SpeechConfig {
  stt: SttConfig;
  tts: TtsConfig;
  limits: SpeechLimitsConfig;
}

// ==========================================
// Helper: parse boolean env var
// ==========================================

function parseBooleanEnv(value: string | undefined): boolean {
  return value === "true" || value === "1";
}

// ==========================================
// Enabled checks
// ==========================================

/**
 * Check if speech-to-text (STT) is enabled via environment variable.
 */
export function isSttEnabled(): boolean {
  return parseBooleanEnv(process.env.STT_ENABLED);
}

/**
 * Check if text-to-speech (TTS) default engine is enabled via environment variable.
 */
export function isTtsEnabled(): boolean {
  return parseBooleanEnv(process.env.TTS_ENABLED);
}

/**
 * Check if TTS premium engine (Chatterbox) is enabled via environment variable.
 */
export function isTtsPremiumEnabled(): boolean {
  return parseBooleanEnv(process.env.TTS_PREMIUM_ENABLED);
}

/**
 * Check if TTS fallback engine (Piper/OpenedAI) is enabled via environment variable.
 */
export function isTtsFallbackEnabled(): boolean {
  return parseBooleanEnv(process.env.TTS_FALLBACK_ENABLED);
}

// ==========================================
// Validation helpers
// ==========================================

/**
 * Check if an environment variable has a non-empty value.
 */
function isEnvVarSet(envVar: string): boolean {
  const value = process.env[envVar];
  return value !== undefined && value.trim() !== "";
}

/**
 * Validate that required env vars are set when a service is enabled.
 * Throws with a helpful error message listing missing vars and how to disable.
 */
function validateRequiredVars(
  serviceName: string,
  enabledFlag: string,
  requiredVars: string[]
): void {
  const missingVars: string[] = [];

  for (const envVar of requiredVars) {
    if (!isEnvVarSet(envVar)) {
      missingVars.push(envVar);
    }
  }

  if (missingVars.length > 0) {
    throw new Error(
      `${serviceName} is enabled (${enabledFlag}=true) but required environment variables are missing or empty: ${missingVars.join(", ")}. ` +
        `Either set these variables or disable by setting ${enabledFlag}=false.`
    );
  }
}

/**
 * Validate that a numeric env var, if set, is a positive integer.
 */
function validatePositiveInteger(envVar: string): void {
  const value = process.env[envVar];
  if (value === undefined || value.trim() === "") {
    return; // Not set, will use default
  }

  const parsed = parseInt(value, 10);
  if (isNaN(parsed) || parsed <= 0 || String(parsed) !== value.trim()) {
    throw new Error(`${envVar} must be a positive integer. Current value: "${value}".`);
  }
}

// ==========================================
// Main validation
// ==========================================

/**
 * Validates speech configuration at startup.
 * Call this during module initialization to fail fast if misconfigured.
 *
 * Validates:
 * - STT_BASE_URL is set when STT_ENABLED=true
 * - TTS_DEFAULT_URL is set when TTS_ENABLED=true
 * - TTS_PREMIUM_URL is set when TTS_PREMIUM_ENABLED=true
 * - TTS_FALLBACK_URL is set when TTS_FALLBACK_ENABLED=true
 * - Numeric limits are positive integers (when set)
 *
 * @throws Error if any required configuration is missing or invalid
 */
export function validateSpeechConfig(): void {
  // STT validation
  if (isSttEnabled()) {
    validateRequiredVars("STT", "STT_ENABLED", ["STT_BASE_URL"]);
  }

  // TTS default validation
  if (isTtsEnabled()) {
    validateRequiredVars("TTS", "TTS_ENABLED", ["TTS_DEFAULT_URL"]);
  }

  // TTS premium validation
  if (isTtsPremiumEnabled()) {
    validateRequiredVars("TTS premium", "TTS_PREMIUM_ENABLED", ["TTS_PREMIUM_URL"]);
  }

  // TTS fallback validation
  if (isTtsFallbackEnabled()) {
    validateRequiredVars("TTS fallback", "TTS_FALLBACK_ENABLED", ["TTS_FALLBACK_URL"]);
  }

  // Limits validation (only if set, otherwise defaults are used)
  validatePositiveInteger("SPEECH_MAX_UPLOAD_SIZE");
  validatePositiveInteger("SPEECH_MAX_DURATION_SECONDS");
  validatePositiveInteger("SPEECH_MAX_TEXT_LENGTH");
}
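A minimal sketch of the intended fail-fast call site, assuming a conventional NestJS main.ts (the AppModule import path and port are assumptions, not part of this diff):

// Illustrative only: validate speech env vars before the app starts listening.
import { NestFactory } from "@nestjs/core";
import { AppModule } from "./app.module"; // assumed path
import { validateSpeechConfig } from "./speech/speech.config";

async function bootstrap(): Promise<void> {
  validateSpeechConfig(); // throws a descriptive Error if a required variable is missing
  const app = await NestFactory.create(AppModule);
  await app.listen(3000); // assumed port
}
void bootstrap();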

// ==========================================
// Config getter
// ==========================================

/**
 * Get the full speech configuration object with typed values and defaults.
 *
 * @returns SpeechConfig with all STT, TTS, and limits configuration
 */
export function getSpeechConfig(): SpeechConfig {
  return {
    stt: {
      enabled: isSttEnabled(),
      baseUrl: process.env.STT_BASE_URL ?? STT_DEFAULTS.baseUrl,
      model: process.env.STT_MODEL ?? STT_DEFAULTS.model,
      language: process.env.STT_LANGUAGE ?? STT_DEFAULTS.language,
    },
    tts: {
      default: {
        enabled: isTtsEnabled(),
        url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url,
        voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice,
        format: (process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format) as AudioFormat,
      },
      premium: {
        enabled: isTtsPremiumEnabled(),
        url: process.env.TTS_PREMIUM_URL ?? TTS_PREMIUM_DEFAULTS.url,
      },
      fallback: {
        enabled: isTtsFallbackEnabled(),
        url: process.env.TTS_FALLBACK_URL ?? TTS_FALLBACK_DEFAULTS.url,
      },
    },
    limits: {
      maxUploadSize: parseInt(
        process.env.SPEECH_MAX_UPLOAD_SIZE ?? String(LIMITS_DEFAULTS.maxUploadSize),
        10
      ),
      maxDurationSeconds: parseInt(
        process.env.SPEECH_MAX_DURATION_SECONDS ?? String(LIMITS_DEFAULTS.maxDurationSeconds),
        10
      ),
      maxTextLength: parseInt(
        process.env.SPEECH_MAX_TEXT_LENGTH ?? String(LIMITS_DEFAULTS.maxTextLength),
        10
      ),
    },
  };
}

// ==========================================
// NestJS ConfigModule registerAs factory
// ==========================================

/**
 * NestJS ConfigModule namespace factory for speech configuration.
 *
 * Usage in a module:
 * ```typescript
 * import { speechConfig } from './speech.config';
 *
 * @Module({
 *   imports: [ConfigModule.forFeature(speechConfig)],
 * })
 * export class SpeechModule {}
 * ```
 *
 * Then inject via ConfigService:
 * ```typescript
 * constructor(private config: ConfigService) {
 *   const sttUrl = this.config.get<string>('speech.stt.baseUrl');
 * }
 * ```
 */
export const speechConfig = registerAs("speech", (): SpeechConfig => {
  return getSpeechConfig();
});
19
apps/api/src/speech/speech.constants.ts
Normal file
@@ -0,0 +1,19 @@
/**
 * Speech Module Constants
 *
 * NestJS injection tokens for speech providers.
 *
 * Issue #389
 */

/**
 * Injection token for the STT (speech-to-text) provider.
 * Providers implementing ISTTProvider register under this token.
 */
export const STT_PROVIDER = Symbol("STT_PROVIDER");

/**
 * Injection token for TTS (text-to-speech) providers map.
 * Registered as Map<SpeechTier, ITTSProvider>.
 */
export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS");
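A sketch of how a service might consume these tokens (illustrative only; the ISTTProvider import path and the consumer class are assumptions, not part of this diff):

import { Inject, Injectable } from "@nestjs/common";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import type { ISTTProvider } from "./interfaces/stt-provider.interface"; // assumed path
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
import type { SpeechTier } from "./interfaces/speech-types";

@Injectable()
export class SpeechConsumerSketch {
  constructor(
    @Inject(STT_PROVIDER) private readonly stt: ISTTProvider,
    @Inject(TTS_PROVIDERS) private readonly ttsProviders: Map<SpeechTier, ITTSProvider>
  ) {}
}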
437
apps/api/src/speech/speech.controller.spec.ts
Normal file
@@ -0,0 +1,437 @@
import { describe, it, expect, beforeEach, vi } from "vitest";
import { StreamableFile, ServiceUnavailableException } from "@nestjs/common";
import { SpeechController } from "./speech.controller";
import { SpeechService } from "./speech.service";
import type { TranscribeDto } from "./dto/transcribe.dto";
import type { SynthesizeDto } from "./dto/synthesize.dto";
import type { TranscriptionResult, SynthesisResult, VoiceInfo } from "./interfaces/speech-types";

describe("SpeechController", () => {
  let controller: SpeechController;
  let service: SpeechService;

  const mockSpeechService = {
    transcribe: vi.fn(),
    synthesize: vi.fn(),
    listVoices: vi.fn(),
    isSTTAvailable: vi.fn(),
    isTTSAvailable: vi.fn(),
  };

  const mockWorkspaceId = "550e8400-e29b-41d4-a716-446655440001";
  const mockUserId = "550e8400-e29b-41d4-a716-446655440002";

  const mockUser = {
    id: mockUserId,
    email: "test@example.com",
    name: "Test User",
    workspaceId: mockWorkspaceId,
  };

  const mockFile: Express.Multer.File = {
    buffer: Buffer.from("fake-audio-data"),
    mimetype: "audio/wav",
    size: 1024,
    originalname: "test.wav",
    fieldname: "file",
    encoding: "7bit",
    stream: null as never,
    destination: "",
    filename: "",
    path: "",
  };

  const mockTranscriptionResult: TranscriptionResult = {
    text: "Hello, world!",
    language: "en",
    durationSeconds: 2.5,
    confidence: 0.95,
  };

  beforeEach(() => {
    service = mockSpeechService as unknown as SpeechService;
    controller = new SpeechController(service);

    vi.clearAllMocks();
  });

  it("should be defined", () => {
    expect(controller).toBeDefined();
  });

  describe("transcribe", () => {
    it("should transcribe audio file and return data wrapper", async () => {
      mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);

      const dto: TranscribeDto = {};
      const result = await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser);

      expect(result).toEqual({ data: mockTranscriptionResult });
      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
        mimeType: "audio/wav",
      });
    });

    it("should pass language override from DTO to service", async () => {
      mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);

      const dto: TranscribeDto = { language: "fr" };
      await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser);

      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
        language: "fr",
        mimeType: "audio/wav",
      });
    });

    it("should pass model override from DTO to service", async () => {
      mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);

      const dto: TranscribeDto = { model: "whisper-large-v3" };
      await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser);

      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
        model: "whisper-large-v3",
        mimeType: "audio/wav",
      });
    });

    it("should pass all DTO options to service", async () => {
      mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult);

      const dto: TranscribeDto = {
        language: "de",
        model: "whisper-large-v3",
        prompt: "Meeting notes",
        temperature: 0.5,
      };
      await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser);

      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, {
        language: "de",
        model: "whisper-large-v3",
        prompt: "Meeting notes",
        temperature: 0.5,
        mimeType: "audio/wav",
      });
    });

    it("should propagate service errors", async () => {
      mockSpeechService.transcribe.mockRejectedValue(new Error("STT unavailable"));

      const dto: TranscribeDto = {};
      await expect(controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser)).rejects.toThrow(
        "STT unavailable"
      );
    });
  });

  describe("health", () => {
    it("should return health status with both providers available", async () => {
      mockSpeechService.isSTTAvailable.mockReturnValue(true);
      mockSpeechService.isTTSAvailable.mockReturnValue(true);

      const result = await controller.health(mockWorkspaceId);

      expect(result).toEqual({
        data: {
          stt: { available: true },
          tts: { available: true },
        },
      });
    });

    it("should return health status with STT unavailable", async () => {
      mockSpeechService.isSTTAvailable.mockReturnValue(false);
      mockSpeechService.isTTSAvailable.mockReturnValue(true);

      const result = await controller.health(mockWorkspaceId);

      expect(result).toEqual({
        data: {
          stt: { available: false },
          tts: { available: true },
        },
      });
    });

    it("should return health status with TTS unavailable", async () => {
      mockSpeechService.isSTTAvailable.mockReturnValue(true);
      mockSpeechService.isTTSAvailable.mockReturnValue(false);

      const result = await controller.health(mockWorkspaceId);

      expect(result).toEqual({
        data: {
          stt: { available: true },
          tts: { available: false },
        },
      });
    });

    it("should return health status with both providers unavailable", async () => {
      mockSpeechService.isSTTAvailable.mockReturnValue(false);
      mockSpeechService.isTTSAvailable.mockReturnValue(false);

      const result = await controller.health(mockWorkspaceId);

      expect(result).toEqual({
        data: {
          stt: { available: false },
          tts: { available: false },
        },
      });
    });
  });

  // ==============================================
  // POST /api/speech/synthesize (Issue #396)
  // ==============================================

  describe("synthesize", () => {
    const mockAudioBuffer = Buffer.from("fake-audio-data");

    const mockSynthesisResult: SynthesisResult = {
      audio: mockAudioBuffer,
      format: "mp3",
      voice: "af_heart",
      tier: "default",
      durationSeconds: 2.5,
    };

    it("should synthesize text and return a StreamableFile", async () => {
      const dto: SynthesizeDto = { text: "Hello world" };

      mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Hello world", {});
      expect(result).toBeInstanceOf(StreamableFile);
    });

    it("should pass voice, speed, format, and tier options to the service", async () => {
      const dto: SynthesizeDto = {
        text: "Test with options",
        voice: "af_heart",
        speed: 1.5,
        format: "wav",
        tier: "premium",
      };

      const wavResult: SynthesisResult = {
        audio: mockAudioBuffer,
        format: "wav",
        voice: "af_heart",
        tier: "premium",
      };

      mockSpeechService.synthesize.mockResolvedValue(wavResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Test with options", {
        voice: "af_heart",
        speed: 1.5,
        format: "wav",
        tier: "premium",
      });
      expect(result).toBeInstanceOf(StreamableFile);
    });

    it("should set correct Content-Type for mp3 format", async () => {
      const dto: SynthesizeDto = { text: "Hello", format: "mp3" };

      mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      expect(result).toBeInstanceOf(StreamableFile);
      const headers = result.getHeaders();
      expect(headers.type).toBe("audio/mpeg");
    });

    it("should set correct Content-Type for wav format", async () => {
      const dto: SynthesizeDto = { text: "Hello" };
      const wavResult: SynthesisResult = { ...mockSynthesisResult, format: "wav" };

      mockSpeechService.synthesize.mockResolvedValue(wavResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.type).toBe("audio/wav");
    });

    it("should set correct Content-Type for opus format", async () => {
      const dto: SynthesizeDto = { text: "Hello" };
      const opusResult: SynthesisResult = { ...mockSynthesisResult, format: "opus" };

      mockSpeechService.synthesize.mockResolvedValue(opusResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.type).toBe("audio/opus");
    });

    it("should set correct Content-Type for flac format", async () => {
      const dto: SynthesizeDto = { text: "Hello" };
      const flacResult: SynthesisResult = { ...mockSynthesisResult, format: "flac" };

      mockSpeechService.synthesize.mockResolvedValue(flacResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.type).toBe("audio/flac");
    });

    it("should set correct Content-Type for aac format", async () => {
      const dto: SynthesizeDto = { text: "Hello" };
      const aacResult: SynthesisResult = { ...mockSynthesisResult, format: "aac" };

      mockSpeechService.synthesize.mockResolvedValue(aacResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.type).toBe("audio/aac");
    });

    it("should set correct Content-Type for pcm format", async () => {
      const dto: SynthesizeDto = { text: "Hello" };
      const pcmResult: SynthesisResult = { ...mockSynthesisResult, format: "pcm" };

      mockSpeechService.synthesize.mockResolvedValue(pcmResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.type).toBe("audio/pcm");
    });

    it("should set Content-Disposition header for download with correct extension", async () => {
      const dto: SynthesizeDto = { text: "Hello" };

      mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.disposition).toContain("attachment");
      expect(headers.disposition).toContain("speech.mp3");
    });

    it("should set Content-Disposition with correct file extension for wav", async () => {
      const dto: SynthesizeDto = { text: "Hello" };
      const wavResult: SynthesisResult = { ...mockSynthesisResult, format: "wav" };

      mockSpeechService.synthesize.mockResolvedValue(wavResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.disposition).toContain("speech.wav");
    });

    it("should set Content-Length header based on audio buffer size", async () => {
      const dto: SynthesizeDto = { text: "Hello" };

      mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult);

      const result = await controller.synthesize(dto, mockWorkspaceId, mockUser);

      const headers = result.getHeaders();
      expect(headers.length).toBe(mockAudioBuffer.length);
    });

    it("should propagate ServiceUnavailableException from service", async () => {
      const dto: SynthesizeDto = { text: "Hello" };

      mockSpeechService.synthesize.mockRejectedValue(
        new ServiceUnavailableException("No TTS providers are available")
      );

      await expect(controller.synthesize(dto, mockWorkspaceId, mockUser)).rejects.toThrow(
        ServiceUnavailableException
      );
    });
  });

  // ==============================================
  // GET /api/speech/voices (Issue #396)
  // ==============================================

  describe("getVoices", () => {
    const mockVoices: VoiceInfo[] = [
      {
        id: "af_heart",
        name: "Heart",
        language: "en",
        tier: "default",
        isDefault: true,
      },
      {
        id: "af_sky",
        name: "Sky",
        language: "en",
        tier: "default",
        isDefault: false,
      },
      {
        id: "chatterbox-voice",
        name: "Chatterbox Default",
        language: "en",
        tier: "premium",
        isDefault: true,
      },
    ];

    it("should return all voices when no tier filter is provided", async () => {
      mockSpeechService.listVoices.mockResolvedValue(mockVoices);

      const result = await controller.getVoices(mockWorkspaceId);

      expect(mockSpeechService.listVoices).toHaveBeenCalledWith(undefined);
      expect(result).toEqual({ data: mockVoices });
    });

    it("should filter voices by default tier", async () => {
      const defaultVoices = mockVoices.filter((v) => v.tier === "default");
      mockSpeechService.listVoices.mockResolvedValue(defaultVoices);

      const result = await controller.getVoices(mockWorkspaceId, "default");

      expect(mockSpeechService.listVoices).toHaveBeenCalledWith("default");
      expect(result).toEqual({ data: defaultVoices });
    });

    it("should filter voices by premium tier", async () => {
      const premiumVoices = mockVoices.filter((v) => v.tier === "premium");
      mockSpeechService.listVoices.mockResolvedValue(premiumVoices);

      const result = await controller.getVoices(mockWorkspaceId, "premium");

      expect(mockSpeechService.listVoices).toHaveBeenCalledWith("premium");
      expect(result).toEqual({ data: premiumVoices });
    });

    it("should return empty array when no voices are available", async () => {
      mockSpeechService.listVoices.mockResolvedValue([]);

      const result = await controller.getVoices(mockWorkspaceId);

      expect(result).toEqual({ data: [] });
    });

    it("should return empty array when filtering by tier with no matching voices", async () => {
      mockSpeechService.listVoices.mockResolvedValue([]);

      const result = await controller.getVoices(mockWorkspaceId, "fallback");

      expect(mockSpeechService.listVoices).toHaveBeenCalledWith("fallback");
      expect(result).toEqual({ data: [] });
    });
  });
});
193
apps/api/src/speech/speech.controller.ts
Normal file
@@ -0,0 +1,193 @@
/**
 * SpeechController
 *
 * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services.
 * Handles audio file uploads for transcription, text-to-speech synthesis,
 * voice listing, and provider health status.
 *
 * Endpoints:
 * - POST /api/speech/transcribe - Transcribe uploaded audio file to text
 * - POST /api/speech/synthesize - Synthesize text to audio (TTS)
 * - GET /api/speech/voices - List available TTS voices
 * - GET /api/speech/health - Check STT/TTS provider availability
 *
 * Issue #392, #396
 */

import {
  Controller,
  Post,
  Get,
  Body,
  Query,
  UseGuards,
  UseInterceptors,
  UploadedFile,
  StreamableFile,
} from "@nestjs/common";
import { FileInterceptor } from "@nestjs/platform-express";
import { SpeechService } from "./speech.service";
import { TranscribeDto } from "./dto/transcribe.dto";
import { SynthesizeDto } from "./dto/synthesize.dto";
import { AudioValidationPipe } from "./pipes/audio-validation.pipe";
import { AuthGuard } from "../auth/guards/auth.guard";
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
import { Workspace, Permission, RequirePermission } from "../common/decorators";
import { CurrentUser } from "../auth/decorators/current-user.decorator";
import type { AuthenticatedUser } from "../common/types/user.types";
import type {
  AudioFormat,
  SynthesizeOptions,
  TranscribeOptions,
  TranscriptionResult,
  VoiceInfo,
  SpeechTier,
} from "./interfaces/speech-types";

/**
 * Map audio format to MIME type for Content-Type header.
 */
const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
  mp3: "audio/mpeg",
  wav: "audio/wav",
  opus: "audio/opus",
  flac: "audio/flac",
  aac: "audio/aac",
  pcm: "audio/pcm",
};

/**
 * Health status for a single speech provider category.
 */
interface ProviderHealth {
  available: boolean;
}

/**
 * Combined health status response for all speech providers.
 */
interface SpeechHealthResponse {
  data: {
    stt: ProviderHealth;
    tts: ProviderHealth;
  };
}

@Controller("speech")
@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
export class SpeechController {
  constructor(private readonly speechService: SpeechService) {}

  /**
   * POST /api/speech/transcribe
   *
   * Transcribe an uploaded audio file to text.
   * Accepts multipart form data with an audio file and optional transcription parameters.
   *
   * @param file - Uploaded audio file (validated by AudioValidationPipe)
   * @param dto - Optional transcription parameters (language, model, prompt, temperature)
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @param _user - Authenticated user (validated by AuthGuard)
   * @returns Transcription result wrapped in standard data envelope
   */
  @Post("transcribe")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  @UseInterceptors(FileInterceptor("file"))
  async transcribe(
    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
    @Body() dto: TranscribeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<{ data: TranscriptionResult }> {
    const options: TranscribeOptions = { mimeType: file.mimetype };
    if (dto.language !== undefined) options.language = dto.language;
    if (dto.model !== undefined) options.model = dto.model;
    if (dto.prompt !== undefined) options.prompt = dto.prompt;
    if (dto.temperature !== undefined) options.temperature = dto.temperature;

    const result = await this.speechService.transcribe(file.buffer, options);

    return { data: result };
  }

  /**
   * GET /api/speech/health
   *
   * Check availability of STT and TTS providers.
   *
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @returns Health status of STT and TTS providers
   */
  @Get("health")
  @RequirePermission(Permission.WORKSPACE_ANY)
  health(@Workspace() _workspaceId: string): SpeechHealthResponse {
    return {
      data: {
        stt: { available: this.speechService.isSTTAvailable() },
        tts: { available: this.speechService.isTTSAvailable() },
      },
    };
  }

  /**
   * POST /api/speech/synthesize
   *
   * Synthesize text to audio using TTS providers.
   * Accepts JSON body with text and optional voice/format/speed/tier parameters.
   * Returns audio binary with appropriate Content-Type and Content-Disposition headers.
   *
   * Provider selection follows fallback chain: requested tier -> default -> fallback.
   *
   * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?)
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @param _user - Authenticated user (validated by AuthGuard)
   * @returns StreamableFile containing synthesized audio
   *
   * Issue #396
   */
  @Post("synthesize")
  @RequirePermission(Permission.WORKSPACE_MEMBER)
  async synthesize(
    @Body() dto: SynthesizeDto,
    @Workspace() _workspaceId: string,
    @CurrentUser() _user: AuthenticatedUser
  ): Promise<StreamableFile> {
    const options: SynthesizeOptions = {};
    if (dto.voice !== undefined) options.voice = dto.voice;
    if (dto.speed !== undefined) options.speed = dto.speed;
    if (dto.format !== undefined) options.format = dto.format;
    if (dto.tier !== undefined) options.tier = dto.tier;

    const result = await this.speechService.synthesize(dto.text, options);

    const mimeType = AUDIO_FORMAT_MIME_TYPES[result.format];

    return new StreamableFile(result.audio, {
      type: mimeType,
      disposition: `attachment; filename="speech.${result.format}"`,
      length: result.audio.length,
    });
  }

  /**
   * GET /api/speech/voices
   *
   * List available TTS voices across all tiers.
   * Optionally filter by tier using the `tier` query parameter.
   *
   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
   * @param tier - Optional tier filter (default, premium, fallback)
   * @returns Voice information array wrapped in standard data envelope
   *
   * Issue #396
   */
  @Get("voices")
  @RequirePermission(Permission.WORKSPACE_ANY)
  async getVoices(
    @Workspace() _workspaceId: string,
    @Query("tier") tier?: SpeechTier
  ): Promise<{ data: VoiceInfo[] }> {
    const voices = await this.speechService.listVoices(tier);
    return { data: voices };
  }
}
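For reference, plausible client-side calls against the two POST endpoints above. A sketch only: the auth/session headers the guards expect are omitted, and a same-origin base URL is assumed.

// Illustrative client calls, not part of this diff.
async function synthesizeSpeech(text: string): Promise<Blob> {
  const res = await fetch("/api/speech/synthesize", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ text, format: "mp3" }),
  });
  if (!res.ok) throw new Error(`TTS request failed: ${String(res.status)}`);
  return res.blob(); // audio/mpeg, per AUDIO_FORMAT_MIME_TYPES above
}

async function transcribeAudio(file: File): Promise<unknown> {
  const form = new FormData();
  form.append("file", file); // field name matches FileInterceptor("file")
  const res = await fetch("/api/speech/transcribe", { method: "POST", body: form });
  if (!res.ok) throw new Error(`STT request failed: ${String(res.status)}`);
  return res.json(); // { data: TranscriptionResult }
}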
683
apps/api/src/speech/speech.gateway.spec.ts
Normal file
@@ -0,0 +1,683 @@
/**
 * SpeechGateway Tests
 *
 * Issue #397: WebSocket streaming transcription endpoint tests.
 * Written FIRST following TDD (Red-Green-Refactor).
 *
 * Tests cover:
 * - Authentication via handshake token
 * - Session lifecycle: start -> audio chunks -> stop
 * - Transcription result emission
 * - Session cleanup on disconnect
 * - Error handling
 * - Buffer size limit enforcement
 */
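// Illustrative client-side lifecycle for the flow exercised by these tests (not part
// of this diff). Only the "start-transcription" message and "transcription-started"
// event appear below; the namespace and the chunk/stop event names are assumptions.
//
//   import { io } from "socket.io-client";
//   const socket = io("/speech", { auth: { token: sessionToken } });
//   socket.emit("start-transcription", { language: "en" });
//   socket.on("transcription-started", ({ sessionId }) => {
//     // stream audio chunks for this session, then stop (event names assumed):
//     // socket.emit("audio-chunk", { sessionId, chunk });
//     // socket.emit("stop-transcription", { sessionId });
//   });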

import { describe, it, expect, beforeEach, vi } from "vitest";
import { Socket } from "socket.io";
import { SpeechGateway } from "./speech.gateway";
import { SpeechService } from "./speech.service";
import { AuthService } from "../auth/auth.service";
import { PrismaService } from "../prisma/prisma.service";
import type { SpeechConfig } from "./speech.config";
import type { TranscriptionResult } from "./interfaces/speech-types";

// ==========================================
// Test helpers
// ==========================================

interface AuthenticatedSocket extends Socket {
  data: {
    userId?: string;
    workspaceId?: string;
  };
}

function createMockConfig(): SpeechConfig {
  return {
    stt: {
      enabled: true,
      baseUrl: "http://localhost:8000/v1",
      model: "test-model",
      language: "en",
    },
    tts: {
      default: { enabled: true, url: "http://localhost:8880/v1", voice: "test", format: "mp3" },
      premium: { enabled: false, url: "" },
      fallback: { enabled: false, url: "" },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
  };
}

function createMockSocket(overrides?: Partial<AuthenticatedSocket>): AuthenticatedSocket {
  return {
    id: "test-socket-id",
    join: vi.fn(),
    leave: vi.fn(),
    emit: vi.fn(),
    disconnect: vi.fn(),
    data: {},
    handshake: {
      auth: { token: "valid-token" },
      query: {},
      headers: {},
    },
    ...overrides,
  } as unknown as AuthenticatedSocket;
}

function createMockAuthService(): {
  verifySession: ReturnType<typeof vi.fn>;
} {
  return {
    verifySession: vi.fn().mockResolvedValue({
      user: { id: "user-123" },
      session: { id: "session-123" },
    }),
  };
}

function createMockPrismaService(): {
  workspaceMember: { findFirst: ReturnType<typeof vi.fn> };
} {
  return {
    workspaceMember: {
      findFirst: vi.fn().mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      }),
    },
  };
}

function createMockSpeechService(): {
  transcribe: ReturnType<typeof vi.fn>;
  isSTTAvailable: ReturnType<typeof vi.fn>;
} {
  return {
    transcribe: vi.fn().mockResolvedValue({
      text: "Hello world",
      language: "en",
      durationSeconds: 2.5,
    } satisfies TranscriptionResult),
    isSTTAvailable: vi.fn().mockReturnValue(true),
  };
}

// ==========================================
// Tests
// ==========================================

describe("SpeechGateway", () => {
  let gateway: SpeechGateway;
  let mockAuthService: ReturnType<typeof createMockAuthService>;
  let mockPrismaService: ReturnType<typeof createMockPrismaService>;
  let mockSpeechService: ReturnType<typeof createMockSpeechService>;
  let mockConfig: SpeechConfig;
  let mockClient: AuthenticatedSocket;

  beforeEach(() => {
    mockAuthService = createMockAuthService();
    mockPrismaService = createMockPrismaService();
    mockSpeechService = createMockSpeechService();
    mockConfig = createMockConfig();
    mockClient = createMockSocket();

    gateway = new SpeechGateway(
      mockAuthService as unknown as AuthService,
      mockPrismaService as unknown as PrismaService,
      mockSpeechService as unknown as SpeechService,
      mockConfig
    );

    vi.clearAllMocks();
  });

  // ==========================================
  // Authentication
  // ==========================================
  describe("handleConnection", () => {
    it("should authenticate client and populate socket data on valid token", async () => {
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      });

      await gateway.handleConnection(mockClient);

      expect(mockAuthService.verifySession).toHaveBeenCalledWith("valid-token");
      expect(mockClient.data.userId).toBe("user-123");
      expect(mockClient.data.workspaceId).toBe("workspace-456");
    });

    it("should disconnect client without token", async () => {
      const clientNoToken = createMockSocket({
        handshake: { auth: {}, query: {}, headers: {} },
      } as Partial<AuthenticatedSocket>);

      await gateway.handleConnection(clientNoToken);

      expect(clientNoToken.disconnect).toHaveBeenCalled();
    });

    it("should disconnect client with invalid token", async () => {
      mockAuthService.verifySession.mockResolvedValue(null);

      await gateway.handleConnection(mockClient);

      expect(mockClient.disconnect).toHaveBeenCalled();
    });

    it("should disconnect client without workspace access", async () => {
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue(null);

      await gateway.handleConnection(mockClient);

      expect(mockClient.disconnect).toHaveBeenCalled();
    });

    it("should disconnect client when auth throws", async () => {
      mockAuthService.verifySession.mockRejectedValue(new Error("Auth failure"));

      await gateway.handleConnection(mockClient);

      expect(mockClient.disconnect).toHaveBeenCalled();
    });

    it("should extract token from handshake.query as fallback", async () => {
      const clientQueryToken = createMockSocket({
        handshake: {
          auth: {},
          query: { token: "query-token" },
          headers: {},
        },
      } as Partial<AuthenticatedSocket>);

      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      });

      await gateway.handleConnection(clientQueryToken);

      expect(mockAuthService.verifySession).toHaveBeenCalledWith("query-token");
    });
  });

  // ==========================================
  // start-transcription
  // ==========================================
  describe("handleStartTranscription", () => {
    beforeEach(async () => {
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      });
      await gateway.handleConnection(mockClient);
      vi.clearAllMocks();
    });

    it("should create a transcription session", () => {
      gateway.handleStartTranscription(mockClient, { language: "en" });

      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-started",
        expect.objectContaining({ sessionId: expect.any(String) })
      );
    });

    it("should create a session with optional language parameter", () => {
|
||||
gateway.handleStartTranscription(mockClient, { language: "fr" });
|
||||
|
||||
expect(mockClient.emit).toHaveBeenCalledWith(
|
||||
"transcription-started",
|
||||
expect.objectContaining({ sessionId: expect.any(String) })
|
||||
);
|
||||
});
|
||||
|
||||
it("should create a session with no options", () => {
|
||||
gateway.handleStartTranscription(mockClient, {});
|
||||
|
||||
expect(mockClient.emit).toHaveBeenCalledWith(
|
||||
"transcription-started",
|
||||
expect.objectContaining({ sessionId: expect.any(String) })
|
||||
);
|
||||
});
|
||||
|
||||
it("should emit error if client is not authenticated", () => {
|
||||
const unauthClient = createMockSocket();
|
||||
// Not connected through handleConnection, so no userId set
|
||||
|
||||
gateway.handleStartTranscription(unauthClient, {});
|
||||
|
||||
expect(unauthClient.emit).toHaveBeenCalledWith(
|
||||
"transcription-error",
|
||||
expect.objectContaining({ message: expect.any(String) })
|
||||
);
|
||||
});
|
||||
|
||||
it("should replace existing session if one already exists", () => {
|
||||
gateway.handleStartTranscription(mockClient, {});
|
||||
gateway.handleStartTranscription(mockClient, { language: "de" });
|
||||
|
||||
// Should have emitted transcription-started twice (no error)
|
||||
const startedCalls = (mockClient.emit as ReturnType<typeof vi.fn>).mock.calls.filter(
|
||||
(call: unknown[]) => call[0] === "transcription-started"
|
||||
);
|
||||
expect(startedCalls).toHaveLength(2);
|
||||
});
|
||||
});
|
||||
|
||||
// ==========================================
|
||||
// audio-chunk
|
||||
// ==========================================
|
||||
describe("handleAudioChunk", () => {
|
||||
beforeEach(async () => {
|
||||
mockAuthService.verifySession.mockResolvedValue({
|
||||
user: { id: "user-123" },
|
||||
session: { id: "session-123" },
|
||||
});
|
||||
mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
|
||||
userId: "user-123",
|
||||
workspaceId: "workspace-456",
|
||||
role: "MEMBER",
|
||||
});
|
||||
await gateway.handleConnection(mockClient);
|
||||
vi.clearAllMocks();
|
||||
gateway.handleStartTranscription(mockClient, {});
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it("should accumulate audio data in the session", () => {
|
||||
const chunk = Buffer.from("audio-data-1");
|
||||
gateway.handleAudioChunk(mockClient, chunk);
|
||||
|
||||
// No error emitted
|
||||
const errorCalls = (mockClient.emit as ReturnType<typeof vi.fn>).mock.calls.filter(
|
||||
(call: unknown[]) => call[0] === "transcription-error"
|
||||
);
|
||||
expect(errorCalls).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("should accept Uint8Array data and convert to Buffer", () => {
|
||||
const chunk = new Uint8Array([1, 2, 3, 4]);
|
||||
gateway.handleAudioChunk(mockClient, chunk);
|
||||
|
||||
const errorCalls = (mockClient.emit as ReturnType<typeof vi.fn>).mock.calls.filter(
|
||||
(call: unknown[]) => call[0] === "transcription-error"
|
||||
);
|
||||
expect(errorCalls).toHaveLength(0);
|
||||
});
|
||||
|
||||
it("should emit error if no active session exists", () => {
|
||||
const noSessionClient = createMockSocket({ id: "no-session" });
|
||||
noSessionClient.data = { userId: "user-123", workspaceId: "workspace-456" };
|
||||
|
||||
const chunk = Buffer.from("audio-data");
|
||||
gateway.handleAudioChunk(noSessionClient, chunk);
|
||||
|
||||
expect(noSessionClient.emit).toHaveBeenCalledWith(
|
||||
"transcription-error",
|
||||
expect.objectContaining({ message: expect.any(String) })
|
||||
);
|
||||
});
|
||||
|
||||
it("should emit error if client is not authenticated", () => {
|
||||
const unauthClient = createMockSocket({ id: "unauth" });
|
||||
// Not authenticated
|
||||
|
||||
const chunk = Buffer.from("audio-data");
|
||||
gateway.handleAudioChunk(unauthClient, chunk);
|
||||
|
||||
expect(unauthClient.emit).toHaveBeenCalledWith(
|
||||
"transcription-error",
|
||||
expect.objectContaining({ message: expect.any(String) })
|
||||
);
|
||||
});
|
||||
|
||||
it("should emit error when buffer size exceeds max upload size", () => {
|
||||
// Set a very small max upload size
|
||||
const smallConfig = createMockConfig();
|
||||
smallConfig.limits.maxUploadSize = 10;
|
||||
|
||||
const limitedGateway = new SpeechGateway(
|
||||
mockAuthService as unknown as AuthService,
|
||||
mockPrismaService as unknown as PrismaService,
|
||||
mockSpeechService as unknown as SpeechService,
|
||||
smallConfig
|
||||
);
|
||||
|
||||
// We need to manually set up the authenticated client in the new gateway
|
||||
const limitedClient = createMockSocket({ id: "limited-client" });
|
||||
limitedClient.data = { userId: "user-123", workspaceId: "workspace-456" };
|
||||
|
||||
      // Start the session directly (we bypassed handleConnection, which normally populates client data)
      limitedGateway.handleStartTranscription(limitedClient, {});
      vi.clearAllMocks();

      // Send a chunk that exceeds the limit
      const largeChunk = Buffer.alloc(20, "a");
      limitedGateway.handleAudioChunk(limitedClient, largeChunk);

      expect(limitedClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.stringContaining("exceeds") })
      );
    });

    it("should emit error when accumulated buffer size exceeds max upload size", () => {
      const smallConfig = createMockConfig();
      smallConfig.limits.maxUploadSize = 15;

      const limitedGateway = new SpeechGateway(
        mockAuthService as unknown as AuthService,
        mockPrismaService as unknown as PrismaService,
        mockSpeechService as unknown as SpeechService,
        smallConfig
      );

      const limitedClient = createMockSocket({ id: "limited-client-2" });
      limitedClient.data = { userId: "user-123", workspaceId: "workspace-456" };

      limitedGateway.handleStartTranscription(limitedClient, {});
      vi.clearAllMocks();

      // Send two chunks that together exceed the limit
      const chunk1 = Buffer.alloc(10, "a");
      const chunk2 = Buffer.alloc(10, "b");
      limitedGateway.handleAudioChunk(limitedClient, chunk1);
      limitedGateway.handleAudioChunk(limitedClient, chunk2);

      expect(limitedClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.stringContaining("exceeds") })
      );
    });
  });

  // ==========================================
  // stop-transcription
  // ==========================================
  describe("handleStopTranscription", () => {
    beforeEach(async () => {
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      });
      await gateway.handleConnection(mockClient);
      vi.clearAllMocks();
    });

    it("should transcribe accumulated audio and emit final result", async () => {
      gateway.handleStartTranscription(mockClient, { language: "en" });

      const chunk1 = Buffer.from("audio-part-1");
      const chunk2 = Buffer.from("audio-part-2");
      gateway.handleAudioChunk(mockClient, chunk1);
      gateway.handleAudioChunk(mockClient, chunk2);

      vi.clearAllMocks();

      const expectedResult: TranscriptionResult = {
        text: "Hello world",
        language: "en",
        durationSeconds: 2.5,
      };
      mockSpeechService.transcribe.mockResolvedValue(expectedResult);

      await gateway.handleStopTranscription(mockClient);

      // Should have called transcribe with concatenated buffer
      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(
        expect.any(Buffer),
        expect.objectContaining({})
      );

      // Should emit transcription-final
      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-final",
        expect.objectContaining({ text: "Hello world" })
      );
    });

    it("should pass language option to SpeechService.transcribe", async () => {
      gateway.handleStartTranscription(mockClient, { language: "fr" });
      gateway.handleAudioChunk(mockClient, Buffer.from("audio"));

      vi.clearAllMocks();

      await gateway.handleStopTranscription(mockClient);

      expect(mockSpeechService.transcribe).toHaveBeenCalledWith(
        expect.any(Buffer),
        expect.objectContaining({ language: "fr" })
      );
    });

    it("should clean up session after stop", async () => {
      gateway.handleStartTranscription(mockClient, {});
      gateway.handleAudioChunk(mockClient, Buffer.from("audio"));

      await gateway.handleStopTranscription(mockClient);

      vi.clearAllMocks();

      // Sending more audio after stop should error (no session)
      gateway.handleAudioChunk(mockClient, Buffer.from("more-audio"));

      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.any(String) })
      );
    });

    it("should emit transcription-error when transcription fails", async () => {
      gateway.handleStartTranscription(mockClient, {});
      gateway.handleAudioChunk(mockClient, Buffer.from("audio"));

      vi.clearAllMocks();

      mockSpeechService.transcribe.mockRejectedValue(new Error("STT service down"));

      await gateway.handleStopTranscription(mockClient);

      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.stringContaining("STT service down") })
      );
    });

    it("should emit error if no active session exists", async () => {
      await gateway.handleStopTranscription(mockClient);

      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.any(String) })
      );
    });

    it("should emit error if client is not authenticated", async () => {
      const unauthClient = createMockSocket({ id: "unauth-stop" });

      await gateway.handleStopTranscription(unauthClient);

      expect(unauthClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.any(String) })
      );
    });

    it("should emit error when stopping with no audio chunks received", async () => {
      gateway.handleStartTranscription(mockClient, {});

      vi.clearAllMocks();

      await gateway.handleStopTranscription(mockClient);

      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.stringContaining("No audio") })
      );
    });
  });

  // ==========================================
  // handleDisconnect
  // ==========================================
  describe("handleDisconnect", () => {
    beforeEach(async () => {
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      });
      await gateway.handleConnection(mockClient);
      vi.clearAllMocks();
    });

    it("should clean up active session on disconnect", () => {
      gateway.handleStartTranscription(mockClient, {});
      gateway.handleAudioChunk(mockClient, Buffer.from("audio"));

      gateway.handleDisconnect(mockClient);

      // Session should be gone. Verify by trying to add a chunk to a new
      // socket with the same ID (should error since session was cleaned up).
      const newClient = createMockSocket({ id: mockClient.id });
      newClient.data = { userId: "user-123", workspaceId: "workspace-456" };

      gateway.handleAudioChunk(newClient, Buffer.from("more"));

      expect(newClient.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.any(String) })
      );
    });

    it("should not throw when disconnecting client without active session", () => {
      expect(() => gateway.handleDisconnect(mockClient)).not.toThrow();
    });

    it("should not throw when disconnecting unauthenticated client", () => {
      const unauthClient = createMockSocket({ id: "unauth-disconnect" });
      expect(() => gateway.handleDisconnect(unauthClient)).not.toThrow();
    });
  });

  // ==========================================
  // Edge cases
  // ==========================================
  describe("edge cases", () => {
    beforeEach(async () => {
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-123" },
        session: { id: "session-123" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-123",
        workspaceId: "workspace-456",
        role: "MEMBER",
      });
      await gateway.handleConnection(mockClient);
      vi.clearAllMocks();
    });

    it("should handle multiple start-stop cycles for the same client", async () => {
      // First cycle
      gateway.handleStartTranscription(mockClient, {});
      gateway.handleAudioChunk(mockClient, Buffer.from("cycle-1"));
      await gateway.handleStopTranscription(mockClient);

      vi.clearAllMocks();

      // Second cycle
      gateway.handleStartTranscription(mockClient, { language: "de" });
      gateway.handleAudioChunk(mockClient, Buffer.from("cycle-2"));
      await gateway.handleStopTranscription(mockClient);

      expect(mockSpeechService.transcribe).toHaveBeenCalledTimes(1);
      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-final",
        expect.objectContaining({ text: "Hello world" })
      );
    });

    it("should isolate sessions between different clients", async () => {
      const client2 = createMockSocket({ id: "client-2" });
      client2.data = { userId: "user-456", workspaceId: "workspace-789" };

      // Client 2 also needs to be "connected"
      mockAuthService.verifySession.mockResolvedValue({
        user: { id: "user-456" },
        session: { id: "session-456" },
      });
      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
        userId: "user-456",
        workspaceId: "workspace-789",
        role: "MEMBER",
      });
      await gateway.handleConnection(client2);
      vi.clearAllMocks();

      // Start sessions for both clients
      gateway.handleStartTranscription(mockClient, {});
      gateway.handleStartTranscription(client2, {});

      // Send audio to client 1 only
      gateway.handleAudioChunk(mockClient, Buffer.from("audio-for-client-1"));

      // Stop client 2 (no audio)
      await gateway.handleStopTranscription(client2);

      // Client 2 should get an error (no audio received)
      expect(client2.emit).toHaveBeenCalledWith(
        "transcription-error",
        expect.objectContaining({ message: expect.stringContaining("No audio") })
      );

      vi.clearAllMocks();

      // Stop client 1 (has audio) -- should succeed
      await gateway.handleStopTranscription(mockClient);
      expect(mockSpeechService.transcribe).toHaveBeenCalled();
      expect(mockClient.emit).toHaveBeenCalledWith(
        "transcription-final",
        expect.objectContaining({ text: "Hello world" })
      );
    });
  });
});
381
apps/api/src/speech/speech.gateway.ts
Normal file
@@ -0,0 +1,381 @@
/**
 * SpeechGateway
 *
 * WebSocket gateway for real-time streaming transcription.
 * Uses a separate `/speech` namespace from the main WebSocket gateway.
 *
 * Protocol:
 * 1. Client connects with auth token in handshake
 * 2. Client emits `start-transcription` with optional { language }
 * 3. Client streams audio via `audio-chunk` events (Buffer/Uint8Array)
 * 4. Client emits `stop-transcription` to finalize
 * 5. Server responds with `transcription-final` containing the result
 *
 * Session management:
 * - One active transcription session per client
 * - Audio chunks accumulated in memory (Buffer array)
 * - On stop: chunks concatenated and sent to SpeechService.transcribe()
 * - Sessions cleaned up on disconnect
 *
 * Rate limiting:
 * - Total accumulated audio size is capped by config limits.maxUploadSize
 *
 * Issue #397
 */
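
// For orientation, a minimal client-side sketch of the protocol described above.
// Illustrative only (not part of this gateway); it assumes socket.io-client, a
// reachable API origin (API_URL here is a placeholder), and a session token
// obtained from the app's auth flow:
//
//   import { io } from "socket.io-client";
//
//   const socket = io(`${API_URL}/speech`, { auth: { token } });
//   socket.on("transcription-started", ({ sessionId }) => console.log(sessionId));
//   socket.on("transcription-final", ({ text }) => console.log(text));
//   socket.on("transcription-error", ({ message }) => console.error(message));
//
//   socket.emit("start-transcription", { language: "en" });
//   // stream audio as it is captured, then finalize:
//   socket.emit("audio-chunk", audioBuffer);
//   socket.emit("stop-transcription");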

import {
  WebSocketGateway as WSGateway,
  WebSocketServer,
  SubscribeMessage,
  OnGatewayConnection,
  OnGatewayDisconnect,
} from "@nestjs/websockets";
import { Logger, Inject } from "@nestjs/common";
import { Server, Socket } from "socket.io";
import { AuthService } from "../auth/auth.service";
import { PrismaService } from "../prisma/prisma.service";
import { SpeechService } from "./speech.service";
import { speechConfig, type SpeechConfig } from "./speech.config";

// ==========================================
// Types
// ==========================================

interface AuthenticatedSocket extends Socket {
  data: {
    userId?: string;
    workspaceId?: string;
  };
}

interface TranscriptionSession {
  chunks: Buffer[];
  totalSize: number;
  language: string | undefined;
  startedAt: Date;
}

interface StartTranscriptionPayload {
  language?: string;
}

// ==========================================
// Gateway
// ==========================================

@WSGateway({
  namespace: "/speech",
  cors: {
    origin: process.env.WEB_URL ?? "http://localhost:3000",
    credentials: true,
  },
})
export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
  @WebSocketServer()
  server!: Server;

  private readonly logger = new Logger(SpeechGateway.name);
  private readonly sessions = new Map<string, TranscriptionSession>();
  private readonly CONNECTION_TIMEOUT_MS = 5000;

  constructor(
    private readonly authService: AuthService,
    private readonly prisma: PrismaService,
    private readonly speechService: SpeechService,
    @Inject(speechConfig.KEY)
    private readonly config: SpeechConfig
  ) {}

  // ==========================================
  // Connection lifecycle
  // ==========================================

  /**
   * Authenticate client on connection using the same pattern as the main WebSocket gateway.
   * Extracts token from handshake, verifies session, and checks workspace membership.
   */
  async handleConnection(client: Socket): Promise<void> {
    const authenticatedClient = client as AuthenticatedSocket;

    const timeoutId = setTimeout(() => {
      if (!authenticatedClient.data.userId) {
        this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`);
        authenticatedClient.emit("transcription-error", {
          message: "Authentication timed out.",
        });
        authenticatedClient.disconnect();
      }
    }, this.CONNECTION_TIMEOUT_MS);

    try {
      const token = this.extractTokenFromHandshake(authenticatedClient);

      if (!token) {
        this.logger.warn(`Client ${authenticatedClient.id} connected without token`);
        authenticatedClient.emit("transcription-error", {
          message: "Authentication failed: no token provided.",
        });
        authenticatedClient.disconnect();
        clearTimeout(timeoutId);
        return;
      }

      const sessionData = await this.authService.verifySession(token);

      if (!sessionData) {
        this.logger.warn(`Client ${authenticatedClient.id} has invalid token`);
        authenticatedClient.emit("transcription-error", {
          message: "Authentication failed: invalid or expired token.",
        });
        authenticatedClient.disconnect();
        clearTimeout(timeoutId);
        return;
      }

      const user = sessionData.user as { id: string };
      const userId = user.id;

      const workspaceMembership = await this.prisma.workspaceMember.findFirst({
        where: { userId },
        select: { workspaceId: true, userId: true, role: true },
      });

      if (!workspaceMembership) {
        this.logger.warn(`User ${userId} has no workspace access`);
        authenticatedClient.emit("transcription-error", {
          message: "Authentication failed: no workspace access.",
        });
        authenticatedClient.disconnect();
        clearTimeout(timeoutId);
        return;
      }

      authenticatedClient.data.userId = userId;
      authenticatedClient.data.workspaceId = workspaceMembership.workspaceId;

      clearTimeout(timeoutId);
      this.logger.log(
        `Speech client ${authenticatedClient.id} connected (user: ${userId}, workspace: ${workspaceMembership.workspaceId})`
      );
    } catch (error) {
      clearTimeout(timeoutId);
      this.logger.error(
        `Authentication failed for speech client ${authenticatedClient.id}:`,
        error instanceof Error ? error.message : "Unknown error"
      );
      authenticatedClient.emit("transcription-error", {
        message: "Authentication failed: an unexpected error occurred.",
      });
      authenticatedClient.disconnect();
    }
  }

  /**
   * Clean up transcription session on client disconnect.
   */
  handleDisconnect(client: Socket): void {
    const authenticatedClient = client as AuthenticatedSocket;
    const sessionId = authenticatedClient.id;

    if (this.sessions.has(sessionId)) {
      this.sessions.delete(sessionId);
      this.logger.log(`Cleaned up transcription session for client ${sessionId}`);
    }

    this.logger.debug(`Speech client ${sessionId} disconnected`);
  }

  // ==========================================
  // Transcription events
  // ==========================================

  /**
   * Start a new transcription session for the client.
   * Replaces any existing session for this client.
   *
   * @param client - The connected socket client
   * @param payload - Optional parameters: { language?: string }
   */
  @SubscribeMessage("start-transcription")
  handleStartTranscription(client: Socket, payload: StartTranscriptionPayload): void {
    const authenticatedClient = client as AuthenticatedSocket;

    if (!authenticatedClient.data.userId) {
      authenticatedClient.emit("transcription-error", {
        message: "Not authenticated. Connect with a valid token.",
      });
      return;
    }

    const sessionId = authenticatedClient.id;

    // Clean up any existing session for this client
    if (this.sessions.has(sessionId)) {
      this.sessions.delete(sessionId);
      this.logger.debug(`Replaced existing session for client ${sessionId}`);
    }

    const language = payload.language;

    const session: TranscriptionSession = {
      chunks: [],
      totalSize: 0,
      language,
      startedAt: new Date(),
    };

    this.sessions.set(sessionId, session);

    authenticatedClient.emit("transcription-started", {
      sessionId,
      language,
    });

    this.logger.debug(
      `Transcription session started for client ${sessionId} (language: ${language ?? "auto"})`
    );
  }

  /**
   * Receive an audio chunk and accumulate it in the active session.
   * Enforces maximum buffer size from configuration.
   *
   * @param client - The connected socket client
   * @param data - Audio data as Buffer or Uint8Array
   */
  @SubscribeMessage("audio-chunk")
  handleAudioChunk(client: Socket, data: Buffer | Uint8Array): void {
    const authenticatedClient = client as AuthenticatedSocket;

    if (!authenticatedClient.data.userId) {
      authenticatedClient.emit("transcription-error", {
        message: "Not authenticated. Connect with a valid token.",
      });
      return;
    }

    const sessionId = authenticatedClient.id;
    const session = this.sessions.get(sessionId);

    if (!session) {
      authenticatedClient.emit("transcription-error", {
        message: "No active transcription session. Send start-transcription first.",
      });
      return;
    }

    const chunk = Buffer.isBuffer(data) ? data : Buffer.from(data);
    const newTotalSize = session.totalSize + chunk.length;

    if (newTotalSize > this.config.limits.maxUploadSize) {
      authenticatedClient.emit("transcription-error", {
        message: `Audio buffer size (${String(newTotalSize)} bytes) exceeds maximum allowed size (${String(this.config.limits.maxUploadSize)} bytes).`,
      });
      // Clean up the session on overflow
      this.sessions.delete(sessionId);
      return;
    }

    session.chunks.push(chunk);
    session.totalSize = newTotalSize;
  }

  /**
   * Stop the transcription session, concatenate audio chunks, and transcribe.
   * Emits `transcription-final` on success or `transcription-error` on failure.
   *
   * @param client - The connected socket client
   */
  @SubscribeMessage("stop-transcription")
  async handleStopTranscription(client: Socket): Promise<void> {
    const authenticatedClient = client as AuthenticatedSocket;

    if (!authenticatedClient.data.userId) {
      authenticatedClient.emit("transcription-error", {
        message: "Not authenticated. Connect with a valid token.",
      });
      return;
    }

    const sessionId = authenticatedClient.id;
    const session = this.sessions.get(sessionId);

    if (!session) {
      authenticatedClient.emit("transcription-error", {
        message: "No active transcription session. Send start-transcription first.",
      });
      return;
    }

    // Always remove session before processing (prevents double-stop)
    this.sessions.delete(sessionId);

    if (session.chunks.length === 0) {
      authenticatedClient.emit("transcription-error", {
        message: "No audio data received. Send audio-chunk events before stopping.",
      });
      return;
    }

    try {
      const audioBuffer = Buffer.concat(session.chunks);
      const options: { language?: string } = {};
      if (session.language) {
        options.language = session.language;
      }

      this.logger.debug(
        `Transcribing ${String(audioBuffer.length)} bytes for client ${sessionId} (language: ${session.language ?? "auto"})`
      );

      const result = await this.speechService.transcribe(audioBuffer, options);

      authenticatedClient.emit("transcription-final", {
        text: result.text,
        language: result.language,
        durationSeconds: result.durationSeconds,
        confidence: result.confidence,
        segments: result.segments,
      });

      this.logger.debug(`Transcription complete for client ${sessionId}: "${result.text}"`);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`Transcription failed for client ${sessionId}: ${message}`);
      authenticatedClient.emit("transcription-error", {
        message: `Transcription failed: ${message}`,
      });
    }
  }

  // ==========================================
  // Private helpers
  // ==========================================

  /**
   * Extract authentication token from Socket.IO handshake.
   * Checks auth.token, query.token, and Authorization header (in that order).
   */
  private extractTokenFromHandshake(client: Socket): string | undefined {
    const authToken = client.handshake.auth.token as unknown;
    if (typeof authToken === "string" && authToken.length > 0) {
      return authToken;
    }

    const queryToken = client.handshake.query.token as unknown;
    if (typeof queryToken === "string" && queryToken.length > 0) {
      return queryToken;
    }

    const authHeader = client.handshake.headers.authorization as unknown;
    if (typeof authHeader === "string") {
      const parts = authHeader.split(" ");
      const [type, token] = parts;
      if (type === "Bearer" && token) {
        return token;
      }
    }

    return undefined;
  }
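
  // Accepted handshake shapes, for reference (illustrative client-side calls,
  // matching the lookup order above; not part of this gateway's API surface):
  //   io("/speech", { auth: { token: "<session-token>" } })       -> handshake.auth.token
  //   io("/speech", { query: { token: "<session-token>" } })      -> handshake.query.token
  //   io("/speech", { extraHeaders: { Authorization: "Bearer <session-token>" } })
  //                                                                -> handshake.headers.authorization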
}
933
apps/api/src/speech/speech.integration.spec.ts
Normal file
@@ -0,0 +1,933 @@
/**
 * Speech Services E2E Integration Tests
 *
 * Tests the full speech pipeline from API endpoints through to mocked external providers.
 * Covers REST transcription, synthesis, provider fallback, WebSocket streaming,
 * audio validation, file size limits, authentication, voice listing, and health checks.
 *
 * Uses NestJS testing module with supertest for HTTP testing and direct gateway
 * invocation for WebSocket streaming tests.
 *
 * Issue #405
 */
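
// To run only this suite locally (assuming the repo's standard Vitest setup):
//
//   npx vitest run apps/api/src/speech/speech.integration.spec.ts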

import { describe, it, expect, beforeAll, beforeEach, afterAll, vi } from "vitest";
import { Test } from "@nestjs/testing";
import {
  type INestApplication,
  type CanActivate,
  type ExecutionContext,
  UnauthorizedException,
  ValidationPipe,
} from "@nestjs/common";
import request from "supertest";
import type { App } from "supertest/types";

import { SpeechController } from "./speech.controller";
import { SpeechService } from "./speech.service";
import { SpeechGateway } from "./speech.gateway";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { speechConfig } from "./speech.config";
import type { SpeechConfig } from "./speech.config";
import type { ISTTProvider } from "./interfaces/stt-provider.interface";
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
import type {
  TranscriptionResult,
  SynthesisResult,
  VoiceInfo,
  SpeechTier,
} from "./interfaces/speech-types";
import { AuthGuard } from "../auth/guards/auth.guard";
import { WorkspaceGuard, PermissionGuard } from "../common/guards";
import { AuthService } from "../auth/auth.service";
import { PrismaService } from "../prisma/prisma.service";

// ==========================================
// Test Fixtures
// ==========================================

/**
 * A 1 KiB zero-filled buffer standing in for an uploaded audio file.
 * Not real audio; the mocked providers never inspect its contents.
 */
const TEST_AUDIO_BUFFER = Buffer.alloc(1024, 0);

const MOCK_WORKSPACE_ID = "550e8400-e29b-41d4-a716-446655440001";
const MOCK_USER_ID = "550e8400-e29b-41d4-a716-446655440002";

const MOCK_USER = {
  id: MOCK_USER_ID,
  email: "test@example.com",
  name: "Test User",
  workspaceId: MOCK_WORKSPACE_ID,
};

const MOCK_TRANSCRIPTION_RESULT: TranscriptionResult = {
  text: "Hello, this is a test transcription.",
  language: "en",
  durationSeconds: 3.2,
  confidence: 0.97,
  segments: [
    { text: "Hello, this is a test transcription.", start: 0, end: 3.2, confidence: 0.97 },
  ],
};

const MOCK_SYNTHESIS_RESULT: SynthesisResult = {
  audio: Buffer.from("fake-synthesized-audio-data-mp3"),
  format: "mp3",
  voice: "af_heart",
  tier: "default" as SpeechTier,
  durationSeconds: 2.1,
};

const MOCK_VOICES: VoiceInfo[] = [
  { id: "af_heart", name: "Heart", language: "en", tier: "default", isDefault: true },
  { id: "af_sky", name: "Sky", language: "en", tier: "default", isDefault: false },
  {
    id: "chatterbox-default",
    name: "Chatterbox",
    language: "en",
    tier: "premium",
    isDefault: true,
  },
];

const MOCK_SPEECH_CONFIG: SpeechConfig = {
  stt: {
    enabled: true,
    baseUrl: "http://speaches:8000/v1",
    model: "test-model",
    language: "en",
  },
  tts: {
    default: { enabled: true, url: "http://kokoro:8880/v1", voice: "af_heart", format: "mp3" },
    premium: { enabled: true, url: "http://chatterbox:8881/v1" },
    fallback: { enabled: true, url: "http://openedai:8000/v1" },
  },
  limits: {
    maxUploadSize: 25_000_000,
    maxDurationSeconds: 600,
    maxTextLength: 4096,
  },
};

// ==========================================
// Mock Providers
// ==========================================

function createMockSTTProvider(): ISTTProvider {
  return {
    name: "mock-stt",
    transcribe: vi.fn().mockResolvedValue(MOCK_TRANSCRIPTION_RESULT),
    isHealthy: vi.fn().mockResolvedValue(true),
  };
}

function createMockTTSProvider(tier: SpeechTier, name: string): ITTSProvider {
  const voices = MOCK_VOICES.filter((v) => v.tier === tier);
  return {
    name,
    tier,
    synthesize: vi.fn().mockResolvedValue({
      ...MOCK_SYNTHESIS_RESULT,
      tier,
    }),
    listVoices: vi.fn().mockResolvedValue(voices),
    isHealthy: vi.fn().mockResolvedValue(true),
  };
}

// ==========================================
// Test Guards
// ==========================================

/**
 * Conditional auth guard for testing.
 * Authenticates requests that carry `Authorization: Bearer test-token`
 * or an equivalent `better-auth.session_token` cookie.
 * Rejects all others with UnauthorizedException.
 */
class TestAuthGuard implements CanActivate {
  canActivate(context: ExecutionContext): boolean {
    const req = context.switchToHttp().getRequest<{
      headers: Record<string, string | undefined>;
      user?: typeof MOCK_USER;
      cookies?: Record<string, string>;
    }>();
    const authHeader = req.headers.authorization;
    const cookieToken = req.cookies?.["better-auth.session_token"];

    if (authHeader === "Bearer test-token" || cookieToken === "test-token") {
      req.user = { ...MOCK_USER };
      return true;
    }

    throw new UnauthorizedException("No authentication token provided");
  }
}

/**
 * Test workspace guard that attaches a mock workspace to the request.
 */
class TestWorkspaceGuard implements CanActivate {
  canActivate(context: ExecutionContext): boolean {
    const req = context.switchToHttp().getRequest<{
      workspace?: { id: string };
      headers: Record<string, string | undefined>;
    }>();
    const workspaceId = req.headers["x-workspace-id"] ?? MOCK_WORKSPACE_ID;
    req.workspace = { id: workspaceId as string };
    return true;
  }
}

/**
 * Test permission guard that always allows access.
 */
class TestPermissionGuard implements CanActivate {
  canActivate(): boolean {
    return true;
  }
}

// ==========================================
// Tests
// ==========================================

describe("Speech Services E2E Integration", () => {
  let app: INestApplication;
  let mockSTTProvider: ISTTProvider;
  let defaultTTSProvider: ITTSProvider;
  let premiumTTSProvider: ITTSProvider;
  let fallbackTTSProvider: ITTSProvider;
  let ttsProvidersMap: Map<SpeechTier, ITTSProvider>;

  // WebSocket gateway test dependencies
  let speechGateway: SpeechGateway;
  let mockSpeechService: SpeechService;

  beforeAll(async () => {
    // Create mock providers
    mockSTTProvider = createMockSTTProvider();
    defaultTTSProvider = createMockTTSProvider("default", "mock-kokoro");
    premiumTTSProvider = createMockTTSProvider("premium", "mock-chatterbox");
    fallbackTTSProvider = createMockTTSProvider("fallback", "mock-piper");

    ttsProvidersMap = new Map<SpeechTier, ITTSProvider>([
      ["default", defaultTTSProvider],
      ["premium", premiumTTSProvider],
      ["fallback", fallbackTTSProvider],
    ]);

    const moduleRef = await Test.createTestingModule({
      controllers: [SpeechController],
      providers: [
        SpeechService,
        {
          provide: speechConfig.KEY,
          useValue: MOCK_SPEECH_CONFIG,
        },
        {
          provide: STT_PROVIDER,
          useValue: mockSTTProvider,
        },
        {
          provide: TTS_PROVIDERS,
          useValue: ttsProvidersMap,
        },
        // Gateway dependencies (not tested via HTTP but needed for DI)
        {
          provide: SpeechGateway,
          useFactory: (
            authService: AuthService,
            prisma: PrismaService,
            speechService: SpeechService,
            config: SpeechConfig
          ): SpeechGateway => {
            return new SpeechGateway(authService, prisma, speechService, config);
          },
          inject: [AuthService, PrismaService, SpeechService, speechConfig.KEY],
        },
        {
          provide: AuthService,
          useValue: {
            verifySession: vi.fn().mockResolvedValue({
              user: { id: MOCK_USER_ID, email: "test@example.com", name: "Test User" },
              session: { id: "test-session" },
            }),
          },
        },
        {
          provide: PrismaService,
          useValue: {
            workspaceMember: {
              findFirst: vi.fn().mockResolvedValue({
                userId: MOCK_USER_ID,
                workspaceId: MOCK_WORKSPACE_ID,
                role: "MEMBER",
              }),
            },
          },
        },
      ],
    })
      .overrideGuard(AuthGuard)
      .useClass(TestAuthGuard)
      .overrideGuard(WorkspaceGuard)
      .useClass(TestWorkspaceGuard)
      .overrideGuard(PermissionGuard)
      .useClass(TestPermissionGuard)
      .compile();

    app = moduleRef.createNestApplication();
    app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true }));
    await app.init();

    // Capture references for WebSocket tests
    speechGateway = moduleRef.get(SpeechGateway);
    mockSpeechService = moduleRef.get(SpeechService);
  });

  beforeEach(() => {
    vi.clearAllMocks();

    // Reset default mock behaviors
    (mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
      MOCK_TRANSCRIPTION_RESULT
    );
    (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
      ...MOCK_SYNTHESIS_RESULT,
      tier: "default",
    });
    (premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
      ...MOCK_SYNTHESIS_RESULT,
      tier: "premium",
    });
    (fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
      ...MOCK_SYNTHESIS_RESULT,
      tier: "fallback",
    });
    (defaultTTSProvider.listVoices as ReturnType<typeof vi.fn>).mockResolvedValue(
      MOCK_VOICES.filter((v) => v.tier === "default")
    );
    (premiumTTSProvider.listVoices as ReturnType<typeof vi.fn>).mockResolvedValue(
      MOCK_VOICES.filter((v) => v.tier === "premium")
    );
    (fallbackTTSProvider.listVoices as ReturnType<typeof vi.fn>).mockResolvedValue([]);
  });

  afterAll(async () => {
    if (app) {
      await app.close();
    }
  });

  // ==========================================
  // Scenario 1: REST Transcription
  // ==========================================
  describe("Scenario 1: REST Transcription (POST /speech/transcribe)", () => {
    it("should transcribe an uploaded audio file and return the transcription result", async () => {
      const response = await request(app.getHttpServer() as App)
        .post("/speech/transcribe")
        .set("Authorization", "Bearer test-token")
        .attach("file", TEST_AUDIO_BUFFER, {
          filename: "test.wav",
          contentType: "audio/wav",
        })
        .expect(201);

      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toMatchObject({
        text: MOCK_TRANSCRIPTION_RESULT.text,
        language: MOCK_TRANSCRIPTION_RESULT.language,
        durationSeconds: MOCK_TRANSCRIPTION_RESULT.durationSeconds,
        confidence: MOCK_TRANSCRIPTION_RESULT.confidence,
      });
      expect(response.body.data.segments).toBeDefined();
      expect(response.body.data.segments).toHaveLength(1);

      expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
        expect.any(Buffer),
        expect.objectContaining({ mimeType: "audio/wav" })
      );
    });

    it("should pass optional transcription parameters to the service", async () => {
      const response = await request(app.getHttpServer() as App)
        .post("/speech/transcribe")
        .set("Authorization", "Bearer test-token")
        .attach("file", TEST_AUDIO_BUFFER, {
          filename: "test.mp3",
          contentType: "audio/mpeg",
        })
        .field("language", "fr")
        .field("model", "whisper-large-v3")
        .field("prompt", "Meeting transcript")
        .field("temperature", "0.3")
        .expect(201);

      expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);

      expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
        expect.any(Buffer),
        expect.objectContaining({
          mimeType: "audio/mpeg",
          language: "fr",
          model: "whisper-large-v3",
          prompt: "Meeting transcript",
          temperature: 0.3,
        })
      );
    });

    it("should reject request without an audio file", async () => {
      const response = await request(app.getHttpServer() as App)
        .post("/speech/transcribe")
        .set("Authorization", "Bearer test-token")
        .expect(400);

      expect(response.body).toHaveProperty("message");
    });
  });

  // ==========================================
  // Scenario 2: REST Synthesis
  // ==========================================
  describe("Scenario 2: REST Synthesis (POST /speech/synthesize)", () => {
    it("should synthesize text and return audio binary response", async () => {
      const response = await request(app.getHttpServer() as App)
        .post("/speech/synthesize")
        .set("Authorization", "Bearer test-token")
        .send({ text: "Hello, world!" })
        .expect(201);

      // Response should be binary audio
      expect(response.headers["content-type"]).toContain("audio/mpeg");
      expect(response.headers["content-disposition"]).toContain("attachment");
      expect(response.headers["content-disposition"]).toContain("speech.mp3");
      expect(response.body).toBeDefined();
      expect(Buffer.isBuffer(response.body)).toBe(true);
    });

    it("should pass voice, speed, format, and tier options to the service", async () => {
      (defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
        audio: Buffer.from("wav-audio-data"),
        format: "wav",
        voice: "af_sky",
        tier: "default",
        durationSeconds: 1.5,
      });

      const response = await request(app.getHttpServer() as App)
        .post("/speech/synthesize")
        .set("Authorization", "Bearer test-token")
        .send({
          text: "Test with options",
          voice: "af_sky",
          speed: 1.5,
          format: "wav",
        })
        .expect(201);

      expect(response.headers["content-type"]).toContain("audio/wav");
      expect(response.headers["content-disposition"]).toContain("speech.wav");
    });

    it("should accept empty text (validation delegated to service)", async () => {
      // The SynthesizeDto allows empty strings (no @IsNotEmpty decorator).
      // The service/provider handles empty text semantics.
      const response = await request(app.getHttpServer() as App)
        .post("/speech/synthesize")
        .set("Authorization", "Bearer test-token")
        .send({ text: "" })
        .expect(201);

      expect(response.headers["content-type"]).toContain("audio/mpeg");
    });

    it("should reject missing text field", async () => {
      await request(app.getHttpServer() as App)
        .post("/speech/synthesize")
        .set("Authorization", "Bearer test-token")
        .send({})
        .expect(400);
    });
  });

  // ==========================================
  // Scenario 3: Provider Fallback
  // ==========================================
  describe("Scenario 3: Provider Fallback", () => {
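    // The order exercised below (premium -> default -> fallback) is assumed to live
    // in SpeechService.synthesize as a try-each-tier loop; a minimal sketch, assuming
    // the provider map wired up in this module (not the actual implementation):
    //
    //   for (const tier of fallbackOrder(requestedTier)) {
    //     const provider = providers.get(tier);
    //     if (!provider) continue;
    //     try {
    //       return await provider.synthesize(text, options);
    //     } catch {
    //       // try the next tier
    //     }
    //   }
    //   throw new ServiceUnavailableException("All TTS providers failed");
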
it("should fall back from premium to default when premium fails", async () => {
|
||||
// Make premium provider fail
|
||||
(premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
|
||||
new Error("Premium provider unavailable")
|
||||
);
|
||||
|
||||
// Default provider should succeed
|
||||
(defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
|
||||
audio: Buffer.from("fallback-audio"),
|
||||
format: "mp3",
|
||||
voice: "af_heart",
|
||||
tier: "default",
|
||||
});
|
||||
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/synthesize")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.send({ text: "Fallback test", tier: "premium" })
|
||||
.expect(201);
|
||||
|
||||
// Premium was attempted first
|
||||
expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
|
||||
// Then default succeeded
|
||||
expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
|
||||
expect(response.headers["content-type"]).toContain("audio/mpeg");
|
||||
});
|
||||
|
||||
it("should fall back through entire chain: premium -> default -> fallback", async () => {
|
||||
// Make premium and default fail
|
||||
(premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
|
||||
new Error("Premium down")
|
||||
);
|
||||
(defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
|
||||
new Error("Default down")
|
||||
);
|
||||
|
||||
// Fallback should succeed
|
||||
(fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockResolvedValue({
|
||||
audio: Buffer.from("fallback-piper-audio"),
|
||||
format: "mp3",
|
||||
voice: "piper-default",
|
||||
tier: "fallback",
|
||||
});
|
||||
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/synthesize")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.send({ text: "Full fallback chain test", tier: "premium" })
|
||||
.expect(201);
|
||||
|
||||
expect(premiumTTSProvider.synthesize).toHaveBeenCalled();
|
||||
expect(defaultTTSProvider.synthesize).toHaveBeenCalled();
|
||||
expect(fallbackTTSProvider.synthesize).toHaveBeenCalled();
|
||||
expect(response.headers["content-type"]).toContain("audio/mpeg");
|
||||
});
|
||||
|
||||
it("should return 503 when all TTS providers fail", async () => {
|
||||
(premiumTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
|
||||
new Error("Premium down")
|
||||
);
|
||||
(defaultTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
|
||||
new Error("Default down")
|
||||
);
|
||||
(fallbackTTSProvider.synthesize as ReturnType<typeof vi.fn>).mockRejectedValue(
|
||||
new Error("Fallback down")
|
||||
);
|
||||
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/synthesize")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.send({ text: "All providers down", tier: "premium" })
|
||||
.expect(503);
|
||||
|
||||
expect(response.body).toHaveProperty("message");
|
||||
expect(response.body.message).toContain("All TTS providers failed");
|
||||
});
|
||||
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 4: WebSocket Streaming Transcription
|
||||
// ==========================================
|
||||
describe("Scenario 4: WebSocket Streaming Transcription", () => {
|
||||
interface MockSocket {
|
||||
id: string;
|
||||
join: ReturnType<typeof vi.fn>;
|
||||
leave: ReturnType<typeof vi.fn>;
|
||||
emit: ReturnType<typeof vi.fn>;
|
||||
disconnect: ReturnType<typeof vi.fn>;
|
||||
data: { userId?: string; workspaceId?: string };
|
||||
handshake: {
|
||||
auth: Record<string, unknown>;
|
||||
query: Record<string, unknown>;
|
||||
headers: Record<string, unknown>;
|
||||
};
|
||||
}
|
||||
|
||||
function createTestSocket(overrides?: Partial<MockSocket>): MockSocket {
|
||||
return {
|
||||
id: "e2e-test-socket",
|
||||
join: vi.fn(),
|
||||
leave: vi.fn(),
|
||||
emit: vi.fn(),
|
||||
disconnect: vi.fn(),
|
||||
data: {},
|
||||
handshake: {
|
||||
auth: { token: "valid-token" },
|
||||
query: {},
|
||||
headers: {},
|
||||
},
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
it("should complete the full streaming transcription lifecycle", async () => {
|
||||
const client = createTestSocket();
|
||||
// Authenticate the client
|
||||
await speechGateway.handleConnection(client as never);
|
||||
|
||||
expect(client.data.userId).toBe(MOCK_USER_ID);
|
||||
expect(client.data.workspaceId).toBe(MOCK_WORKSPACE_ID);
|
||||
expect(client.disconnect).not.toHaveBeenCalled();
|
||||
|
||||
// Start transcription session
|
||||
speechGateway.handleStartTranscription(client as never, { language: "en" });
|
||||
|
||||
expect(client.emit).toHaveBeenCalledWith(
|
||||
"transcription-started",
|
||||
expect.objectContaining({ sessionId: "e2e-test-socket" })
|
||||
);
|
||||
|
||||
// Send audio chunks
|
||||
const chunk1 = Buffer.from("audio-data-chunk-1");
|
||||
const chunk2 = Buffer.from("audio-data-chunk-2");
|
||||
const chunk3 = Buffer.from("audio-data-chunk-3");
|
||||
|
||||
speechGateway.handleAudioChunk(client as never, chunk1);
|
||||
speechGateway.handleAudioChunk(client as never, chunk2);
|
||||
speechGateway.handleAudioChunk(client as never, chunk3);
|
||||
|
||||
// No errors should have been emitted for chunks
|
||||
const errorCalls = client.emit.mock.calls.filter(
|
||||
(call: unknown[]) => call[0] === "transcription-error"
|
||||
);
|
||||
expect(errorCalls).toHaveLength(0);
|
||||
|
||||
vi.clearAllMocks();
|
||||
(mockSTTProvider.transcribe as ReturnType<typeof vi.fn>).mockResolvedValue(
|
||||
MOCK_TRANSCRIPTION_RESULT
|
||||
);
|
||||
|
||||
// Stop transcription - should trigger the full transcription pipeline
|
||||
await speechGateway.handleStopTranscription(client as never);
|
||||
|
||||
// Verify transcription was called with concatenated audio
|
||||
expect(mockSTTProvider.transcribe).toHaveBeenCalledWith(
|
||||
expect.any(Buffer),
|
||||
expect.objectContaining({ language: "en" })
|
||||
);
|
||||
|
||||
// Verify the final result was emitted
|
||||
expect(client.emit).toHaveBeenCalledWith(
|
||||
"transcription-final",
|
||||
expect.objectContaining({
|
||||
text: MOCK_TRANSCRIPTION_RESULT.text,
|
||||
language: "en",
|
||||
durationSeconds: 3.2,
|
||||
confidence: 0.97,
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it("should clean up session on disconnect", async () => {
|
||||
const client = createTestSocket({ id: "disconnect-test" });
|
||||
await speechGateway.handleConnection(client as never);
|
||||
|
||||
speechGateway.handleStartTranscription(client as never, {});
|
||||
speechGateway.handleAudioChunk(client as never, Buffer.from("data"));
|
||||
|
||||
// Disconnect
|
||||
speechGateway.handleDisconnect(client as never);
|
||||
|
||||
// Trying to send more chunks should fail (session cleaned up)
|
||||
vi.clearAllMocks();
|
||||
speechGateway.handleAudioChunk(client as never, Buffer.from("more-data"));
|
||||
|
||||
expect(client.emit).toHaveBeenCalledWith(
|
||||
"transcription-error",
|
||||
expect.objectContaining({
|
||||
message: expect.stringContaining("No active transcription session"),
|
||||
})
|
||||
);
|
||||
});
|
||||
|
||||
it("should reject unauthenticated WebSocket clients", async () => {
|
||||
const client = createTestSocket({
|
||||
id: "unauth-ws-client",
|
||||
handshake: { auth: {}, query: {}, headers: {} },
|
||||
});
|
||||
|
||||
await speechGateway.handleConnection(client as never);
|
||||
|
||||
expect(client.disconnect).toHaveBeenCalled();
|
||||
expect(client.data.userId).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 5: Audio Validation (Invalid MIME Type)
|
||||
// ==========================================
|
||||
describe("Scenario 5: Audio Validation", () => {
|
||||
it("should reject files with unsupported MIME types", async () => {
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/transcribe")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.attach("file", Buffer.from("not-audio"), {
|
||||
filename: "document.pdf",
|
||||
contentType: "application/pdf",
|
||||
})
|
||||
.expect(400);
|
||||
|
||||
expect(response.body).toHaveProperty("message");
|
||||
expect(response.body.message).toContain("Unsupported audio format");
|
||||
expect(response.body.message).toContain("application/pdf");
|
||||
});
|
||||
|
||||
it("should reject files with text/plain MIME type", async () => {
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/transcribe")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.attach("file", Buffer.from("plain text content"), {
|
||||
filename: "notes.txt",
|
||||
contentType: "text/plain",
|
||||
})
|
||||
.expect(400);
|
||||
|
||||
expect(response.body.message).toContain("Unsupported audio format");
|
||||
});
|
||||
|
||||
it("should reject video MIME types", async () => {
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/transcribe")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.attach("file", Buffer.from("video-data"), {
|
||||
filename: "video.mp4",
|
||||
contentType: "video/mp4",
|
||||
})
|
||||
.expect(400);
|
||||
|
||||
expect(response.body.message).toContain("Unsupported audio format");
|
||||
});
|
||||
|
||||
it("should accept valid audio MIME types", async () => {
|
||||
const validMimeTypes = [
|
||||
{ mime: "audio/wav", ext: "wav" },
|
||||
{ mime: "audio/mpeg", ext: "mp3" },
|
||||
{ mime: "audio/webm", ext: "webm" },
|
||||
{ mime: "audio/ogg", ext: "ogg" },
|
||||
{ mime: "audio/flac", ext: "flac" },
|
||||
];
|
||||
|
||||
for (const { mime, ext } of validMimeTypes) {
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/transcribe")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.attach("file", TEST_AUDIO_BUFFER, {
|
||||
filename: `test.${ext}`,
|
||||
contentType: mime,
|
||||
})
|
||||
.expect(201);
|
||||
|
||||
expect(response.body).toHaveProperty("data");
|
||||
expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// ==========================================
|
||||
// Scenario 6: File Size Limits
|
||||
// ==========================================
|
||||
describe("Scenario 6: File Size Limits", () => {
|
||||
it("should reject files exceeding the maximum upload size (25 MB)", async () => {
|
||||
// Create a buffer slightly over the 25 MB limit
|
||||
const oversizedBuffer = Buffer.alloc(25_000_001, 0);
|
||||
|
||||
const response = await request(app.getHttpServer() as App)
|
||||
.post("/speech/transcribe")
|
||||
.set("Authorization", "Bearer test-token")
|
||||
.attach("file", oversizedBuffer, {
|
||||
filename: "large-audio.wav",
|
||||
contentType: "audio/wav",
|
||||
})
|
||||
.expect(400);
|
||||
|
||||
expect(response.body).toHaveProperty("message");
|
||||
expect(response.body.message).toContain("exceeds maximum allowed size");
|
||||
});
|
||||
|
||||
it("should accept files within the size limit", async () => {
|
||||
      // Create a small buffer well under the 25 MB limit
      const maxBuffer = Buffer.alloc(1024, 0);

      const response = await request(app.getHttpServer() as App)
        .post("/speech/transcribe")
        .set("Authorization", "Bearer test-token")
        .attach("file", maxBuffer, {
          filename: "acceptable-audio.wav",
          contentType: "audio/wav",
        })
        .expect(201);

      expect(response.body).toHaveProperty("data");
    });
  });

  // ==========================================
  // Scenario 7: Authentication
  // ==========================================
  describe("Scenario 7: Authentication", () => {
    it("should reject POST /speech/transcribe without authentication", async () => {
      const response = await request(app.getHttpServer() as App)
        .post("/speech/transcribe")
        .attach("file", TEST_AUDIO_BUFFER, {
          filename: "test.wav",
          contentType: "audio/wav",
        })
        .expect(401);

      expect(response.body).toHaveProperty("message");
      expect(response.body.message).toContain("No authentication token provided");
    });

    it("should reject POST /speech/synthesize without authentication", async () => {
      const response = await request(app.getHttpServer() as App)
        .post("/speech/synthesize")
        .send({ text: "Hello" })
        .expect(401);

      expect(response.body.message).toContain("No authentication token provided");
    });

    it("should reject GET /speech/voices without authentication", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/voices")
        .expect(401);

      expect(response.body.message).toContain("No authentication token provided");
    });

    it("should reject GET /speech/health without authentication", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/health")
        .expect(401);

      expect(response.body.message).toContain("No authentication token provided");
    });

    it("should reject requests with an invalid token", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/voices")
        .set("Authorization", "Bearer invalid-token-xyz")
        .expect(401);

      expect(response.body.message).toContain("No authentication token provided");
    });
  });

  // ==========================================
  // Scenario 8: Voice Listing
  // ==========================================
  describe("Scenario 8: Voice Listing (GET /speech/voices)", () => {
    it("should return all voices when no tier filter is provided", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/voices")
        .set("Authorization", "Bearer test-token")
        .expect(200);

      expect(response.body).toHaveProperty("data");
      expect(Array.isArray(response.body.data)).toBe(true);

      // Should have voices from all providers that returned voices
      const voices = response.body.data as VoiceInfo[];
      expect(voices.length).toBeGreaterThan(0);

      // Verify voice structure
      for (const voice of voices) {
        expect(voice).toHaveProperty("id");
        expect(voice).toHaveProperty("name");
        expect(voice).toHaveProperty("tier");
      }
    });

    it("should filter voices by tier when tier query param is provided", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/voices?tier=default")
        .set("Authorization", "Bearer test-token")
        .expect(200);

      const voices = response.body.data as VoiceInfo[];
      expect(voices.length).toBeGreaterThan(0);

      for (const voice of voices) {
        expect(voice.tier).toBe("default");
      }

      expect(defaultTTSProvider.listVoices).toHaveBeenCalled();
    });

    it("should return empty array for tier with no voices", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/voices?tier=fallback")
        .set("Authorization", "Bearer test-token")
        .expect(200);

      expect(response.body.data).toEqual([]);
    });

    it("should include voice metadata (id, name, language, tier, isDefault)", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/voices?tier=default")
        .set("Authorization", "Bearer test-token")
        .expect(200);

      const voices = response.body.data as VoiceInfo[];
      const defaultVoice = voices.find((v) => v.isDefault === true);

      expect(defaultVoice).toBeDefined();
      expect(defaultVoice).toMatchObject({
        id: "af_heart",
        name: "Heart",
        language: "en",
        tier: "default",
        isDefault: true,
      });
    });
  });

  // ==========================================
  // Scenario 9: Health Check
  // ==========================================
  describe("Scenario 9: Health Check (GET /speech/health)", () => {
    it("should return health status for both STT and TTS providers", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/health")
        .set("Authorization", "Bearer test-token")
        .expect(200);

      expect(response.body).toHaveProperty("data");
      expect(response.body.data).toHaveProperty("stt");
      expect(response.body.data).toHaveProperty("tts");

      expect(response.body.data.stt).toHaveProperty("available");
      expect(response.body.data.tts).toHaveProperty("available");

      // Both should be available since we have mock providers registered and config enabled
      expect(response.body.data.stt.available).toBe(true);
      expect(response.body.data.tts.available).toBe(true);
    });

    it("should return consistent health check format", async () => {
      const response = await request(app.getHttpServer() as App)
        .get("/speech/health")
        .set("Authorization", "Bearer test-token")
        .expect(200);

      // Verify the response matches the expected shape
      expect(response.body).toEqual({
        data: {
          stt: { available: expect.any(Boolean) },
          tts: { available: expect.any(Boolean) },
        },
      });
    });
  });
});
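Reviewer note: the tests above pin down only the outbound gateway events (transcription-started, transcription-error, transcription-final) and the handshake auth shape. A rough client-side sketch of the streaming flow follows; the inbound event names are assumptions, since the gateway's message bindings are not visible in this diff.

// Hypothetical socket.io client; "start-transcription", "audio-chunk", and
// "stop-transcription" are assumed inbound event names, not confirmed here.
import { io } from "socket.io-client";

const socket = io("http://localhost:3000", {
  auth: { token: "valid-token" }, // same handshake.auth shape as createTestSocket()
});

socket.on("transcription-started", (payload: { sessionId: string }) => {
  console.log("session started:", payload.sessionId);
});
socket.on("transcription-final", (payload: { text: string }) => {
  console.log("transcript:", payload.text);
});
socket.on("transcription-error", (payload: { message: string }) => {
  console.error("transcription failed:", payload.message);
});

socket.emit("start-transcription", { language: "en" });
socket.emit("audio-chunk", new Uint8Array([1, 2, 3])); // repeat per captured audio chunk
socket.emit("stop-transcription");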
82
apps/api/src/speech/speech.module.ts
Normal file
@@ -0,0 +1,82 @@
/**
 * SpeechModule
 *
 * NestJS module for speech-to-text (STT) and text-to-speech (TTS) services.
 * Provides a provider abstraction layer with graceful fallback for TTS tiers.
 *
 * TTS providers are created dynamically based on configuration:
 * - default: Kokoro-FastAPI (CPU, always available)
 * - premium: Chatterbox (GPU, voice cloning)
 * - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU)
 *
 * Imports:
 * - ConfigModule.forFeature(speechConfig) for speech configuration
 * - AuthModule for WebSocket authentication
 * - PrismaModule for workspace membership queries
 *
 * Providers:
 * - SpeechService: High-level speech operations with provider selection
 * - SpeechGateway: WebSocket gateway for streaming transcription (Issue #397)
 * - TTS_PROVIDERS: Map<SpeechTier, ITTSProvider> populated by factory based on config
 *
 * Exports:
 * - SpeechService for use by other modules (e.g., controllers, brain)
 *
 * Issue #389, #390, #391, #397
 */

import { Module, type OnModuleInit, Logger } from "@nestjs/common";
import { ConfigModule, ConfigService } from "@nestjs/config";
import {
  speechConfig,
  validateSpeechConfig,
  isSttEnabled,
  type SpeechConfig,
} from "./speech.config";
import { SpeechService } from "./speech.service";
import { SpeechController } from "./speech.controller";
import { SpeechGateway } from "./speech.gateway";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { SpeachesSttProvider } from "./providers/speaches-stt.provider";
import { createTTSProviders } from "./providers/tts-provider.factory";
import { AuthModule } from "../auth/auth.module";
import { PrismaModule } from "../prisma/prisma.module";

@Module({
  imports: [ConfigModule.forFeature(speechConfig), AuthModule, PrismaModule],
  controllers: [SpeechController],
  providers: [
    SpeechService,
    SpeechGateway,
    // STT provider: conditionally register SpeachesSttProvider when STT is enabled
    ...(isSttEnabled()
      ? [
          {
            provide: STT_PROVIDER,
            useClass: SpeachesSttProvider,
          },
        ]
      : []),
    {
      provide: TTS_PROVIDERS,
      useFactory: (configService: ConfigService) => {
        const config = configService.get<SpeechConfig>("speech");
        if (!config) {
          return new Map();
        }
        return createTTSProviders(config);
      },
      inject: [ConfigService],
    },
  ],
  exports: [SpeechService],
})
export class SpeechModule implements OnModuleInit {
  private readonly logger = new Logger(SpeechModule.name);

  onModuleInit(): void {
    // Validate configuration at startup (fail fast)
    validateSpeechConfig();
    this.logger.log("Speech module initialized");
  }
}
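Since the module exports only SpeechService, a downstream consumer would look roughly like the sketch below. The BrainModule/BrainNarrator names are illustrative placeholders (the doc comment above only says "e.g., controllers, brain").

// Illustrative consumer; only SpeechModule and SpeechService are real here.
import { Module, Injectable } from "@nestjs/common";
import { SpeechModule } from "../speech/speech.module";
import { SpeechService } from "../speech/speech.service";

@Injectable()
class BrainNarrator {
  constructor(private readonly speech: SpeechService) {}

  async narrate(text: string): Promise<Buffer> {
    // Tier fallback happens inside SpeechService; see speech.service.ts later in this diff.
    const result = await this.speech.synthesize(text, { tier: "premium" });
    return result.audio;
  }
}

@Module({
  imports: [SpeechModule],
  providers: [BrainNarrator],
})
export class BrainModule {}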
541
apps/api/src/speech/speech.service.spec.ts
Normal file
@@ -0,0 +1,541 @@
/**
 * SpeechService Tests
 *
 * Issue #389: Tests for provider abstraction layer with fallback logic.
 * Written FIRST following TDD (Red-Green-Refactor).
 */

import { describe, it, expect, beforeEach, vi } from "vitest";
import { Test, TestingModule } from "@nestjs/testing";
import { ServiceUnavailableException } from "@nestjs/common";
import { SpeechService } from "./speech.service";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { speechConfig } from "./speech.config";
import type { ISTTProvider } from "./interfaces/stt-provider.interface";
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
import type {
  SpeechTier,
  TranscriptionResult,
  SynthesisResult,
  VoiceInfo,
} from "./interfaces/speech-types";

// ==========================================
// Mock provider factories
// ==========================================

function createMockSttProvider(overrides?: Partial<ISTTProvider>): ISTTProvider {
  return {
    name: "mock-stt",
    transcribe: vi.fn().mockResolvedValue({
      text: "Hello world",
      language: "en",
      durationSeconds: 2.5,
    } satisfies TranscriptionResult),
    isHealthy: vi.fn().mockResolvedValue(true),
    ...overrides,
  };
}

function createMockTtsProvider(tier: SpeechTier, overrides?: Partial<ITTSProvider>): ITTSProvider {
  return {
    name: `mock-tts-${tier}`,
    tier,
    synthesize: vi.fn().mockResolvedValue({
      audio: Buffer.from("fake-audio"),
      format: "mp3",
      voice: "test-voice",
      tier,
    } satisfies SynthesisResult),
    listVoices: vi
      .fn()
      .mockResolvedValue([
        { id: `${tier}-voice-1`, name: `${tier} Voice 1`, tier, isDefault: true },
      ] satisfies VoiceInfo[]),
    isHealthy: vi.fn().mockResolvedValue(true),
    ...overrides,
  };
}

// ==========================================
// Default config for tests
// ==========================================

function createTestConfig(): ReturnType<typeof speechConfig> {
  return {
    stt: {
      enabled: true,
      baseUrl: "http://localhost:8000/v1",
      model: "test-model",
      language: "en",
    },
    tts: {
      default: {
        enabled: true,
        url: "http://localhost:8880/v1",
        voice: "test-voice",
        format: "mp3",
      },
      premium: {
        enabled: true,
        url: "http://localhost:8881/v1",
      },
      fallback: {
        enabled: true,
        url: "http://localhost:8882/v1",
      },
    },
    limits: {
      maxUploadSize: 25_000_000,
      maxDurationSeconds: 600,
      maxTextLength: 4096,
    },
  } as ReturnType<typeof speechConfig>;
}

// ==========================================
// Test helper: create testing module
// ==========================================

async function createTestModule(options: {
  sttProvider?: ISTTProvider | null;
  ttsProviders?: Map<SpeechTier, ITTSProvider>;
  config?: ReturnType<typeof speechConfig>;
}): Promise<TestingModule> {
  const config = options.config ?? createTestConfig();
  const ttsProviders = options.ttsProviders ?? new Map<SpeechTier, ITTSProvider>();

  const providers: Array<{ provide: symbol | string; useValue: unknown }> = [
    { provide: speechConfig.KEY, useValue: config },
    { provide: TTS_PROVIDERS, useValue: ttsProviders },
  ];

  if (options.sttProvider !== undefined) {
    providers.push({ provide: STT_PROVIDER, useValue: options.sttProvider });
  }

  return Test.createTestingModule({
    providers: [SpeechService, ...providers],
  }).compile();
}

// ==========================================
// Tests
// ==========================================

describe("SpeechService", () => {
  // ==========================================
  // Construction and initialization
  // ==========================================
  describe("construction", () => {
    it("should be defined when all providers are injected", async () => {
      const module = await createTestModule({
        sttProvider: createMockSttProvider(),
        ttsProviders: new Map([["default", createMockTtsProvider("default")]]),
      });

      const service = module.get<SpeechService>(SpeechService);
      expect(service).toBeDefined();
    });

    it("should be defined with no STT provider", async () => {
      const module = await createTestModule({
        sttProvider: null,
        ttsProviders: new Map([["default", createMockTtsProvider("default")]]),
      });

      const service = module.get<SpeechService>(SpeechService);
      expect(service).toBeDefined();
    });

    it("should be defined with empty TTS providers map", async () => {
      const module = await createTestModule({
        sttProvider: createMockSttProvider(),
        ttsProviders: new Map(),
      });

      const service = module.get<SpeechService>(SpeechService);
      expect(service).toBeDefined();
    });
  });

  // ==========================================
  // transcribe()
  // ==========================================
  describe("transcribe", () => {
    let service: SpeechService;
    let mockStt: ISTTProvider;

    beforeEach(async () => {
      mockStt = createMockSttProvider();
      const module = await createTestModule({ sttProvider: mockStt });
      service = module.get<SpeechService>(SpeechService);
    });

    it("should delegate to the STT provider", async () => {
      const audio = Buffer.from("test-audio");
      const result = await service.transcribe(audio);

      expect(mockStt.transcribe).toHaveBeenCalledWith(audio, undefined);
      expect(result.text).toBe("Hello world");
      expect(result.language).toBe("en");
    });

    it("should pass options to the STT provider", async () => {
      const audio = Buffer.from("test-audio");
      const options = { language: "fr", model: "custom-model" };
      await service.transcribe(audio, options);

      expect(mockStt.transcribe).toHaveBeenCalledWith(audio, options);
    });

    it("should throw ServiceUnavailableException when STT is disabled in config", async () => {
      const config = createTestConfig();
      config.stt.enabled = false;
      const module = await createTestModule({ sttProvider: mockStt, config });
      service = module.get<SpeechService>(SpeechService);

      await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow(
        ServiceUnavailableException
      );
    });

    it("should throw ServiceUnavailableException when no STT provider is registered", async () => {
      const module = await createTestModule({ sttProvider: null });
      service = module.get<SpeechService>(SpeechService);

      await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow(
        ServiceUnavailableException
      );
    });

    it("should propagate provider errors as ServiceUnavailableException", async () => {
      const failingStt = createMockSttProvider({
        transcribe: vi.fn().mockRejectedValue(new Error("Connection refused")),
      });
      const module = await createTestModule({ sttProvider: failingStt });
      service = module.get<SpeechService>(SpeechService);

      await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow(
        ServiceUnavailableException
      );
    });
  });

  // ==========================================
  // synthesize()
  // ==========================================
  describe("synthesize", () => {
    let service: SpeechService;
    let defaultProvider: ITTSProvider;
    let premiumProvider: ITTSProvider;
    let fallbackProvider: ITTSProvider;

    beforeEach(async () => {
      defaultProvider = createMockTtsProvider("default");
      premiumProvider = createMockTtsProvider("premium");
      fallbackProvider = createMockTtsProvider("fallback");

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["default", defaultProvider],
        ["premium", premiumProvider],
        ["fallback", fallbackProvider],
      ]);

      const module = await createTestModule({ ttsProviders });
      service = module.get<SpeechService>(SpeechService);
    });

    it("should use the default tier when no tier is specified", async () => {
      const result = await service.synthesize("Hello world");

      expect(defaultProvider.synthesize).toHaveBeenCalledWith("Hello world", undefined);
      expect(result.tier).toBe("default");
    });

    it("should use the requested tier when specified", async () => {
      const result = await service.synthesize("Hello world", { tier: "premium" });

      expect(premiumProvider.synthesize).toHaveBeenCalled();
      expect(result.tier).toBe("premium");
    });

    it("should pass options to the TTS provider", async () => {
      const options = { voice: "custom-voice", format: "wav" as const };
      await service.synthesize("Hello", options);

      expect(defaultProvider.synthesize).toHaveBeenCalledWith("Hello", options);
    });

    it("should throw ServiceUnavailableException when TTS default is disabled and no tier specified", async () => {
      const config = createTestConfig();
      config.tts.default.enabled = false;
      config.tts.premium.enabled = false;
      config.tts.fallback.enabled = false;
      const module = await createTestModule({
        ttsProviders: new Map([["default", defaultProvider]]),
        config,
      });
      service = module.get<SpeechService>(SpeechService);

      await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException);
    });

    it("should throw ServiceUnavailableException when no TTS providers are registered", async () => {
      const module = await createTestModule({ ttsProviders: new Map() });
      service = module.get<SpeechService>(SpeechService);

      await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException);
    });
  });

  // ==========================================
  // synthesize() fallback logic
  // ==========================================
  describe("synthesize fallback", () => {
    it("should fall back from premium to default when premium provider fails", async () => {
      const failingPremium = createMockTtsProvider("premium", {
        synthesize: vi.fn().mockRejectedValue(new Error("Premium unavailable")),
      });
      const defaultProvider = createMockTtsProvider("default");

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["premium", failingPremium],
        ["default", defaultProvider],
      ]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      const result = await service.synthesize("Hello", { tier: "premium" });

      expect(failingPremium.synthesize).toHaveBeenCalled();
      expect(defaultProvider.synthesize).toHaveBeenCalled();
      expect(result.tier).toBe("default");
    });

    it("should fall back from default to fallback when default provider fails", async () => {
      const failingDefault = createMockTtsProvider("default", {
        synthesize: vi.fn().mockRejectedValue(new Error("Default unavailable")),
      });
      const fallbackProvider = createMockTtsProvider("fallback");

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["default", failingDefault],
        ["fallback", fallbackProvider],
      ]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      const result = await service.synthesize("Hello");

      expect(failingDefault.synthesize).toHaveBeenCalled();
      expect(fallbackProvider.synthesize).toHaveBeenCalled();
      expect(result.tier).toBe("fallback");
    });

    it("should fall back premium -> default -> fallback", async () => {
      const failingPremium = createMockTtsProvider("premium", {
        synthesize: vi.fn().mockRejectedValue(new Error("Premium fail")),
      });
      const failingDefault = createMockTtsProvider("default", {
        synthesize: vi.fn().mockRejectedValue(new Error("Default fail")),
      });
      const fallbackProvider = createMockTtsProvider("fallback");

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["premium", failingPremium],
        ["default", failingDefault],
        ["fallback", fallbackProvider],
      ]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      const result = await service.synthesize("Hello", { tier: "premium" });

      expect(failingPremium.synthesize).toHaveBeenCalled();
      expect(failingDefault.synthesize).toHaveBeenCalled();
      expect(fallbackProvider.synthesize).toHaveBeenCalled();
      expect(result.tier).toBe("fallback");
    });

    it("should throw ServiceUnavailableException when all tiers fail", async () => {
      const failingDefault = createMockTtsProvider("default", {
        synthesize: vi.fn().mockRejectedValue(new Error("Default fail")),
      });
      const failingFallback = createMockTtsProvider("fallback", {
        synthesize: vi.fn().mockRejectedValue(new Error("Fallback fail")),
      });

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["default", failingDefault],
        ["fallback", failingFallback],
      ]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException);
    });

    it("should skip unavailable tiers in fallback chain", async () => {
      // premium requested, but only fallback registered (no default)
      const failingPremium = createMockTtsProvider("premium", {
        synthesize: vi.fn().mockRejectedValue(new Error("Premium fail")),
      });
      const fallbackProvider = createMockTtsProvider("fallback");

      const config = createTestConfig();
      config.tts.default.enabled = false;

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["premium", failingPremium],
        ["fallback", fallbackProvider],
      ]);

      const module = await createTestModule({ ttsProviders, config });
      const service = module.get<SpeechService>(SpeechService);

      const result = await service.synthesize("Hello", { tier: "premium" });
      expect(result.tier).toBe("fallback");
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================
  describe("listVoices", () => {
    it("should aggregate voices from all registered TTS providers", async () => {
      const defaultProvider = createMockTtsProvider("default", {
        listVoices: vi.fn().mockResolvedValue([
          { id: "voice-1", name: "Voice 1", tier: "default" as SpeechTier, isDefault: true },
          { id: "voice-2", name: "Voice 2", tier: "default" as SpeechTier },
        ]),
      });
      const premiumProvider = createMockTtsProvider("premium", {
        listVoices: vi
          .fn()
          .mockResolvedValue([
            { id: "voice-3", name: "Voice 3", tier: "premium" as SpeechTier, isDefault: true },
          ]),
      });

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["default", defaultProvider],
        ["premium", premiumProvider],
      ]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      const voices = await service.listVoices();

      expect(voices).toHaveLength(3);
      expect(voices.map((v) => v.id)).toEqual(["voice-1", "voice-2", "voice-3"]);
    });

    it("should filter voices by tier when specified", async () => {
      const defaultProvider = createMockTtsProvider("default", {
        listVoices: vi
          .fn()
          .mockResolvedValue([{ id: "voice-1", name: "Voice 1", tier: "default" as SpeechTier }]),
      });
      const premiumProvider = createMockTtsProvider("premium", {
        listVoices: vi
          .fn()
          .mockResolvedValue([{ id: "voice-2", name: "Voice 2", tier: "premium" as SpeechTier }]),
      });

      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["default", defaultProvider],
        ["premium", premiumProvider],
      ]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      const voices = await service.listVoices("premium");

      expect(voices).toHaveLength(1);
      expect(voices[0].id).toBe("voice-2");
      // Only the premium provider should have been called
      expect(premiumProvider.listVoices).toHaveBeenCalled();
      expect(defaultProvider.listVoices).not.toHaveBeenCalled();
    });

    it("should return empty array when no TTS providers are registered", async () => {
      const module = await createTestModule({ ttsProviders: new Map() });
      const service = module.get<SpeechService>(SpeechService);

      const voices = await service.listVoices();
      expect(voices).toEqual([]);
    });

    it("should return empty array when requested tier has no provider", async () => {
      const defaultProvider = createMockTtsProvider("default");
      const ttsProviders = new Map<SpeechTier, ITTSProvider>([["default", defaultProvider]]);

      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      const voices = await service.listVoices("premium");
      expect(voices).toEqual([]);
    });
  });

  // ==========================================
  // isSTTAvailable / isTTSAvailable
  // ==========================================
  describe("availability checks", () => {
    it("should report STT as available when enabled and provider registered", async () => {
      const module = await createTestModule({
        sttProvider: createMockSttProvider(),
      });
      const service = module.get<SpeechService>(SpeechService);

      expect(service.isSTTAvailable()).toBe(true);
    });

    it("should report STT as unavailable when disabled in config", async () => {
      const config = createTestConfig();
      config.stt.enabled = false;
      const module = await createTestModule({
        sttProvider: createMockSttProvider(),
        config,
      });
      const service = module.get<SpeechService>(SpeechService);

      expect(service.isSTTAvailable()).toBe(false);
    });

    it("should report STT as unavailable when no provider registered", async () => {
      const module = await createTestModule({ sttProvider: null });
      const service = module.get<SpeechService>(SpeechService);

      expect(service.isSTTAvailable()).toBe(false);
    });

    it("should report TTS as available when at least one tier is enabled with a provider", async () => {
      const ttsProviders = new Map<SpeechTier, ITTSProvider>([
        ["default", createMockTtsProvider("default")],
      ]);
      const module = await createTestModule({ ttsProviders });
      const service = module.get<SpeechService>(SpeechService);

      expect(service.isTTSAvailable()).toBe(true);
    });

    it("should report TTS as unavailable when no providers registered", async () => {
      const config = createTestConfig();
      config.tts.default.enabled = false;
      config.tts.premium.enabled = false;
      config.tts.fallback.enabled = false;
      const module = await createTestModule({ ttsProviders: new Map(), config });
      const service = module.get<SpeechService>(SpeechService);

      expect(service.isTTSAvailable()).toBe(false);
    });
  });
});
231
apps/api/src/speech/speech.service.ts
Normal file
@@ -0,0 +1,231 @@
/**
 * SpeechService
 *
 * High-level service for speech-to-text (STT) and text-to-speech (TTS) operations.
 * Manages provider selection and graceful fallback for TTS tiers.
 *
 * Fallback chain for TTS: premium -> default -> fallback
 * Each tier is only attempted if enabled in config and a provider is registered.
 *
 * Issue #389
 */

import { Injectable, Inject, Optional, Logger, ServiceUnavailableException } from "@nestjs/common";
import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants";
import { speechConfig, type SpeechConfig } from "./speech.config";
import type { ISTTProvider } from "./interfaces/stt-provider.interface";
import type { ITTSProvider } from "./interfaces/tts-provider.interface";
import type {
  SpeechTier,
  TranscribeOptions,
  TranscriptionResult,
  SynthesizeOptions,
  SynthesisResult,
  VoiceInfo,
} from "./interfaces/speech-types";

/**
 * Fallback order for TTS tiers.
 * When a tier fails, the next tier in this array is attempted.
 */
const TTS_FALLBACK_ORDER: readonly SpeechTier[] = ["premium", "default", "fallback"] as const;

@Injectable()
export class SpeechService {
  private readonly logger = new Logger(SpeechService.name);

  constructor(
    @Inject(speechConfig.KEY)
    private readonly config: SpeechConfig,

    @Optional()
    @Inject(STT_PROVIDER)
    private readonly sttProvider: ISTTProvider | null,

    @Inject(TTS_PROVIDERS)
    private readonly ttsProviders: Map<SpeechTier, ITTSProvider>
  ) {
    this.logger.log("Speech service initialized");

    if (this.sttProvider) {
      this.logger.log(`STT provider registered: ${this.sttProvider.name}`);
    }

    if (this.ttsProviders.size > 0) {
      const tierNames = Array.from(this.ttsProviders.keys()).join(", ");
      this.logger.log(`TTS providers registered: ${tierNames}`);
    }
  }

  // ==========================================
  // STT Operations
  // ==========================================

  /**
   * Transcribe audio data to text using the registered STT provider.
   *
   * @param audio - Raw audio data as a Buffer
   * @param options - Optional transcription parameters
   * @returns Transcription result with text and metadata
   * @throws {ServiceUnavailableException} If STT is disabled or no provider is registered
   */
  async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
    if (!this.config.stt.enabled) {
      throw new ServiceUnavailableException("Speech-to-text is not enabled");
    }

    if (!this.sttProvider) {
      throw new ServiceUnavailableException("No STT provider is registered");
    }

    try {
      return await this.sttProvider.transcribe(audio, options);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      this.logger.error(`STT transcription failed: ${message}`);
      throw new ServiceUnavailableException(`Transcription failed: ${message}`);
    }
  }

  // ==========================================
  // TTS Operations
  // ==========================================

  /**
   * Synthesize text to audio using TTS providers with graceful fallback.
   *
   * Fallback chain: requested tier -> default -> fallback.
   * Only enabled tiers with registered providers are attempted.
   *
   * @param text - Text to convert to speech
   * @param options - Optional synthesis parameters (voice, format, tier)
   * @returns Synthesis result with audio buffer and metadata
   * @throws {ServiceUnavailableException} If no TTS provider can fulfill the request
   */
  async synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult> {
    const requestedTier = options?.tier ?? "default";
    const fallbackChain = this.buildFallbackChain(requestedTier);

    if (fallbackChain.length === 0) {
      throw new ServiceUnavailableException(
        "No TTS providers are available. Check that TTS is enabled and providers are registered."
      );
    }

    let lastError: Error | undefined;

    for (const tier of fallbackChain) {
      const provider = this.ttsProviders.get(tier);
      if (!provider) {
        continue;
      }

      try {
        return await provider.synthesize(text, options);
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        this.logger.warn(`TTS tier "${tier}" (${provider.name}) failed: ${message}`);
        lastError = error instanceof Error ? error : new Error(message);
      }
    }

    const errorMessage = lastError?.message ?? "No providers available";
    throw new ServiceUnavailableException(`All TTS providers failed: ${errorMessage}`);
  }

  /**
   * List available voices across all TTS providers, optionally filtered by tier.
   *
   * @param tier - Optional tier filter. If omitted, voices from all tiers are returned.
   * @returns Array of voice information objects
   */
  async listVoices(tier?: SpeechTier): Promise<VoiceInfo[]> {
    const voices: VoiceInfo[] = [];

    if (tier) {
      const provider = this.ttsProviders.get(tier);
      if (!provider) {
        return [];
      }

      try {
        return await provider.listVoices();
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        this.logger.warn(`Failed to list voices for tier "${tier}": ${message}`);
        return [];
      }
    }

    // Aggregate voices from all providers
    for (const [providerTier, provider] of this.ttsProviders) {
      try {
        const tierVoices = await provider.listVoices();
        voices.push(...tierVoices);
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        this.logger.warn(`Failed to list voices for tier "${providerTier}": ${message}`);
      }
    }

    return voices;
  }

  // ==========================================
  // Availability Checks
  // ==========================================

  /**
   * Check if STT is available (enabled in config and provider registered).
   */
  isSTTAvailable(): boolean {
    return this.config.stt.enabled && this.sttProvider !== null;
  }

  /**
   * Check if TTS is available (at least one tier enabled with a registered provider).
   */
  isTTSAvailable(): boolean {
    return this.getEnabledTiers().some((tier) => this.ttsProviders.has(tier));
  }

  // ==========================================
  // Private helpers
  // ==========================================

  /**
   * Build the fallback chain starting from the requested tier.
   * Only includes tiers that are enabled in config and have a registered provider.
   */
  private buildFallbackChain(requestedTier: SpeechTier): SpeechTier[] {
    const startIndex = TTS_FALLBACK_ORDER.indexOf(requestedTier);
    if (startIndex === -1) {
      return [];
    }

    const enabledTiers = this.getEnabledTiers();

    return TTS_FALLBACK_ORDER.slice(startIndex).filter(
      (tier) => enabledTiers.includes(tier) && this.ttsProviders.has(tier)
    );
  }

  /**
   * Get the list of TTS tiers that are enabled in the configuration.
   */
  private getEnabledTiers(): SpeechTier[] {
    const tiers: SpeechTier[] = [];

    if (this.config.tts.default.enabled) {
      tiers.push("default");
    }
    if (this.config.tts.premium.enabled) {
      tiers.push("premium");
    }
    if (this.config.tts.fallback.enabled) {
      tiers.push("fallback");
    }

    return tiers;
  }
}
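To make the fallback contract concrete, here is a caller-side sketch. Only synthesize(), its options, and the result shape come from the service above; the surrounding function and logging are illustrative.

// Sketch: callers never pick the surviving tier themselves; they read it off
// the result. ServiceUnavailableException only surfaces once every enabled
// tier in the chain has failed (or the chain is empty).
import { SpeechService } from "./speech.service";

async function speakOrExplain(speech: SpeechService, text: string): Promise<void> {
  try {
    const result = await speech.synthesize(text, { tier: "premium", format: "mp3" });
    if (result.tier !== "premium") {
      // Premium failed; a lower tier answered instead.
      console.warn(`premium TTS degraded to "${result.tier}"`);
    }
  } catch (error) {
    // premium -> default -> fallback all failed, or none were enabled.
    console.error("TTS unavailable:", error);
  }
}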
178
apps/web/src/components/speech/AudioPlayer.test.tsx
Normal file
@@ -0,0 +1,178 @@
/**
 * @file AudioPlayer.test.tsx
 * @description Tests for the AudioPlayer component that provides inline TTS audio playback
 */

import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { AudioPlayer } from "./AudioPlayer";

// Mock HTMLAudioElement
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 60;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;

  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }

  pause(): void {
    this.paused = true;
  }

  addEventListener(event: string, handler: () => void): void {
    if (event === "ended") this.onended = handler;
    if (event === "timeupdate") this.ontimeupdate = handler;
    if (event === "loadedmetadata") this.onloadedmetadata = handler;
    if (event === "error") this.onerror = handler;
  }

  removeEventListener(): void {
    // no-op for tests
  }
}

vi.stubGlobal("Audio", MockAudio);

describe("AudioPlayer", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  describe("rendering", () => {
    it("should render play button", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      const playButton = screen.getByRole("button", { name: "Play audio" });
      expect(playButton).toBeInTheDocument();
    });

    it("should render download button", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      const downloadButton = screen.getByRole("button", { name: /download/i });
      expect(downloadButton).toBeInTheDocument();
    });

    it("should render time display showing 0:00", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      expect(screen.getByText("0:00")).toBeInTheDocument();
    });

    it("should render speed control", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      const speedButton = screen.getByRole("button", { name: "Playback speed" });
      expect(speedButton).toBeInTheDocument();
    });

    it("should render progress bar", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      const progressBar = screen.getByRole("progressbar");
      expect(progressBar).toBeInTheDocument();
    });

    it("should not render when src is null", () => {
      const { container } = render(<AudioPlayer src={null} />);

      expect(container.firstChild).toBeNull();
    });
  });

  describe("play/pause", () => {
    it("should toggle to pause button when playing", async () => {
      const user = userEvent.setup();
      render(<AudioPlayer src="blob:test-audio" />);

      const playButton = screen.getByRole("button", { name: "Play audio" });
      await user.click(playButton);

      expect(screen.getByRole("button", { name: "Pause audio" })).toBeInTheDocument();
    });
  });

  describe("speed control", () => {
    it("should cycle through speed options on click", async () => {
      const user = userEvent.setup();
      render(<AudioPlayer src="blob:test-audio" />);

      const speedButton = screen.getByRole("button", { name: "Playback speed" });

      // Default should be 1x
      expect(speedButton).toHaveTextContent("1x");

      // Click to go to 1.5x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("1.5x");

      // Click to go to 2x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("2x");

      // Click to go to 0.5x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("0.5x");

      // Click to go back to 1x
      await user.click(speedButton);
      expect(speedButton).toHaveTextContent("1x");
    });
  });

  describe("accessibility", () => {
    it("should have proper aria labels on controls", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      expect(screen.getByRole("button", { name: "Play audio" })).toBeInTheDocument();
      expect(screen.getByRole("button", { name: /download/i })).toBeInTheDocument();
      expect(screen.getByRole("button", { name: "Playback speed" })).toBeInTheDocument();
      expect(screen.getByRole("progressbar")).toHaveAttribute("aria-label");
    });

    it("should have region role on the player container", () => {
      render(<AudioPlayer src="blob:test-audio" />);

      expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument();
    });
  });

  describe("design", () => {
    it("should not use aggressive red colors", () => {
      const { container } = render(<AudioPlayer src="blob:test-audio" />);

      const allElements = container.querySelectorAll("*");
      allElements.forEach((el) => {
        const className = el.className;
        if (typeof className === "string") {
          expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
        }
      });
    });
  });

  describe("callbacks", () => {
    it("should call onPlayStateChange when play state changes", async () => {
      const onPlayStateChange = vi.fn();
      const user = userEvent.setup();

      render(<AudioPlayer src="blob:test-audio" onPlayStateChange={onPlayStateChange} />);

      const playButton = screen.getByRole("button", { name: "Play audio" });
      await user.click(playButton);

      expect(onPlayStateChange).toHaveBeenCalledWith(true);
    });
  });
});
250
apps/web/src/components/speech/AudioPlayer.tsx
Normal file
@@ -0,0 +1,250 @@
|
||||
/**
|
||||
* AudioPlayer Component
|
||||
* Inline audio player for TTS content with play/pause, progress,
|
||||
* speed control, download, and duration display.
|
||||
*
|
||||
* Follows PDA-friendly design: no aggressive colors, calm interface.
|
||||
*/
|
||||
|
||||
import { useState, useRef, useEffect, useCallback } from "react";
|
||||
import type { ReactElement } from "react";
|
||||
|
||||
/** Playback speed options */
|
||||
const SPEED_OPTIONS = [1, 1.5, 2, 0.5] as const;
|
||||
|
||||
export interface AudioPlayerProps {
|
||||
/** URL of the audio to play (blob URL or HTTP URL). If null, nothing renders. */
|
||||
src: string | null;
|
||||
/** Whether to auto-play when src changes */
|
||||
autoPlay?: boolean;
|
||||
/** Callback when play state changes */
|
||||
onPlayStateChange?: (isPlaying: boolean) => void;
|
||||
/** Optional className for the container */
|
||||
className?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Format seconds into M:SS display
|
||||
*/
|
||||
function formatTime(seconds: number): string {
|
||||
if (!isFinite(seconds) || seconds < 0) return "0:00";
|
||||
const mins = Math.floor(seconds / 60);
|
||||
const secs = Math.floor(seconds % 60);
|
||||
return `${String(mins)}:${String(secs).padStart(2, "0")}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* AudioPlayer displays an inline audio player with controls for
|
||||
* play/pause, progress tracking, speed adjustment, and download.
|
||||
*/
|
||||
export function AudioPlayer({
|
||||
src,
|
||||
autoPlay = false,
|
||||
onPlayStateChange,
|
||||
className = "",
|
||||
}: AudioPlayerProps): ReactElement | null {
|
||||
const [isPlaying, setIsPlaying] = useState(false);
|
||||
const [currentTime, setCurrentTime] = useState(0);
|
||||
const [duration, setDuration] = useState(0);
|
||||
const [speedIndex, setSpeedIndex] = useState(0);
|
||||
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null);
|
||||
|
||||
/**
|
||||
* Set up audio element when src changes
|
||||
*/
|
||||
useEffect((): (() => void) | undefined => {
|
||||
if (!src) return undefined;
|
||||
|
||||
const audio = new Audio(src);
|
||||
audioRef.current = audio;
|
||||
|
||||
const onLoadedMetadata = (): void => {
|
||||
if (isFinite(audio.duration)) {
|
||||
setDuration(audio.duration);
|
||||
}
|
||||
};
|
||||
|
||||
const onTimeUpdate = (): void => {
|
||||
setCurrentTime(audio.currentTime);
|
||||
};
|
||||
|
||||
const onEnded = (): void => {
|
||||
setIsPlaying(false);
|
||||
setCurrentTime(0);
|
||||
onPlayStateChange?.(false);
|
||||
};
|
||||
|
||||
audio.addEventListener("loadedmetadata", onLoadedMetadata);
|
||||
audio.addEventListener("timeupdate", onTimeUpdate);
|
||||
audio.addEventListener("ended", onEnded);
|
||||
|
||||
if (autoPlay) {
|
||||
void audio.play().then(() => {
|
||||
setIsPlaying(true);
|
||||
onPlayStateChange?.(true);
|
||||
});
|
||||
}
|
||||
|
||||
return (): void => {
|
||||
audio.pause();
|
||||
audio.removeEventListener("loadedmetadata", onLoadedMetadata);
|
||||
audio.removeEventListener("timeupdate", onTimeUpdate);
|
||||
audio.removeEventListener("ended", onEnded);
|
||||
audioRef.current = null;
|
||||
};
|
||||
}, [src, autoPlay, onPlayStateChange]);
|
||||
|
||||
/**
|
||||
* Toggle play/pause
|
||||
*/
|
||||
const togglePlayPause = useCallback(async (): Promise<void> => {
|
||||
const audio = audioRef.current;
|
||||
if (!audio) return;
|
||||
|
    if (isPlaying) {
      audio.pause();
      setIsPlaying(false);
      onPlayStateChange?.(false);
    } else {
      await audio.play();
      setIsPlaying(true);
      onPlayStateChange?.(true);
    }
  }, [isPlaying, onPlayStateChange]);

  /**
   * Cycle through speed options
   */
  const cycleSpeed = useCallback((): void => {
    const nextIndex = (speedIndex + 1) % SPEED_OPTIONS.length;
    setSpeedIndex(nextIndex);

    const audio = audioRef.current;
    if (audio) {
      audio.playbackRate = SPEED_OPTIONS[nextIndex] ?? 1;
    }
  }, [speedIndex]);

  /**
   * Handle progress bar click for seeking
   */
  const handleProgressClick = useCallback(
    (event: React.MouseEvent<HTMLDivElement>): void => {
      const audio = audioRef.current;
      if (!audio || !duration) return;

      const rect = event.currentTarget.getBoundingClientRect();
      const clickX = event.clientX - rect.left;
      const fraction = clickX / rect.width;
      audio.currentTime = fraction * duration;
      setCurrentTime(audio.currentTime);
    },
    [duration]
  );

  /**
   * Handle download
   */
  const handleDownload = useCallback((): void => {
    if (!src) return;

    const link = document.createElement("a");
    link.href = src;
    link.download = "speech-audio.mp3";
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
  }, [src]);

  // Don't render if no source
  if (!src) return null;

  const progress = duration > 0 ? (currentTime / duration) * 100 : 0;
  const currentSpeed = SPEED_OPTIONS[speedIndex] ?? 1;

  return (
    <div
      role="region"
      aria-label="Audio player"
      className={`flex items-center gap-2 rounded-lg border border-gray-200 bg-gray-50 px-3 py-2 ${className}`}
    >
      {/* Play/Pause Button */}
      <button
        type="button"
        onClick={() => void togglePlayPause()}
        aria-label={isPlaying ? "Pause audio" : "Play audio"}
        className="flex h-8 w-8 shrink-0 items-center justify-center rounded-full bg-blue-500 text-white transition-colors hover:bg-blue-600 focus:outline-none focus:ring-2 focus:ring-blue-300"
      >
        {isPlaying ? (
          <svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
            <rect x="6" y="4" width="4" height="16" rx="1" />
            <rect x="14" y="4" width="4" height="16" rx="1" />
          </svg>
        ) : (
          <svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
            <polygon points="6,4 20,12 6,20" />
          </svg>
        )}
      </button>

      {/* Time Display */}
      <span className="min-w-[3.5rem] text-xs text-gray-500 tabular-nums">
        {formatTime(currentTime)}
        {duration > 0 && <span className="text-gray-400"> / {formatTime(duration)}</span>}
      </span>

      {/* Progress Bar */}
      <div
        role="progressbar"
        aria-label="Audio progress"
        aria-valuenow={Math.round(progress)}
        aria-valuemin={0}
        aria-valuemax={100}
        className="relative h-1.5 flex-1 cursor-pointer rounded-full bg-gray-200"
        onClick={handleProgressClick}
      >
        <div
          className="absolute left-0 top-0 h-full rounded-full bg-blue-400 transition-all"
          style={{ width: `${String(Math.min(progress, 100))}%` }}
        />
      </div>

      {/* Speed Control */}
      <button
        type="button"
        onClick={cycleSpeed}
        aria-label="Playback speed"
        className="min-w-[2.5rem] rounded px-1.5 py-0.5 text-xs font-medium text-gray-600 transition-colors hover:bg-gray-200 focus:outline-none focus:ring-2 focus:ring-blue-300"
      >
        {String(currentSpeed)}x
      </button>

      {/* Download Button */}
      <button
        type="button"
        onClick={handleDownload}
        aria-label="Download audio"
        className="flex h-7 w-7 shrink-0 items-center justify-center rounded text-gray-500 transition-colors hover:bg-gray-200 hover:text-gray-700 focus:outline-none focus:ring-2 focus:ring-blue-300"
      >
        <svg
          width="14"
          height="14"
          viewBox="0 0 24 24"
          fill="none"
          stroke="currentColor"
          strokeWidth="2"
          strokeLinecap="round"
          strokeLinejoin="round"
          aria-hidden="true"
        >
          <path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4" />
          <polyline points="7 10 12 15 17 10" />
          <line x1="12" y1="15" x2="12" y2="3" />
        </svg>
      </button>
    </div>
  );
}

export default AudioPlayer;
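A minimal usage sketch for the component above (the MessageAudio wrapper is hypothetical; src and onPlayStateChange are the component's real props):

import { AudioPlayer } from "@/components/speech";

function MessageAudio({ url }: { url: string }): React.JSX.Element {
  // The player renders nothing until a source is available
  return <AudioPlayer src={url} onPlayStateChange={(playing) => console.log("playing:", playing)} />;
}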
70
apps/web/src/components/speech/AudioVisualizer.test.tsx
Normal file
@@ -0,0 +1,70 @@
import { describe, it, expect } from "vitest";
import { render, screen } from "@testing-library/react";
import { AudioVisualizer } from "./AudioVisualizer";

describe("AudioVisualizer", (): void => {
  it("should render the visualizer container", (): void => {
    render(<AudioVisualizer audioLevel={0} isActive={false} />);

    const container = screen.getByTestId("audio-visualizer");
    expect(container).toBeInTheDocument();
  });

  it("should render visualization bars", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} />);

    const bars = screen.getAllByTestId("visualizer-bar");
    expect(bars.length).toBeGreaterThan(0);
  });

  it("should show inactive state when not active", (): void => {
    render(<AudioVisualizer audioLevel={0} isActive={false} />);

    const container = screen.getByTestId("audio-visualizer");
    expect(container).toBeInTheDocument();
    // Bars should be at minimum height when inactive
    const bars = screen.getAllByTestId("visualizer-bar");
    bars.forEach((bar) => {
      const style = bar.getAttribute("style");
      expect(style).toContain("height");
    });
  });

  it("should reflect audio level in bar heights when active", (): void => {
    render(<AudioVisualizer audioLevel={0.8} isActive={true} />);

    const bars = screen.getAllByTestId("visualizer-bar");
    // At least one bar should have non-minimal height
    const hasActiveBars = bars.some((bar) => {
      const style = bar.getAttribute("style") ?? "";
      const heightMatch = /height:\s*(\d+)/.exec(style);
      return heightMatch?.[1] ? parseInt(heightMatch[1], 10) > 4 : false;
    });
    expect(hasActiveBars).toBe(true);
  });

  it("should use calm colors (no aggressive reds)", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} />);

    const container = screen.getByTestId("audio-visualizer");
    const allElements = container.querySelectorAll("*");
    allElements.forEach((el) => {
      const className = (el as HTMLElement).className;
      expect(className).not.toMatch(/bg-red-|text-red-/);
    });
  });

  it("should accept custom className", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} className="custom-class" />);

    const container = screen.getByTestId("audio-visualizer");
    expect(container.className).toContain("custom-class");
  });

  it("should render with configurable bar count", (): void => {
    render(<AudioVisualizer audioLevel={0.5} isActive={true} barCount={8} />);

    const bars = screen.getAllByTestId("visualizer-bar");
    expect(bars).toHaveLength(8);
  });
});
87
apps/web/src/components/speech/AudioVisualizer.tsx
Normal file
@@ -0,0 +1,87 @@
/**
 * AudioVisualizer component
 *
 * Displays a simple audio level visualization using bars.
 * Uses the Web Audio API's AnalyserNode data (passed as audioLevel)
 * to show microphone input levels during recording.
 *
 * Design: Calm, non-aggressive colors following PDA-friendly guidelines.
 */

import { useMemo } from "react";

export interface AudioVisualizerProps {
  /** Current audio level (0-1) */
  audioLevel: number;
  /** Whether the visualizer is actively listening */
  isActive: boolean;
  /** Number of bars to display (default: 5) */
  barCount?: number;
  /** Additional CSS classes */
  className?: string;
}

/**
 * Generate bar heights based on audio level.
 * Creates a natural-looking wave pattern where center bars are taller.
 */
function generateBarHeights(level: number, count: number): number[] {
  const heights: number[] = [];
  const center = (count - 1) / 2;

  for (let i = 0; i < count; i++) {
    // Distance from center (0-1); guard the divide so a single bar (count = 1) doesn't produce NaN
    const distFromCenter = center > 0 ? Math.abs(i - center) / center : 0;
    // Center bars are taller, edge bars shorter
    const multiplier = 1 - distFromCenter * 0.5;
    // Min height 4px, max height 24px when active
    const minHeight = 4;
    const maxHeight = 24;
    const height = minHeight + level * (maxHeight - minHeight) * multiplier;
    heights.push(Math.round(height));
  }

  return heights;
}

/**
 * Audio level visualizer with animated bars.
 * Shows microphone input levels during voice recording.
 */
export function AudioVisualizer({
  audioLevel,
  isActive,
  barCount = 5,
  className = "",
}: AudioVisualizerProps): React.JSX.Element {
  const barHeights = useMemo(() => {
    if (!isActive) {
      return Array.from({ length: barCount }, () => 4);
    }
    return generateBarHeights(audioLevel, barCount);
  }, [audioLevel, isActive, barCount]);

  return (
    <div
      data-testid="audio-visualizer"
      className={`flex items-center gap-0.5 ${className}`}
      role="img"
      aria-label={
        isActive
          ? `Audio level: ${String(Math.round(audioLevel * 100))}%`
          : "Audio visualizer inactive"
      }
    >
      {barHeights.map((height, index) => (
        <div
          key={index}
          data-testid="visualizer-bar"
          className={`w-1 rounded-full transition-all duration-150 ease-out ${
            isActive ? "bg-sky-400" : "bg-slate-300 dark:bg-slate-600"
          }`}
          style={{ height: `${height.toString()}px` }}
        />
      ))}
    </div>
  );
}
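The visualizer only consumes a normalized 0-1 level rather than touching the Web Audio API itself. A sketch of how a caller might derive that level from an AnalyserNode (illustrative; in this PR the actual wiring lives in useVoiceInput):

// Hypothetical helper: mean absolute deviation of time-domain samples, normalized to 0-1.
function readAudioLevel(analyser: AnalyserNode): number {
  const data = new Uint8Array(analyser.fftSize);
  analyser.getByteTimeDomainData(data); // bytes centered on 128 (silence)
  let sum = 0;
  for (const sample of data) {
    sum += Math.abs(sample - 128) / 128;
  }
  return Math.min(1, sum / data.length);
}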
439
apps/web/src/components/speech/SpeechSettings.test.tsx
Normal file
@@ -0,0 +1,439 @@
/**
 * @file SpeechSettings.test.tsx
 * @description Tests for the SpeechSettings component
 *
 * Validates all settings sections: STT, TTS, Voice Preview, Provider Status.
 * Follows TDD: tests written before implementation.
 *
 * Issue #404
 */

import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen, waitFor, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { SpeechSettings } from "./SpeechSettings";

// Mock the speech API
const mockGetVoices = vi.fn();
const mockGetHealthStatus = vi.fn();
const mockSynthesizeSpeech = vi.fn();

vi.mock("@/lib/api/speech", () => ({
  getVoices: (...args: unknown[]): unknown => mockGetVoices(...args) as unknown,
  getHealthStatus: (...args: unknown[]): unknown => mockGetHealthStatus(...args) as unknown,
  synthesizeSpeech: (...args: unknown[]): unknown => mockSynthesizeSpeech(...args) as unknown,
}));

// Mock the useTextToSpeech hook for voice preview
const mockSynthesize = vi.fn();

vi.mock("@/hooks/useTextToSpeech", () => ({
  useTextToSpeech: vi.fn(() => ({
    synthesize: mockSynthesize,
    audioUrl: null,
    isLoading: false,
    error: null,
    play: vi.fn(),
    pause: vi.fn(),
    stop: vi.fn(),
    isPlaying: false,
    duration: 0,
    currentTime: 0,
  })),
}));

// Mock HTMLAudioElement for AudioPlayer used inside preview
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 60;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;

  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }

  pause(): void {
    this.paused = true;
  }

  addEventListener(): void {
    // no-op
  }

  removeEventListener(): void {
    // no-op
  }
}

vi.stubGlobal("Audio", MockAudio);

// Default mock responses
const mockVoicesResponse = {
  data: [
    { id: "voice-1", name: "Alloy", language: "en", tier: "default", isDefault: true },
    { id: "voice-2", name: "Nova", language: "en", tier: "default", isDefault: false },
    { id: "voice-3", name: "Premium Voice", language: "en", tier: "premium", isDefault: true },
  ],
};

const mockHealthResponse = {
  data: {
    stt: { available: true },
    tts: { available: true },
  },
};

describe("SpeechSettings", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    mockGetVoices.mockResolvedValue(mockVoicesResponse);
    mockGetHealthStatus.mockResolvedValue(mockHealthResponse);
    mockSynthesizeSpeech.mockResolvedValue(new Blob());
  });

  describe("rendering", () => {
    it("should render the speech settings heading", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speech Settings")).toBeInTheDocument();
      });
    });

    it("should render the STT settings section", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speech-to-Text")).toBeInTheDocument();
      });
    });

    it("should render the TTS settings section", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Text-to-Speech")).toBeInTheDocument();
      });
    });

    it("should render the provider status section", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Provider Status")).toBeInTheDocument();
      });
    });

    it("should render all four section cards", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speech-to-Text")).toBeInTheDocument();
        expect(screen.getByText("Text-to-Speech")).toBeInTheDocument();
        expect(screen.getByText("Voice Preview")).toBeInTheDocument();
        expect(screen.getByText("Provider Status")).toBeInTheDocument();
      });
    });
  });

  describe("STT settings", () => {
    it("should render an enable/disable toggle for STT", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        const sttToggle = screen.getByRole("switch", { name: /enable speech-to-text/i });
        expect(sttToggle).toBeInTheDocument();
      });
    });

    it("should render a language preference dropdown", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Language")).toBeInTheDocument();
      });
    });

    it("should toggle STT enabled state when clicked", async () => {
      const user = userEvent.setup();
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByRole("switch", { name: /enable speech-to-text/i })).toBeInTheDocument();
      });

      const sttToggle = screen.getByRole("switch", { name: /enable speech-to-text/i });
      // Default should be checked (enabled)
      expect(sttToggle).toBeChecked();

      await user.click(sttToggle);
      expect(sttToggle).not.toBeChecked();
    });
  });

  describe("TTS settings", () => {
    it("should render an enable/disable toggle for TTS", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        const ttsToggle = screen.getByRole("switch", { name: /enable text-to-speech/i });
        expect(ttsToggle).toBeInTheDocument();
      });
    });

    it("should render a voice selector", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Default Voice")).toBeInTheDocument();
      });
    });

    it("should render a tier preference selector", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Provider Tier")).toBeInTheDocument();
      });
    });

    it("should render an auto-play toggle", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        const autoPlayToggle = screen.getByRole("switch", { name: /auto-play/i });
        expect(autoPlayToggle).toBeInTheDocument();
      });
    });

    it("should render a speed control slider", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speed")).toBeInTheDocument();
        const slider = screen.getByRole("slider");
        expect(slider).toBeInTheDocument();
      });
    });

    it("should display the current speed value", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        // The speed display label shows "1.0x" next to the Speed label
        const speedLabels = screen.getAllByText("1.0x");
        expect(speedLabels.length).toBeGreaterThanOrEqual(1);
      });
    });

    it("should toggle TTS enabled state when clicked", async () => {
      const user = userEvent.setup();
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByRole("switch", { name: /enable text-to-speech/i })).toBeInTheDocument();
      });

      const ttsToggle = screen.getByRole("switch", { name: /enable text-to-speech/i });
      expect(ttsToggle).toBeChecked();

      await user.click(ttsToggle);
      expect(ttsToggle).not.toBeChecked();
    });

    it("should toggle auto-play state when clicked", async () => {
      const user = userEvent.setup();
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByRole("switch", { name: /auto-play/i })).toBeInTheDocument();
      });

      const autoPlayToggle = screen.getByRole("switch", { name: /auto-play/i });
      // Default should be unchecked
      expect(autoPlayToggle).not.toBeChecked();

      await user.click(autoPlayToggle);
      expect(autoPlayToggle).toBeChecked();
    });
  });

  describe("voice selector", () => {
    it("should fetch voices on mount", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(mockGetVoices).toHaveBeenCalled();
      });
    });

    it("should display voice options after fetching", async () => {
      const user = userEvent.setup();
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(mockGetVoices).toHaveBeenCalled();
      });

      // Open the voice selector by clicking the trigger button (id="tts-voice")
      const voiceButton = document.getElementById("tts-voice");
      expect(voiceButton).toBeTruthy();
      if (!voiceButton) throw new Error("Voice button not found");
      await user.click(voiceButton);

      await waitFor(() => {
        expect(screen.getByText("Alloy")).toBeInTheDocument();
        expect(screen.getByText("Nova")).toBeInTheDocument();
      });
    });

    it("should handle API error gracefully when fetching voices", async () => {
      mockGetVoices.mockRejectedValueOnce(new Error("Network error"));

      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText(/unable to load voices/i)).toBeInTheDocument();
      });
    });
  });

  describe("voice preview", () => {
    it("should render a voice preview section", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Voice Preview")).toBeInTheDocument();
      });
    });

    it("should render a test button for voice preview", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        const testButton = screen.getByRole("button", { name: /test voice/i });
        expect(testButton).toBeInTheDocument();
      });
    });
  });

  describe("provider status", () => {
    it("should fetch health status on mount", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(mockGetHealthStatus).toHaveBeenCalled();
      });
    });

    it("should display STT provider status", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speech-to-Text Provider")).toBeInTheDocument();
      });
    });

    it("should display TTS provider status", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Text-to-Speech Provider")).toBeInTheDocument();
      });
    });

    it("should show active indicator when provider is available", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        const statusSection = screen.getByTestId("provider-status");
        const activeIndicators = within(statusSection).getAllByTestId("status-active");
        expect(activeIndicators.length).toBe(2);
      });
    });

    it("should show inactive indicator when provider is unavailable", async () => {
      mockGetHealthStatus.mockResolvedValueOnce({
        data: {
          stt: { available: false },
          tts: { available: true },
        },
      });

      render(<SpeechSettings />);

      await waitFor(() => {
        const statusSection = screen.getByTestId("provider-status");
        const inactiveIndicators = within(statusSection).getAllByTestId("status-inactive");
        expect(inactiveIndicators.length).toBe(1);
      });
    });

    it("should handle health check error gracefully", async () => {
      mockGetHealthStatus.mockRejectedValueOnce(new Error("Service unavailable"));

      render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText(/unable to check provider status/i)).toBeInTheDocument();
      });
    });
  });

  describe("PDA-friendly design", () => {
    it("should not use aggressive red colors", async () => {
      const { container } = render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speech Settings")).toBeInTheDocument();
      });

      const allElements = container.querySelectorAll("*");
      allElements.forEach((el) => {
        const className = el.className;
        if (typeof className === "string") {
          expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
        }
      });
    });

    it("should not use demanding language", async () => {
      const { container } = render(<SpeechSettings />);

      await waitFor(() => {
        expect(screen.getByText("Speech Settings")).toBeInTheDocument();
      });

      const text = container.textContent ?? ""; // textContent can be null under strict null checks
      const demandingWords = [
        "OVERDUE",
        "URGENT",
        "MUST DO",
        "CRITICAL",
        "REQUIRED",
        "YOU NEED TO",
      ];
      for (const word of demandingWords) {
        expect(text.toUpperCase()).not.toContain(word);
      }
    });

    it("should use descriptive section headers", async () => {
      render(<SpeechSettings />);

      await waitFor(() => {
        // Check for descriptive subtext under section headers
        expect(screen.getByText("Configure voice input preferences")).toBeInTheDocument();
        expect(screen.getByText("Configure voice output preferences")).toBeInTheDocument();
      });
    });
  });
});
404
apps/web/src/components/speech/SpeechSettings.tsx
Normal file
@@ -0,0 +1,404 @@
/**
 * SpeechSettings Component
 *
 * Settings page for configuring speech preferences per workspace.
 * Includes STT settings, TTS settings, voice preview, and provider status.
 *
 * Follows PDA-friendly design: calm colors, no aggressive language.
 *
 * Issue #404
 */

"use client";

import { useState, useEffect, useCallback } from "react";
import type { ReactElement } from "react";
import { Card, CardHeader, CardContent, CardTitle, CardDescription } from "@/components/ui/card";
import { Switch } from "@/components/ui/switch";
import { Label } from "@/components/ui/label";
import { Button } from "@/components/ui/button";
import { Slider } from "@/components/ui/slider";
import {
  Select,
  SelectTrigger,
  SelectValue,
  SelectContent,
  SelectItem,
} from "@/components/ui/select";
import { getVoices, getHealthStatus } from "@/lib/api/speech";
import type { VoiceInfo, HealthResponse } from "@/lib/api/speech";
import { useTextToSpeech } from "@/hooks/useTextToSpeech";

/** Supported languages for STT */
const STT_LANGUAGES = [
  { value: "en", label: "English" },
  { value: "es", label: "Spanish" },
  { value: "fr", label: "French" },
  { value: "de", label: "German" },
  { value: "it", label: "Italian" },
  { value: "pt", label: "Portuguese" },
  { value: "ja", label: "Japanese" },
  { value: "zh", label: "Chinese" },
  { value: "ko", label: "Korean" },
  { value: "auto", label: "Auto-detect" },
];

/** TTS tier options */
const TIER_OPTIONS = [
  { value: "default", label: "Default" },
  { value: "premium", label: "Premium" },
  { value: "fallback", label: "Fallback" },
];

/** Sample text for voice preview */
const PREVIEW_TEXT = "Hello, this is a preview of the selected voice. How does it sound?";

/**
 * SpeechSettings provides a comprehensive settings interface for
 * configuring speech-to-text and text-to-speech preferences.
 */
export function SpeechSettings(): ReactElement {
  // STT state
  const [sttEnabled, setSttEnabled] = useState(true);
  const [sttLanguage, setSttLanguage] = useState("en");

  // TTS state
  const [ttsEnabled, setTtsEnabled] = useState(true);
  const [selectedVoice, setSelectedVoice] = useState("");
  const [selectedTier, setSelectedTier] = useState("default");
  const [autoPlay, setAutoPlay] = useState(false);
  const [speed, setSpeed] = useState(1.0);

  // Data state
  const [voices, setVoices] = useState<VoiceInfo[]>([]);
  const [voicesError, setVoicesError] = useState<string | null>(null);
  const [healthData, setHealthData] = useState<HealthResponse["data"] | null>(null);
  const [healthError, setHealthError] = useState<string | null>(null);

  // Preview hook
  const {
    synthesize,
    audioUrl,
    isLoading: isPreviewLoading,
    error: previewError,
  } = useTextToSpeech();

  /**
   * Fetch available voices from the API
   */
  const fetchVoices = useCallback(async (): Promise<void> => {
    try {
      setVoicesError(null);
      const response = await getVoices();
      setVoices(response.data);

      // Select the first default voice if none selected
      if (response.data.length > 0 && !selectedVoice) {
        const defaultVoice = response.data.find((v) => v.isDefault);
        const firstVoice = response.data[0];
        setSelectedVoice(defaultVoice?.id ?? firstVoice?.id ?? "");
      }
    } catch {
      setVoicesError("Unable to load voices. Please try again later.");
    }
  }, [selectedVoice]);

  /**
   * Fetch health status from the API
   */
  const fetchHealth = useCallback(async (): Promise<void> => {
    try {
      setHealthError(null);
      const response = await getHealthStatus();
      setHealthData(response.data);
    } catch {
      setHealthError("Unable to check provider status. Please try again later.");
    }
  }, []);

  // Fetch voices and health on mount
  useEffect(() => {
    void fetchVoices();
    void fetchHealth();
  }, [fetchVoices, fetchHealth]);

  /**
   * Handle voice preview test
   */
  const handleTestVoice = useCallback(async (): Promise<void> => {
    const options: Record<string, string | number> = {
      speed,
      tier: selectedTier,
    };
    if (selectedVoice) {
      options.voice = selectedVoice;
    }
    await synthesize(PREVIEW_TEXT, options);
  }, [synthesize, selectedVoice, speed, selectedTier]);

  return (
    <div className="space-y-6">
      <div>
        <h2 className="text-2xl font-semibold text-gray-900">Speech Settings</h2>
        <p className="text-sm text-gray-600 mt-1">
          Configure voice input and output preferences for your workspace
        </p>
      </div>

      {/* STT Settings */}
      <Card>
        <CardHeader>
          <CardTitle className="text-lg">Speech-to-Text</CardTitle>
          <CardDescription>Configure voice input preferences</CardDescription>
        </CardHeader>
        <CardContent>
          <div className="space-y-4">
            {/* Enable STT Toggle */}
            <div className="flex items-center justify-between">
              <div className="space-y-0.5">
                <Label htmlFor="stt-enabled">Enable Speech-to-Text</Label>
                <p className="text-xs text-gray-500">
                  Allow voice input for text fields and commands
                </p>
              </div>
              <Switch
                id="stt-enabled"
                checked={sttEnabled}
                onCheckedChange={setSttEnabled}
                aria-label="Enable Speech-to-Text"
              />
            </div>

            {/* Language Preference */}
            <div className="space-y-2">
              <Label htmlFor="stt-language">Language</Label>
              <Select value={sttLanguage} onValueChange={setSttLanguage}>
                <SelectTrigger id="stt-language" className="w-full">
                  <SelectValue placeholder="Select language" />
                </SelectTrigger>
                <SelectContent>
                  {STT_LANGUAGES.map((lang) => (
                    <SelectItem key={lang.value} value={lang.value}>
                      {lang.label}
                    </SelectItem>
                  ))}
                </SelectContent>
              </Select>
            </div>
          </div>
        </CardContent>
      </Card>

      {/* TTS Settings */}
      <Card>
        <CardHeader>
          <CardTitle className="text-lg">Text-to-Speech</CardTitle>
          <CardDescription>Configure voice output preferences</CardDescription>
        </CardHeader>
        <CardContent>
          <div className="space-y-4">
            {/* Enable TTS Toggle */}
            <div className="flex items-center justify-between">
              <div className="space-y-0.5">
                <Label htmlFor="tts-enabled">Enable Text-to-Speech</Label>
                <p className="text-xs text-gray-500">
                  Allow reading content aloud with synthesized voice
                </p>
              </div>
              <Switch
                id="tts-enabled"
                checked={ttsEnabled}
                onCheckedChange={setTtsEnabled}
                aria-label="Enable Text-to-Speech"
              />
            </div>

            {/* Default Voice Selector */}
            <div className="space-y-2">
              <Label htmlFor="tts-voice">Default Voice</Label>
              {voicesError ? (
                <p className="text-sm text-amber-600">{voicesError}</p>
              ) : (
                <Select value={selectedVoice} onValueChange={setSelectedVoice}>
                  <SelectTrigger id="tts-voice" className="w-full">
                    <SelectValue placeholder="Select a voice" />
                  </SelectTrigger>
                  <SelectContent>
                    {voices.map((voice) => (
                      <SelectItem key={voice.id} value={voice.id}>
                        {voice.name}
                      </SelectItem>
                    ))}
                  </SelectContent>
                </Select>
              )}
            </div>

            {/* Provider Tier Preference */}
            <div className="space-y-2">
              <Label htmlFor="tts-tier">Provider Tier</Label>
              <p className="text-xs text-gray-500">
                Choose the preferred quality tier for voice synthesis
              </p>
              <Select value={selectedTier} onValueChange={setSelectedTier}>
                <SelectTrigger id="tts-tier" className="w-full">
                  <SelectValue placeholder="Select tier" />
                </SelectTrigger>
                <SelectContent>
                  {TIER_OPTIONS.map((tier) => (
                    <SelectItem key={tier.value} value={tier.value}>
                      {tier.label}
                    </SelectItem>
                  ))}
                </SelectContent>
              </Select>
            </div>

            {/* Auto-play Toggle */}
            <div className="flex items-center justify-between">
              <div className="space-y-0.5">
                <Label htmlFor="tts-autoplay">Auto-play</Label>
                <p className="text-xs text-gray-500">
                  Automatically play TTS responses when received
                </p>
              </div>
              <Switch
                id="tts-autoplay"
                checked={autoPlay}
                onCheckedChange={setAutoPlay}
                aria-label="Auto-play"
              />
            </div>

            {/* Speed Control */}
            <div className="space-y-2">
              <div className="flex items-center justify-between">
                <Label htmlFor="tts-speed">Speed</Label>
                <span className="text-sm text-gray-600">{speed.toFixed(1)}x</span>
              </div>
              <Slider
                id="tts-speed"
                min={0.5}
                max={2}
                step={0.1}
                value={[speed]}
                onValueChange={(values) => {
                  const newSpeed = values[0];
                  if (newSpeed !== undefined) {
                    setSpeed(newSpeed);
                  }
                }}
              />
              <div className="flex justify-between text-xs text-gray-400">
                <span>0.5x</span>
                <span>1.0x</span>
                <span>2.0x</span>
              </div>
            </div>
          </div>
        </CardContent>
      </Card>

      {/* Voice Preview */}
      <Card>
        <CardHeader>
          <CardTitle className="text-lg">Voice Preview</CardTitle>
          <CardDescription>Preview the selected voice with sample text</CardDescription>
        </CardHeader>
        <CardContent>
          <div className="space-y-3">
            <p className="text-sm text-gray-600 italic">“{PREVIEW_TEXT}”</p>
            <Button
              variant="outline"
              size="sm"
              onClick={() => void handleTestVoice()}
              disabled={isPreviewLoading}
              aria-label="Test voice"
            >
              {isPreviewLoading ? "Synthesizing..." : "Test Voice"}
            </Button>
            {previewError && <p className="text-sm text-amber-600">{previewError}</p>}
            {audioUrl && (
              <audio controls src={audioUrl} className="w-full mt-2">
                <track kind="captions" />
              </audio>
            )}
          </div>
        </CardContent>
      </Card>

      {/* Provider Status */}
      <Card>
        <CardHeader>
          <CardTitle className="text-lg">Provider Status</CardTitle>
          <CardDescription>Current availability of speech service providers</CardDescription>
        </CardHeader>
        <CardContent>
          <div className="space-y-3" data-testid="provider-status">
            {healthError ? (
              <p className="text-sm text-amber-600">{healthError}</p>
            ) : healthData ? (
              <>
                {/* STT Provider */}
                <div className="flex items-center justify-between py-2">
                  <span className="text-sm text-gray-700">Speech-to-Text Provider</span>
                  <div className="flex items-center gap-2">
                    {healthData.stt.available ? (
                      <>
                        <span
                          className="inline-block h-2.5 w-2.5 rounded-full bg-green-500"
                          data-testid="status-active"
                          aria-label="Active"
                        />
                        <span className="text-sm text-gray-600">Active</span>
                      </>
                    ) : (
                      <>
                        <span
                          className="inline-block h-2.5 w-2.5 rounded-full bg-gray-400"
                          data-testid="status-inactive"
                          aria-label="Inactive"
                        />
                        <span className="text-sm text-gray-600">Inactive</span>
                      </>
                    )}
                  </div>
                </div>

                {/* TTS Provider */}
                <div className="flex items-center justify-between py-2">
                  <span className="text-sm text-gray-700">Text-to-Speech Provider</span>
                  <div className="flex items-center gap-2">
                    {healthData.tts.available ? (
                      <>
                        <span
                          className="inline-block h-2.5 w-2.5 rounded-full bg-green-500"
                          data-testid="status-active"
                          aria-label="Active"
                        />
                        <span className="text-sm text-gray-600">Active</span>
                      </>
                    ) : (
                      <>
                        <span
                          className="inline-block h-2.5 w-2.5 rounded-full bg-gray-400"
                          data-testid="status-inactive"
                          aria-label="Inactive"
                        />
                        <span className="text-sm text-gray-600">Inactive</span>
                      </>
                    )}
                  </div>
                </div>
              </>
            ) : (
              <p className="text-sm text-gray-500">Checking provider status...</p>
            )}
          </div>
        </CardContent>
      </Card>
    </div>
  );
}

export default SpeechSettings;
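A minimal mounting sketch (the route path is hypothetical; the component is marked "use client" and fetches its own voices and health data on mount):

// app/settings/speech/page.tsx (hypothetical route)
import { SpeechSettings } from "@/components/speech/SpeechSettings";

export default function SpeechSettingsPage(): React.JSX.Element {
  return (
    <main className="mx-auto max-w-2xl p-6">
      <SpeechSettings />
    </main>
  );
}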
218
apps/web/src/components/speech/TextToSpeechButton.test.tsx
Normal file
@@ -0,0 +1,218 @@
/**
 * @file TextToSpeechButton.test.tsx
 * @description Tests for the TextToSpeechButton "Read aloud" component
 */

import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { TextToSpeechButton } from "./TextToSpeechButton";

// Mock the useTextToSpeech hook
const mockSynthesize = vi.fn();
const mockPlay = vi.fn();
const mockPause = vi.fn();
const mockStop = vi.fn();

vi.mock("@/hooks/useTextToSpeech", () => ({
  useTextToSpeech: vi.fn(() => ({
    synthesize: mockSynthesize,
    play: mockPlay,
    pause: mockPause,
    stop: mockStop,
    audioUrl: null,
    isLoading: false,
    error: null,
    isPlaying: false,
    duration: 0,
    currentTime: 0,
  })),
}));

// Import after mocking
import { useTextToSpeech } from "@/hooks/useTextToSpeech";

const mockUseTextToSpeech = useTextToSpeech as ReturnType<typeof vi.fn>;

// Mock HTMLAudioElement for AudioPlayer used inside TextToSpeechButton
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 60;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;

  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }

  pause(): void {
    this.paused = true;
  }

  addEventListener(): void {
    // no-op
  }

  removeEventListener(): void {
    // no-op
  }
}

vi.stubGlobal("Audio", MockAudio);

describe("TextToSpeechButton", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    mockUseTextToSpeech.mockReturnValue({
      synthesize: mockSynthesize,
      play: mockPlay,
      pause: mockPause,
      stop: mockStop,
      audioUrl: null,
      isLoading: false,
      error: null,
      isPlaying: false,
      duration: 0,
      currentTime: 0,
    });
  });

  describe("rendering", () => {
    it("should render a read aloud button", () => {
      render(<TextToSpeechButton text="Hello world" />);

      const button = screen.getByRole("button", { name: /read aloud/i });
      expect(button).toBeInTheDocument();
    });

    it("should not render AudioPlayer initially when no audio is synthesized", () => {
      render(<TextToSpeechButton text="Hello world" />);

      expect(screen.queryByRole("region", { name: /audio player/i })).not.toBeInTheDocument();
    });
  });

  describe("click behavior", () => {
    it("should call synthesize with text on click", async () => {
      const user = userEvent.setup();
      mockSynthesize.mockResolvedValueOnce(undefined);

      render(<TextToSpeechButton text="Hello world" />);

      const button = screen.getByRole("button", { name: /read aloud/i });
      await user.click(button);

      expect(mockSynthesize).toHaveBeenCalledWith("Hello world", undefined);
    });

    it("should pass voice and tier options when provided", async () => {
      const user = userEvent.setup();
      mockSynthesize.mockResolvedValueOnce(undefined);

      render(<TextToSpeechButton text="Hello" voice="alloy" tier="premium" />);

      const button = screen.getByRole("button", { name: /read aloud/i });
      await user.click(button);

      expect(mockSynthesize).toHaveBeenCalledWith("Hello", {
        voice: "alloy",
        tier: "premium",
      });
    });
  });

  describe("loading state", () => {
    it("should show loading indicator while synthesizing", () => {
      mockUseTextToSpeech.mockReturnValue({
        synthesize: mockSynthesize,
        play: mockPlay,
        pause: mockPause,
        stop: mockStop,
        audioUrl: null,
        isLoading: true,
        error: null,
        isPlaying: false,
        duration: 0,
        currentTime: 0,
      });

      render(<TextToSpeechButton text="Hello world" />);

      const button = screen.getByRole("button", { name: /synthesizing/i });
      expect(button).toBeInTheDocument();
      expect(button).toBeDisabled();
    });
  });

  describe("audio player integration", () => {
    it("should show AudioPlayer when audio is available", () => {
      mockUseTextToSpeech.mockReturnValue({
        synthesize: mockSynthesize,
        play: mockPlay,
        pause: mockPause,
        stop: mockStop,
        audioUrl: "blob:mock-url",
        isLoading: false,
        error: null,
        isPlaying: false,
        duration: 30,
        currentTime: 0,
      });

      render(<TextToSpeechButton text="Hello world" />);

      expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument();
    });
  });

  describe("error state", () => {
    it("should display error message when synthesis fails", () => {
      mockUseTextToSpeech.mockReturnValue({
        synthesize: mockSynthesize,
        play: mockPlay,
        pause: mockPause,
        stop: mockStop,
        audioUrl: null,
        isLoading: false,
        error: "Synthesis failed",
        isPlaying: false,
        duration: 0,
        currentTime: 0,
      });

      render(<TextToSpeechButton text="Hello world" />);

      expect(screen.getByText(/synthesis failed/i)).toBeInTheDocument();
    });
  });

  describe("accessibility", () => {
    it("should have proper aria label on button", () => {
      render(<TextToSpeechButton text="Hello world" />);

      const button = screen.getByRole("button", { name: /read aloud/i });
      expect(button).toBeInTheDocument();
    });
  });

  describe("design", () => {
    it("should not use aggressive colors", () => {
      const { container } = render(<TextToSpeechButton text="Hello world" />);

      const allElements = container.querySelectorAll("*");
      allElements.forEach((el) => {
        const className = el.className;
        if (typeof className === "string") {
          expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
        }
      });
    });
  });
});
126
apps/web/src/components/speech/TextToSpeechButton.tsx
Normal file
@@ -0,0 +1,126 @@
/**
 * TextToSpeechButton Component
 * "Read aloud" button that synthesizes text and plays it via AudioPlayer.
 *
 * Accepts text as a prop, with optional voice and tier selection.
 * Shows loading state during synthesis and integrates AudioPlayer for playback.
 *
 * Follows PDA-friendly design: no aggressive colors, calm interface.
 */

import { useCallback } from "react";
import type { ReactElement } from "react";
import { useTextToSpeech } from "@/hooks/useTextToSpeech";
import type { SynthesizeOptions } from "@/hooks/useTextToSpeech";
import { AudioPlayer } from "./AudioPlayer";

export interface TextToSpeechButtonProps {
  /** The text to synthesize to speech */
  text: string;
  /** Optional voice ID to use */
  voice?: string;
  /** Optional tier (e.g. "default", "premium", "fallback") */
  tier?: string;
  /** Optional className for the container */
  className?: string;
}

/**
 * TextToSpeechButton provides a "Read aloud" button that synthesizes
 * the given text and displays an AudioPlayer for playback control.
 */
export function TextToSpeechButton({
  text,
  voice,
  tier,
  className = "",
}: TextToSpeechButtonProps): ReactElement {
  const { synthesize, audioUrl, isLoading, error } = useTextToSpeech();

  /**
   * Handle read aloud button click
   */
  const handleClick = useCallback(async (): Promise<void> => {
    let options: SynthesizeOptions | undefined;

    if (voice !== undefined || tier !== undefined) {
      options = {};
      if (voice !== undefined) options.voice = voice;
      if (tier !== undefined) options.tier = tier;
    }

    await synthesize(text, options);
  }, [text, voice, tier, synthesize]);

  return (
    <div className={`flex flex-col gap-2 ${className}`}>
      {/* Read Aloud Button */}
      <button
        type="button"
        onClick={() => void handleClick()}
        disabled={isLoading}
        aria-label={isLoading ? "Synthesizing speech" : "Read aloud"}
        className="inline-flex items-center gap-2 rounded-lg border border-gray-200 bg-white px-3 py-1.5 text-sm font-medium text-gray-700 transition-colors hover:bg-gray-50 focus:outline-none focus:ring-2 focus:ring-blue-300 disabled:cursor-not-allowed disabled:opacity-50"
      >
        {isLoading ? (
          <>
            {/* Spinner */}
            <svg
              className="h-4 w-4 animate-spin text-gray-500"
              viewBox="0 0 24 24"
              fill="none"
              aria-hidden="true"
            >
              <circle
                cx="12"
                cy="12"
                r="10"
                stroke="currentColor"
                strokeWidth="3"
                className="opacity-25"
              />
              <path
                fill="currentColor"
                d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
                className="opacity-75"
              />
            </svg>
            <span>Synthesizing...</span>
          </>
        ) : (
          <>
            {/* Speaker Icon */}
            <svg
              width="16"
              height="16"
              viewBox="0 0 24 24"
              fill="none"
              stroke="currentColor"
              strokeWidth="2"
              strokeLinecap="round"
              strokeLinejoin="round"
              aria-hidden="true"
            >
              <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
              <path d="M15.54 8.46a5 5 0 010 7.07" />
              <path d="M19.07 4.93a10 10 0 010 14.14" />
            </svg>
            <span>Read aloud</span>
          </>
        )}
      </button>

      {/* Error Display */}
      {error && (
        <p className="text-sm text-amber-600" role="alert">
          {error}
        </p>
      )}

      {/* Audio Player (shown after synthesis) */}
      {audioUrl && <AudioPlayer src={audioUrl} />}
    </div>
  );
}

export default TextToSpeechButton;
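A minimal usage sketch (the AssistantMessage wrapper is hypothetical; text, voice, and tier are the component's real props):

import { TextToSpeechButton } from "@/components/speech";

function AssistantMessage({ body }: { body: string }): React.JSX.Element {
  return (
    <div className="space-y-2">
      <p>{body}</p>
      <TextToSpeechButton text={body} tier="default" />
    </div>
  );
}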
228
apps/web/src/components/speech/VoiceInput.test.tsx
Normal file
@@ -0,0 +1,228 @@
import { describe, it, expect, vi, beforeEach } from "vitest";
import { render, screen } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { VoiceInput } from "./VoiceInput";

// Mock the useVoiceInput hook
const mockStartRecording = vi.fn();
const mockStopRecording = vi.fn();

vi.mock("@/hooks/useVoiceInput", () => ({
  useVoiceInput: vi.fn(() => ({
    isRecording: false,
    startRecording: mockStartRecording,
    stopRecording: mockStopRecording,
    transcript: "",
    partialTranscript: "",
    error: null,
    audioLevel: 0,
  })),
}));

// We need to import after mocking
import { useVoiceInput } from "@/hooks/useVoiceInput";

describe("VoiceInput", (): void => {
  beforeEach((): void => {
    vi.clearAllMocks();
    // Reset mock implementation to default
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0,
    });
  });

  it("should render a microphone button", (): void => {
    render(<VoiceInput />);

    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    expect(button).toBeInTheDocument();
  });

  it("should have accessible aria label", (): void => {
    render(<VoiceInput />);

    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    expect(button).toHaveAttribute("aria-label", "Start voice input");
  });

  it("should call startRecording when mic button is clicked", async (): Promise<void> => {
    const user = userEvent.setup();
    render(<VoiceInput />);

    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    await user.click(button);

    expect(mockStartRecording).toHaveBeenCalledTimes(1);
  });

  it("should show recording state when isRecording is true", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0.5,
    });

    render(<VoiceInput />);

    const button = screen.getByRole("button", {
      name: /stop voice input/i,
    });
    expect(button).toBeInTheDocument();
  });

  it("should call stopRecording when mic button is clicked while recording", async (): Promise<void> => {
    const user = userEvent.setup();

    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0.5,
    });

    render(<VoiceInput />);

    const button = screen.getByRole("button", {
      name: /stop voice input/i,
    });
    await user.click(button);

    expect(mockStopRecording).toHaveBeenCalledTimes(1);
  });

  it("should display partial transcription text", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "hello worl",
      error: null,
      audioLevel: 0.3,
    });

    render(<VoiceInput />);

    expect(screen.getByText("hello worl")).toBeInTheDocument();
  });

  it("should display final transcript text", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "hello world",
      partialTranscript: "",
      error: null,
      audioLevel: 0,
    });

    render(<VoiceInput />);

    expect(screen.getByText("hello world")).toBeInTheDocument();
  });

  it("should display error message", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: "Microphone access not available",
      audioLevel: 0,
    });

    render(<VoiceInput />);

    expect(screen.getByText("Microphone access not available")).toBeInTheDocument();
  });

  it("should call onTranscript callback prop", (): void => {
    const onTranscript = vi.fn();

    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "final text",
      partialTranscript: "",
      error: null,
      audioLevel: 0,
    });

    render(<VoiceInput onTranscript={onTranscript} />);

    // The onTranscript prop is passed to the hook - we verify the prop is accepted
    expect(useVoiceInput).toHaveBeenCalledWith(
      expect.objectContaining({
        onTranscript,
      })
    );
  });

  it("should use calm, non-aggressive design for recording indicator", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: true,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: null,
      audioLevel: 0.5,
    });

    render(<VoiceInput />);

    // Check there are no aggressive red colors in the recording state
    const button = screen.getByRole("button", { name: /stop voice input/i });
    const className = button.className;
    expect(className).not.toMatch(/bg-red-|text-red-|border-red-/);
  });

  it("should use calm design for error display", (): void => {
    vi.mocked(useVoiceInput).mockReturnValue({
      isRecording: false,
      startRecording: mockStartRecording,
      stopRecording: mockStopRecording,
      transcript: "",
      partialTranscript: "",
      error: "Something went wrong",
      audioLevel: 0,
    });

    render(<VoiceInput />);

    const errorEl = screen.getByText("Something went wrong");
    const className = errorEl.className;
    expect(className).not.toMatch(/text-red-600|bg-red-/);
  });

  it("should be disabled when disabled prop is true", (): void => {
    render(<VoiceInput disabled />);

    const button = screen.getByRole("button", {
      name: /start voice input/i,
    });
    expect(button).toBeDisabled();
  });
});
146
apps/web/src/components/speech/VoiceInput.tsx
Normal file
@@ -0,0 +1,146 @@
/**
 * VoiceInput component
 *
 * Provides a microphone button with visual feedback for voice input.
 * Click to start/stop recording with real-time transcription display.
 *
 * Design principles:
 * - PDA-friendly: calm, non-aggressive colors
 * - Gentle pulsing animation for recording state (blue/green)
 * - Mobile-friendly touch interaction
 * - Accessible with proper aria labels
 */

import type * as React from "react"; // type-only import for the React.JSX.Element return type
import { useVoiceInput } from "@/hooks/useVoiceInput";
import type { UseVoiceInputOptions } from "@/hooks/useVoiceInput";
import { AudioVisualizer } from "./AudioVisualizer";
import { Mic, MicOff } from "lucide-react";

export interface VoiceInputProps {
  /** Callback fired when final transcription is received */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true) */
  useWebSocket?: boolean;
  /** Whether the input is disabled */
  disabled?: boolean;
  /** Additional CSS classes for the container */
  className?: string;
}

/**
 * Voice input component with microphone capture and real-time transcription.
 * Shows a mic button that toggles recording, with visual feedback
 * and transcription text display.
 */
export function VoiceInput({
  onTranscript,
  useWebSocket: useWs,
  disabled = false,
  className = "",
}: VoiceInputProps): React.JSX.Element {
  const hookOptions: UseVoiceInputOptions = {};
  if (onTranscript !== undefined) {
    hookOptions.onTranscript = onTranscript;
  }
  if (useWs !== undefined) {
    hookOptions.useWebSocket = useWs;
  }

  const {
    isRecording,
    startRecording,
    stopRecording,
    transcript,
    partialTranscript,
    error,
    audioLevel,
  } = useVoiceInput(hookOptions);

  const handleClick = (): void => {
    if (isRecording) {
      stopRecording();
    } else {
      void startRecording();
    }
  };

  const displayText = isRecording ? partialTranscript : transcript;

  return (
    <div className={`flex flex-col items-center gap-3 ${className}`}>
      {/* Mic button with recording indicator */}
      <div className="relative flex items-center gap-2">
        {/* Pulsing ring animation when recording */}
        {isRecording && (
          <div
            className="absolute inset-0 -m-1 rounded-full bg-sky-400/20 animate-pulse"
            aria-hidden="true"
          />
        )}

        <button
          type="button"
          onClick={handleClick}
          disabled={disabled}
          aria-label={isRecording ? "Stop voice input" : "Start voice input"}
          className={`
            relative z-10 flex items-center justify-center
            w-10 h-10 rounded-full transition-all duration-200
            focus:outline-none focus:ring-2 focus:ring-sky-400 focus:ring-offset-2
            disabled:opacity-50 disabled:cursor-not-allowed
            ${
              isRecording
                ? "bg-sky-500 text-white hover:bg-sky-600 shadow-md"
                : "bg-slate-100 text-slate-600 hover:bg-slate-200 dark:bg-slate-700 dark:text-slate-300 dark:hover:bg-slate-600"
            }
          `}
        >
          {isRecording ? (
            <MicOff className="w-5 h-5" aria-hidden="true" />
          ) : (
            <Mic className="w-5 h-5" aria-hidden="true" />
          )}
        </button>

        {/* Audio level visualizer - shown during recording */}
        {isRecording && (
          <AudioVisualizer audioLevel={audioLevel} isActive={isRecording} barCount={5} />
        )}
      </div>

      {/* Recording status indicator */}
      {isRecording && (
        <div className="flex items-center gap-1.5 text-xs text-sky-600 dark:text-sky-400">
          <span className="w-2 h-2 rounded-full bg-sky-500 animate-pulse" aria-hidden="true" />
          <span>Listening...</span>
        </div>
      )}

      {/* Transcription text display */}
      {displayText && (
        <p
          className={`
            text-sm max-w-md text-center px-3 py-1.5 rounded-lg
            ${
              isRecording
                ? "text-slate-500 dark:text-slate-400 bg-slate-50 dark:bg-slate-800/50 italic"
                : "text-slate-700 dark:text-slate-200 bg-slate-100 dark:bg-slate-800"
            }
          `}
        >
          {displayText}
        </p>
      )}

      {/* Error display - calm, non-aggressive */}
      {error && (
        <p
          className="text-sm text-amber-700 dark:text-amber-400 bg-amber-50 dark:bg-amber-900/20 px-3 py-1.5 rounded-lg max-w-md text-center"
          role="alert"
        >
          {error}
        </p>
      )}
    </div>
  );
}
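A minimal sketch of how `VoiceInput` might be wired into a parent component; the `MessageComposer` component and its draft state are illustrative only, not part of this diff:

```tsx
import { useState } from "react";
import { VoiceInput } from "@/components/speech";

// Illustrative host component: appends each final transcript to a draft message.
function MessageComposer() {
  const [draft, setDraft] = useState("");

  return (
    <div>
      <textarea value={draft} onChange={(e) => setDraft(e.target.value)} />
      {/* Partial results render inside VoiceInput; only final text reaches the draft */}
      <VoiceInput onTranscript={(text) => setDraft((d) => (d ? `${d} ${text}` : text))} />
    </div>
  );
}
```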
8
apps/web/src/components/speech/index.ts
Normal file
@@ -0,0 +1,8 @@
export { VoiceInput } from "./VoiceInput";
export type { VoiceInputProps } from "./VoiceInput";
export { AudioVisualizer } from "./AudioVisualizer";
export type { AudioVisualizerProps } from "./AudioVisualizer";
export { AudioPlayer } from "./AudioPlayer";
export type { AudioPlayerProps } from "./AudioPlayer";
export { TextToSpeechButton } from "./TextToSpeechButton";
export type { TextToSpeechButtonProps } from "./TextToSpeechButton";
55
apps/web/src/components/ui/slider.tsx
Normal file
@@ -0,0 +1,55 @@
import * as React from "react";

export interface SliderProps {
  id?: string;
  min?: number;
  max?: number;
  step?: number;
  value?: number[];
  defaultValue?: number[];
  onValueChange?: (value: number[]) => void;
  disabled?: boolean;
  className?: string;
}

export const Slider = React.forwardRef<HTMLInputElement, SliderProps>(
  (
    {
      id,
      min = 0,
      max = 100,
      step = 1,
      value,
      defaultValue,
      onValueChange,
      disabled,
      className = "",
    },
    ref
  ) => {
    const currentValue = value?.[0] ?? defaultValue?.[0] ?? min;

    return (
      <input
        type="range"
        role="slider"
        ref={ref}
        id={id}
        min={min}
        max={max}
        step={step}
        value={currentValue}
        onChange={(e) => {
          onValueChange?.([parseFloat(e.target.value)]);
        }}
        disabled={disabled}
        aria-valuemin={min}
        aria-valuemax={max}
        aria-valuenow={currentValue}
        className={`w-full h-2 rounded-lg appearance-none cursor-pointer bg-gray-200 accent-blue-500 ${className}`}
      />
    );
  }
);

Slider.displayName = "Slider";
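The array-based `value`/`onValueChange` API mirrors Radix-style sliders while rendering a native range input. A controlled usage sketch (the `SpeedControl` component is illustrative, not part of this diff):

```tsx
import { useState } from "react";
import { Slider } from "@/components/ui/slider";

// Illustrative controlled usage: playback speed from 0.5x to 2x in 0.1 steps.
function SpeedControl() {
  const [speed, setSpeed] = useState([1]);

  return (
    <Slider
      min={0.5}
      max={2}
      step={0.1}
      value={speed}
      onValueChange={setSpeed} // receives [number], the same shape as the value prop
    />
  );
}
```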
285
apps/web/src/hooks/useTextToSpeech.test.ts
Normal file
@@ -0,0 +1,285 @@
/**
 * @file useTextToSpeech.test.ts
 * @description Tests for the useTextToSpeech hook that manages TTS API integration
 */

import { renderHook, act } from "@testing-library/react";
import { describe, it, expect, beforeEach, vi, afterEach } from "vitest";
import { useTextToSpeech } from "./useTextToSpeech";
import * as speechApi from "@/lib/api/speech";

// Mock the speech API module
vi.mock("@/lib/api/speech", () => ({
  synthesizeSpeech: vi.fn(),
  getVoices: vi.fn(),
}));

// Mock URL.createObjectURL and URL.revokeObjectURL
const mockCreateObjectURL = vi.fn().mockReturnValue("blob:mock-audio-url");
const mockRevokeObjectURL = vi.fn();

beforeEach(() => {
  global.URL.createObjectURL = mockCreateObjectURL;
  global.URL.revokeObjectURL = mockRevokeObjectURL;
});

// Mock HTMLAudioElement
class MockAudio {
  src = "";
  currentTime = 0;
  duration = 120;
  paused = true;
  playbackRate = 1;
  volume = 1;
  onended: (() => void) | null = null;
  ontimeupdate: (() => void) | null = null;
  onloadedmetadata: (() => void) | null = null;
  onerror: ((e: unknown) => void) | null = null;

  play(): Promise<void> {
    this.paused = false;
    return Promise.resolve();
  }

  pause(): void {
    this.paused = true;
  }

  addEventListener(event: string, handler: () => void): void {
    if (event === "ended") this.onended = handler;
    if (event === "timeupdate") this.ontimeupdate = handler;
    if (event === "loadedmetadata") this.onloadedmetadata = handler;
    if (event === "error") this.onerror = handler;
  }

  removeEventListener(): void {
    // no-op for tests
  }
}

vi.stubGlobal("Audio", MockAudio);

const mockSynthesizeSpeech = speechApi.synthesizeSpeech as ReturnType<typeof vi.fn>;

describe("useTextToSpeech", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    mockCreateObjectURL.mockReturnValue("blob:mock-audio-url");
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  describe("initial state", () => {
    it("should return correct initial interface", () => {
      const { result } = renderHook(() => useTextToSpeech());

      expect(result.current.synthesize).toBeTypeOf("function");
      expect(result.current.play).toBeTypeOf("function");
      expect(result.current.pause).toBeTypeOf("function");
      expect(result.current.stop).toBeTypeOf("function");
      expect(result.current.audioUrl).toBeNull();
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
      expect(result.current.isPlaying).toBe(false);
      expect(result.current.duration).toBe(0);
      expect(result.current.currentTime).toBe(0);
    });
  });

  describe("synthesize", () => {
    it("should call API and return audio blob URL", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello world");
      });

      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello world",
      });
      expect(result.current.audioUrl).toBe("blob:mock-audio-url");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.error).toBeNull();
    });

    it("should pass voice and tier options to API", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello", {
          voice: "alloy",
          tier: "premium",
          speed: 1.5,
        });
      });

      expect(mockSynthesizeSpeech).toHaveBeenCalledWith({
        text: "Hello",
        voice: "alloy",
        tier: "premium",
        speed: 1.5,
      });
    });

    it("should set loading state while synthesizing", async () => {
      let resolvePromise: ((value: Blob) => void) | undefined;
      const pendingPromise = new Promise<Blob>((resolve) => {
        resolvePromise = resolve;
      });
      mockSynthesizeSpeech.mockReturnValueOnce(pendingPromise);

      const { result } = renderHook(() => useTextToSpeech());

      act(() => {
        void result.current.synthesize("Hello");
      });

      expect(result.current.isLoading).toBe(true);

      await act(async () => {
        resolvePromise?.(new Blob(["audio"], { type: "audio/mpeg" }));
        await pendingPromise;
      });

      expect(result.current.isLoading).toBe(false);
    });

    it("should handle API errors gracefully", async () => {
      mockSynthesizeSpeech.mockRejectedValueOnce(new Error("Synthesis failed"));

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      expect(result.current.error).toBe("Synthesis failed");
      expect(result.current.isLoading).toBe(false);
      expect(result.current.audioUrl).toBeNull();
    });

    it("should cache audio for repeated synthesis of same text", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValue(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      // First call
      await act(async () => {
        await result.current.synthesize("Hello world");
      });

      // Second call with same text
      await act(async () => {
        await result.current.synthesize("Hello world");
      });

      // API should only be called once due to caching
      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(1);
    });

    it("should not cache when options differ", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValue(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello", { voice: "alloy" });
      });

      await act(async () => {
        await result.current.synthesize("Hello", { voice: "nova" });
      });

      expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(2);
    });
  });

  describe("playback controls", () => {
    it("should play audio after synthesis", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      await act(async () => {
        await result.current.play();
      });

      expect(result.current.isPlaying).toBe(true);
    });

    it("should pause audio playback", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      await act(async () => {
        await result.current.play();
      });

      act(() => {
        result.current.pause();
      });

      expect(result.current.isPlaying).toBe(false);
    });

    it("should stop and reset playback", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);

      const { result } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      await act(async () => {
        await result.current.play();
      });

      act(() => {
        result.current.stop();
      });

      expect(result.current.isPlaying).toBe(false);
      expect(result.current.currentTime).toBe(0);
    });
  });

  describe("cleanup", () => {
    it("should revoke object URLs on unmount", async () => {
      const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" });
      mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob);

      const { result, unmount } = renderHook(() => useTextToSpeech());

      await act(async () => {
        await result.current.synthesize("Hello");
      });

      unmount();

      expect(mockRevokeObjectURL).toHaveBeenCalled();
    });
  });
});
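The loading-state test above wires up a manually resolvable promise inline; if more tests end up needing that pattern, it can be factored into a small helper. A sketch (the `deferred` name is hypothetical, not in this PR):

```ts
// Minimal deferred-promise helper: exposes resolve/reject alongside the promise.
function deferred<T>(): {
  promise: Promise<T>;
  resolve: (value: T) => void;
  reject: (err: unknown) => void;
} {
  let resolve!: (value: T) => void;
  let reject!: (err: unknown) => void;
  const promise = new Promise<T>((res, rej) => {
    resolve = res;
    reject = rej;
  });
  return { promise, resolve, reject };
}

// Usage in a test:
//   const d = deferred<Blob>();
//   mockSynthesizeSpeech.mockReturnValueOnce(d.promise);
//   ...assert isLoading === true...
//   d.resolve(new Blob(["audio"], { type: "audio/mpeg" }));
```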
248
apps/web/src/hooks/useTextToSpeech.ts
Normal file
@@ -0,0 +1,248 @@
/**
 * useTextToSpeech hook
 * Manages TTS API integration with synthesis, caching, and playback state
 */

import { useState, useCallback, useRef, useEffect } from "react";
import { synthesizeSpeech } from "@/lib/api/speech";

export interface SynthesizeOptions {
  voice?: string;
  speed?: number;
  format?: string;
  tier?: string;
}

export interface UseTextToSpeechReturn {
  /** Synthesize text to speech audio */
  synthesize: (text: string, options?: SynthesizeOptions) => Promise<void>;
  /** The URL of the synthesized audio blob */
  audioUrl: string | null;
  /** Whether synthesis is in progress */
  isLoading: boolean;
  /** Error message if synthesis failed */
  error: string | null;
  /** Start or resume audio playback */
  play: () => Promise<void>;
  /** Pause audio playback */
  pause: () => void;
  /** Stop audio and reset to beginning */
  stop: () => void;
  /** Whether audio is currently playing */
  isPlaying: boolean;
  /** Total duration of the audio in seconds */
  duration: number;
  /** Current playback position in seconds */
  currentTime: number;
}

/** Cache key generator for text + options combination */
function getCacheKey(text: string, options?: SynthesizeOptions): string {
  return JSON.stringify({ text, ...options });
}

/**
 * Hook for text-to-speech API integration with caching and playback controls
 */
export function useTextToSpeech(): UseTextToSpeechReturn {
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [isPlaying, setIsPlaying] = useState(false);
  const [duration, setDuration] = useState(0);
  const [currentTime, setCurrentTime] = useState(0);

  // Audio element ref for playback control
  const audioRef = useRef<HTMLAudioElement | null>(null);

  // Cache: maps cache key -> blob URL
  const cacheRef = useRef<Map<string, string>>(new Map());

  // Track all blob URLs for cleanup
  const blobUrlsRef = useRef<Set<string>>(new Set());

  /**
   * Clean up audio element event listeners and state
   */
  const cleanupAudio = useCallback(() => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.removeEventListener("ended", handleEnded);
      audio.removeEventListener("timeupdate", handleTimeUpdate);
      audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
      audioRef.current = null;
    }
    setIsPlaying(false);
  }, []);

  /**
   * Handle audio ended event
   */
  function handleEnded(): void {
    setIsPlaying(false);
    setCurrentTime(0);
  }

  /**
   * Handle audio time update event
   */
  function handleTimeUpdate(): void {
    const audio = audioRef.current;
    if (audio) {
      setCurrentTime(audio.currentTime);
    }
  }

  /**
   * Handle audio metadata loaded event
   */
  function handleLoadedMetadata(): void {
    const audio = audioRef.current;
    if (audio && isFinite(audio.duration)) {
      setDuration(audio.duration);
    }
  }

  /**
   * Set up a new Audio element for a given URL
   */
  const setupAudio = useCallback(
    (url: string) => {
      cleanupAudio();

      const audio = new Audio(url);
      audio.addEventListener("ended", handleEnded);
      audio.addEventListener("timeupdate", handleTimeUpdate);
      audio.addEventListener("loadedmetadata", handleLoadedMetadata);
      audioRef.current = audio;
    },
    [cleanupAudio]
  );

  /**
   * Synthesize text to speech
   */
  const synthesize = useCallback(
    async (text: string, options?: SynthesizeOptions): Promise<void> => {
      setError(null);

      // Check cache first
      const cacheKey = getCacheKey(text, options);
      const cachedUrl = cacheRef.current.get(cacheKey);

      if (cachedUrl) {
        setAudioUrl(cachedUrl);
        setupAudio(cachedUrl);
        return;
      }

      setIsLoading(true);

      try {
        const blob = await synthesizeSpeech({
          text,
          ...(options?.voice !== undefined && { voice: options.voice }),
          ...(options?.speed !== undefined && { speed: options.speed }),
          ...(options?.format !== undefined && { format: options.format }),
          ...(options?.tier !== undefined && { tier: options.tier }),
        });

        const url = URL.createObjectURL(blob);

        // Store in cache and track for cleanup
        cacheRef.current.set(cacheKey, url);
        blobUrlsRef.current.add(url);

        setAudioUrl(url);
        setupAudio(url);
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : "Speech synthesis failed";
        setError(errorMsg);
        setAudioUrl(null);
      } finally {
        setIsLoading(false);
      }
    },
    [setupAudio]
  );

  /**
   * Start or resume audio playback
   */
  const play = useCallback(async (): Promise<void> => {
    const audio = audioRef.current;
    if (audio) {
      try {
        await audio.play();
        setIsPlaying(true);
      } catch (err) {
        const message =
          err instanceof DOMException && err.name === "NotAllowedError"
            ? "Playback was blocked by the browser. Try interacting with the page first."
            : "Unable to play audio. The format may not be supported.";
        setError(message);
        setIsPlaying(false);
      }
    }
  }, []);

  /**
   * Pause audio playback
   */
  const pause = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      setIsPlaying(false);
    }
  }, []);

  /**
   * Stop audio and reset to beginning
   */
  const stop = useCallback((): void => {
    const audio = audioRef.current;
    if (audio) {
      audio.pause();
      audio.currentTime = 0;
      setIsPlaying(false);
      setCurrentTime(0);
    }
  }, []);

  // Cleanup on unmount: revoke all blob URLs and clean up audio
  useEffect((): (() => void) => {
    return (): void => {
      // Clean up audio element
      const audio = audioRef.current;
      if (audio) {
        audio.pause();
        audio.removeEventListener("ended", handleEnded);
        audio.removeEventListener("timeupdate", handleTimeUpdate);
        audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
        audioRef.current = null;
      }

      // Revoke all blob URLs
      for (const url of blobUrlsRef.current) {
        URL.revokeObjectURL(url);
      }
      blobUrlsRef.current.clear();
      cacheRef.current.clear();
    };
  }, []);

  return {
    synthesize,
    audioUrl,
    isLoading,
    error,
    play,
    pause,
    stop,
    isPlaying,
    duration,
    currentTime,
  };
}
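A minimal sketch of the hook in a read-aloud button. The `ReadAloud` component is illustrative, not part of this diff; it relies on the cache so repeated clicks on the same text skip the network round trip:

```tsx
import { useTextToSpeech } from "@/hooks/useTextToSpeech";

// Hypothetical read-aloud button: synthesize once (cached thereafter), then play.
function ReadAloud({ text }: { text: string }) {
  const { synthesize, play, pause, isPlaying, isLoading } = useTextToSpeech();

  const handleClick = async (): Promise<void> => {
    if (isPlaying) {
      pause();
      return;
    }
    await synthesize(text); // cache hit if this text/options pair was synthesized before
    await play();
  };

  return (
    <button type="button" onClick={() => void handleClick()} disabled={isLoading}>
      {isPlaying ? "Pause" : isLoading ? "Loading..." : "Read aloud"}
    </button>
  );
}
```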
362
apps/web/src/hooks/useVoiceInput.test.ts
Normal file
@@ -0,0 +1,362 @@
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import { renderHook, act, waitFor } from "@testing-library/react";
import { useVoiceInput } from "./useVoiceInput";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";

// Mock socket.io-client
vi.mock("socket.io-client");

// Mock MediaRecorder
const mockMediaRecorder = {
  start: vi.fn(),
  stop: vi.fn(),
  pause: vi.fn(),
  resume: vi.fn(),
  state: "inactive" as RecordingState,
  ondataavailable: null as ((event: BlobEvent) => void) | null,
  onstop: null as (() => void) | null,
  onerror: null as ((event: Event) => void) | null,
  addEventListener: vi.fn((event: string, handler: EventListenerOrEventListenerObject) => {
    if (event === "dataavailable") {
      mockMediaRecorder.ondataavailable = handler as (event: BlobEvent) => void;
    } else if (event === "stop") {
      mockMediaRecorder.onstop = handler as () => void;
    } else if (event === "error") {
      mockMediaRecorder.onerror = handler as (event: Event) => void;
    }
  }),
  removeEventListener: vi.fn(),
  stream: {
    getTracks: vi.fn(() => [{ stop: vi.fn() }]),
  },
};

// Mock MediaStream with getByteFrequencyData for audio level
const mockAnalyserNode = {
  fftSize: 256,
  frequencyBinCount: 128,
  getByteFrequencyData: vi.fn((array: Uint8Array) => {
    // Simulate some audio data
    for (let i = 0; i < array.length; i++) {
      array[i] = 128;
    }
  }),
  connect: vi.fn(),
  disconnect: vi.fn(),
};

const mockMediaStreamSource = {
  connect: vi.fn(),
  disconnect: vi.fn(),
};

const mockAudioContext = {
  createAnalyser: vi.fn(() => mockAnalyserNode),
  createMediaStreamSource: vi.fn(() => mockMediaStreamSource),
  close: vi.fn(),
  state: "running",
};

// Mock getUserMedia
const mockGetUserMedia = vi.fn();

// Set up global mocks
Object.defineProperty(global.navigator, "mediaDevices", {
  value: {
    getUserMedia: mockGetUserMedia,
  },
  writable: true,
  configurable: true,
});

// Mock AudioContext
vi.stubGlobal(
  "AudioContext",
  vi.fn(() => mockAudioContext)
);

// Mock MediaRecorder constructor
vi.stubGlobal(
  "MediaRecorder",
  vi.fn(() => mockMediaRecorder)
);

// Add isTypeSupported static method
(
  global.MediaRecorder as unknown as { isTypeSupported: (type: string) => boolean }
).isTypeSupported = vi.fn(() => true);

describe("useVoiceInput", (): void => {
  let mockSocket: Partial<Socket>;
  let socketEventHandlers: Record<string, (data: unknown) => void>;

  beforeEach((): void => {
    socketEventHandlers = {};

    mockSocket = {
      on: vi.fn((event: string, handler: (...args: unknown[]) => void) => {
        socketEventHandlers[event] = handler;
        return mockSocket;
      }) as unknown as Socket["on"],
      off: vi.fn(() => mockSocket) as unknown as Socket["off"],
      emit: vi.fn() as unknown as Socket["emit"],
      connect: vi.fn(),
      disconnect: vi.fn(),
      connected: true,
    };

    (io as unknown as ReturnType<typeof vi.fn>).mockReturnValue(mockSocket);

    // Reset MediaRecorder mock state
    mockMediaRecorder.state = "inactive";
    mockMediaRecorder.ondataavailable = null;
    mockMediaRecorder.onstop = null;
    mockMediaRecorder.onerror = null;

    // Default: getUserMedia succeeds
    const mockStream = {
      getTracks: vi.fn(() => [{ stop: vi.fn() }]),
    } as unknown as MediaStream;
    mockGetUserMedia.mockResolvedValue(mockStream);
  });

  afterEach((): void => {
    vi.clearAllMocks();
  });

  it("should return the correct interface", (): void => {
    const { result } = renderHook(() => useVoiceInput());

    expect(result.current).toHaveProperty("isRecording");
    expect(result.current).toHaveProperty("startRecording");
    expect(result.current).toHaveProperty("stopRecording");
    expect(result.current).toHaveProperty("transcript");
    expect(result.current).toHaveProperty("partialTranscript");
    expect(result.current).toHaveProperty("error");
    expect(result.current).toHaveProperty("audioLevel");
  });

  it("should start with default state", (): void => {
    const { result } = renderHook(() => useVoiceInput());

    expect(result.current.isRecording).toBe(false);
    expect(result.current.transcript).toBe("");
    expect(result.current.partialTranscript).toBe("");
    expect(result.current.error).toBeNull();
    expect(result.current.audioLevel).toBe(0);
  });

  it("should start recording when startRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(result.current.isRecording).toBe(true);
    expect(mockGetUserMedia).toHaveBeenCalledWith({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        sampleRate: 16000,
      },
    });
  });

  it("should stop recording when stopRecording is called", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(result.current.isRecording).toBe(true);

    act(() => {
      result.current.stopRecording();
    });

    expect(result.current.isRecording).toBe(false);
  });

  it("should set error when microphone access is denied", async (): Promise<void> => {
    mockGetUserMedia.mockRejectedValueOnce(
      new DOMException("Permission denied", "NotAllowedError")
    );

    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(result.current.isRecording).toBe(false);
    expect(result.current.error).toBeTruthy();
    expect(result.current.error).toContain("microphone");
  });

  it("should connect to speech WebSocket namespace", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(io).toHaveBeenCalledWith(
      expect.any(String),
      expect.objectContaining({
        path: "/socket.io",
      })
    );
  });

  it("should emit start-transcription when recording begins", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    expect(mockSocket.emit).toHaveBeenCalledWith(
      "start-transcription",
      expect.objectContaining({
        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
        format: expect.any(String),
      })
    );
  });

  it("should emit stop-transcription when recording stops", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      result.current.stopRecording();
    });

    expect(mockSocket.emit).toHaveBeenCalledWith("stop-transcription");
  });

  it("should handle partial transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-partial"]?.({
        text: "hello world",
      });
    });

    await waitFor(() => {
      expect(result.current.partialTranscript).toBe("hello world");
    });
  });

  it("should handle final transcription events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "hello world final",
      });
    });

    await waitFor(() => {
      expect(result.current.transcript).toBe("hello world final");
    });
  });

  it("should handle transcription error events", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-error"]?.({
        message: "Transcription failed",
      });
    });

    await waitFor(() => {
      expect(result.current.error).toBe("Transcription failed");
    });
  });

  it("should call onTranscript callback when final transcription received", async (): Promise<void> => {
    const onTranscript = vi.fn();
    const { result } = renderHook(() => useVoiceInput({ onTranscript }));

    await act(async () => {
      await result.current.startRecording();
    });

    act(() => {
      socketEventHandlers["transcription-final"]?.({
        text: "final text",
      });
    });

    await waitFor(() => {
      expect(onTranscript).toHaveBeenCalledWith("final text");
    });
  });

  it("should clean up on unmount", async (): Promise<void> => {
    const { result, unmount } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    unmount();

    expect(mockSocket.disconnect).toHaveBeenCalled();
  });

  it("should not start recording if already recording", async (): Promise<void> => {
    const { result } = renderHook(() => useVoiceInput());

    await act(async () => {
      await result.current.startRecording();
    });

    // Reset the call count
    mockGetUserMedia.mockClear();

    await act(async () => {
      await result.current.startRecording();
    });

    // Should not have called getUserMedia again
    expect(mockGetUserMedia).not.toHaveBeenCalled();
  });

  describe("REST fallback", (): void => {
    it("should fall back to REST when WebSocket is unavailable", async (): Promise<void> => {
      // Simulate socket not connecting
      (mockSocket as { connected: boolean }).connected = false;

      const { result } = renderHook(() => useVoiceInput({ useWebSocket: false }));

      // Should still be able to start recording (REST mode)
      await act(async () => {
        await result.current.startRecording();
      });

      expect(result.current.isRecording).toBe(true);
    });
  });
});
435
apps/web/src/hooks/useVoiceInput.ts
Normal file
@@ -0,0 +1,435 @@
/**
 * useVoiceInput hook
 *
 * Custom hook for microphone capture and speech-to-text transcription.
 * Supports WebSocket streaming with batch transcription on stop,
 * falling back to REST upload when WebSocket is unavailable.
 */

import { useState, useCallback, useRef, useEffect } from "react";
import type { Socket } from "socket.io-client";
import { io } from "socket.io-client";
import { API_BASE_URL } from "@/lib/config";
import { apiPostFormData } from "@/lib/api/client";

/** Options for the useVoiceInput hook */
export interface UseVoiceInputOptions {
  /** Callback fired when final transcription is received */
  onTranscript?: (text: string) => void;
  /** Whether to use WebSocket streaming (default: true) */
  useWebSocket?: boolean;
  /** Audio sample rate in Hz (default: 16000) */
  sampleRate?: number;
  /** Authentication token for WebSocket connection */
  token?: string;
}

/** Return type for the useVoiceInput hook */
export interface UseVoiceInputReturn {
  /** Whether the microphone is currently recording */
  isRecording: boolean;
  /** Start microphone capture and transcription */
  startRecording: () => Promise<void>;
  /** Stop microphone capture and transcription */
  stopRecording: () => void;
  /** The final transcription text */
  transcript: string;
  /** Partial transcription text (updates in real-time) */
  partialTranscript: string;
  /** Error message if something went wrong */
  error: string | null;
  /** Current audio input level (0-1) */
  audioLevel: number;
}

interface TranscriptionPartialPayload {
  text: string;
}

interface TranscriptionFinalPayload {
  text: string;
}

interface TranscriptionErrorPayload {
  message: string;
}

interface TranscribeResponse {
  data: {
    text: string;
  };
}

/**
 * Determine the best MIME type for audio recording
 */
function getAudioMimeType(): string {
  if (typeof MediaRecorder === "undefined") {
    return "audio/webm";
  }
  const types = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
  for (const type of types) {
    if (MediaRecorder.isTypeSupported(type)) {
      return type;
    }
  }
  return "audio/webm";
}

/**
 * Hook for microphone capture and speech-to-text transcription.
 *
 * Uses WebSocket streaming by default with batch transcription on stop.
 * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
 * is disabled or unavailable.
 */
export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options;

  const [isRecording, setIsRecording] = useState(false);
  const [transcript, setTranscript] = useState("");
  const [partialTranscript, setPartialTranscript] = useState("");
  const [error, setError] = useState<string | null>(null);
  const [audioLevel, setAudioLevel] = useState(0);

  // Refs to hold mutable state without re-renders
  const socketRef = useRef<Socket | null>(null);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const streamRef = useRef<MediaStream | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const analyserRef = useRef<AnalyserNode | null>(null);
  const animationFrameRef = useRef<number | null>(null);
  const onTranscriptRef = useRef(onTranscript);
  const recordedChunksRef = useRef<Blob[]>([]);
  const isRecordingRef = useRef(false);

  // Keep callback ref up to date
  useEffect(() => {
    onTranscriptRef.current = onTranscript;
  }, [onTranscript]);

  /**
   * Set up audio analysis for visualizing input level
   */
  const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
    try {
      const audioContext = new AudioContext();
      const analyser = audioContext.createAnalyser();
      const source = audioContext.createMediaStreamSource(stream);

      analyser.fftSize = 256;
      source.connect(analyser);

      audioContextRef.current = audioContext;
      analyserRef.current = analyser;

      // Start level monitoring
      const dataArray = new Uint8Array(analyser.frequencyBinCount);

      const updateLevel = (): void => {
        if (!isRecordingRef.current) {
          return;
        }

        analyser.getByteFrequencyData(dataArray);

        // Calculate average level
        let sum = 0;
        for (const value of dataArray) {
          sum += value;
        }
        const average = sum / dataArray.length / 255;
        setAudioLevel(average);

        animationFrameRef.current = requestAnimationFrame(updateLevel);
      };

      animationFrameRef.current = requestAnimationFrame(updateLevel);
    } catch (err) {
      // Audio analysis is non-critical; continue without it
      console.warn(
        "Audio level visualization unavailable:",
        err instanceof Error ? err.message : String(err)
      );
    }
  }, []);

  /**
   * Clean up audio analysis resources
   */
  const cleanupAudioAnalysis = useCallback((): void => {
    if (animationFrameRef.current !== null) {
      cancelAnimationFrame(animationFrameRef.current);
      animationFrameRef.current = null;
    }
    if (audioContextRef.current) {
      void audioContextRef.current.close();
      audioContextRef.current = null;
    }
    analyserRef.current = null;
    setAudioLevel(0);
  }, []);

  /**
   * Connect to the speech WebSocket namespace
   */
  const connectSocket = useCallback((): Socket => {
    const socket = io(`${API_BASE_URL}/speech`, {
      path: "/socket.io",
      transports: ["websocket", "polling"],
      ...(token ? { auth: { token } } : {}),
    });

    // Future use: the gateway does not currently emit transcription-partial,
    // but the listener is registered for when real-time partial transcription is added.
    socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
      setPartialTranscript(data.text);
    });

    socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
      setTranscript(data.text);
      setPartialTranscript("");
      onTranscriptRef.current?.(data.text);
    });

    socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
      setError(data.message);
    });

    socket.on("connect_error", (err: Error) => {
      setError(`WebSocket connection failed: ${err.message}`);
    });

    socket.on("disconnect", (reason: string) => {
      if (reason !== "io client disconnect") {
        setError(`WebSocket disconnected unexpectedly: ${reason}`);
      }
    });

    socketRef.current = socket;
    return socket;
  }, [token]);

  /**
   * Disconnect the WebSocket
   */
  const disconnectSocket = useCallback((): void => {
    if (socketRef.current) {
      socketRef.current.off("transcription-partial");
      socketRef.current.off("transcription-final");
      socketRef.current.off("transcription-error");
      socketRef.current.off("connect_error");
      socketRef.current.off("disconnect");
      socketRef.current.disconnect();
      socketRef.current = null;
    }
  }, []);

  /**
   * Send recorded audio via REST API as fallback
   */
  const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
    try {
      const formData = new FormData();
      formData.append("file", audioBlob, "recording.webm");

      const response = await apiPostFormData<TranscribeResponse>(
        "/api/speech/transcribe",
        formData
      );

      if (response.data.text) {
        setTranscript(response.data.text);
        setPartialTranscript("");
        onTranscriptRef.current?.(response.data.text);
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : "Transcription request failed";
      setError(message);
    }
  }, []);

  /**
   * Stop all media tracks on the stream
   */
  const stopMediaTracks = useCallback((): void => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => {
        track.stop();
      });
      streamRef.current = null;
    }
  }, []);

  /**
   * Start microphone capture and transcription
   */
  const startRecording = useCallback(async (): Promise<void> => {
    // Prevent double-start
    if (isRecordingRef.current) {
      return;
    }

    setError(null);
    setPartialTranscript("");
    recordedChunksRef.current = [];

    try {
      // Request microphone access
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          sampleRate,
        },
      });

      streamRef.current = stream;

      // Set up audio level visualization
      setupAudioAnalysis(stream);

      // Determine MIME type
      const mimeType = getAudioMimeType();

      // Create MediaRecorder
      const mediaRecorder = new MediaRecorder(stream, { mimeType });
      mediaRecorderRef.current = mediaRecorder;

      // Connect WebSocket if enabled
      let socket: Socket | null = null;
      if (useWs) {
        socket = connectSocket();

        // Emit start-transcription event
        socket.emit("start-transcription", {
          format: mimeType,
          sampleRate,
        });
      }

      // Handle audio data chunks
      mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
        if (event.data.size > 0) {
          if (socket?.connected) {
            // Stream chunks via WebSocket
            socket.emit("audio-chunk", event.data);
          } else {
            // Collect chunks for REST upload
            recordedChunksRef.current.push(event.data);
          }
        }
      });

      // Handle recording stop
      mediaRecorder.addEventListener("stop", () => {
        // If using REST fallback, send collected audio
        if (!useWs || !socket?.connected) {
          if (recordedChunksRef.current.length > 0) {
            const audioBlob = new Blob(recordedChunksRef.current, {
              type: mimeType,
            });
            void sendAudioViaRest(audioBlob);
          }
        }
      });

      // Handle errors
      mediaRecorder.addEventListener("error", (event: Event) => {
        let errorMessage = "Recording encountered an issue. Please try again.";
        if ("error" in event && event.error instanceof DOMException) {
          errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`;
        }
        setError(errorMessage);
        setIsRecording(false);
        isRecordingRef.current = false;
        stopMediaTracks();
        cleanupAudioAnalysis();
      });

      // Start recording with timeslice for streaming chunks (250ms intervals)
      mediaRecorder.start(250);
      setIsRecording(true);
      isRecordingRef.current = true;
    } catch (err) {
      // Handle specific error types
      if (err instanceof DOMException) {
        if (err.name === "NotAllowedError") {
          setError(
            "Microphone access was not granted. Please allow microphone access to use voice input."
          );
        } else if (err.name === "NotFoundError") {
          setError("No microphone found. Please connect a microphone and try again.");
        } else {
          setError("Unable to access the microphone. Please check your device settings.");
        }
      } else {
        setError("Unable to start voice input. Please try again.");
      }

      // Clean up on failure
      stopMediaTracks();
      cleanupAudioAnalysis();
    }
  }, [
    useWs,
    sampleRate,
    setupAudioAnalysis,
    connectSocket,
    sendAudioViaRest,
    stopMediaTracks,
    cleanupAudioAnalysis,
  ]);

  /**
   * Stop microphone capture and transcription
   */
  const stopRecording = useCallback((): void => {
    setIsRecording(false);
    isRecordingRef.current = false;

    // Stop MediaRecorder
    if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
      mediaRecorderRef.current.stop();
      mediaRecorderRef.current = null;
    }

    // Stop media tracks
    stopMediaTracks();

    // Clean up audio analysis
    cleanupAudioAnalysis();

    // Emit stop event and disconnect WebSocket
    if (socketRef.current) {
      socketRef.current.emit("stop-transcription");
      // Give the server a moment to process the final chunk before disconnecting
      setTimeout(() => {
        disconnectSocket();
      }, 500);
    }
  }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);

  // Cleanup on unmount
  useEffect(() => {
    return (): void => {
      isRecordingRef.current = false;
      if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") {
        mediaRecorderRef.current.stop();
      }
      stopMediaTracks();
      cleanupAudioAnalysis();
      disconnectSocket();
    };
  }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]);

  return {
    isRecording,
    startRecording,
    stopRecording,
    transcript,
    partialTranscript,
    error,
    audioLevel,
  };
}
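A sketch of forcing the REST fallback path explicitly, e.g. when the gateway's speech namespace is not deployed. The `DictationField` component is illustrative, not part of this diff:

```tsx
import { useVoiceInput } from "@/hooks/useVoiceInput";

function DictationField() {
  // useWebSocket: false skips the socket entirely; chunks are buffered
  // and POSTed to /api/speech/transcribe when recording stops.
  const { isRecording, startRecording, stopRecording, transcript, error } = useVoiceInput({
    useWebSocket: false,
  });

  return (
    <div>
      <button
        type="button"
        onClick={() => (isRecording ? stopRecording() : void startRecording())}
      >
        {isRecording ? "Stop" : "Dictate"}
      </button>
      {transcript && <p>{transcript}</p>}
      {error && <p role="alert">{error}</p>}
    </div>
  );
}
```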
82
apps/web/src/lib/api/speech.ts
Normal file
@@ -0,0 +1,82 @@
/**
 * Speech API client
 * Handles text-to-speech synthesis and voice listing via /api/speech
 */

import { apiGet } from "./client";
import { API_BASE_URL } from "../config";

export type SpeechTier = "default" | "premium" | "fallback";

export interface VoiceInfo {
  id: string;
  name: string;
  language: string;
  gender?: string;
  preview_url?: string;
  tier?: SpeechTier;
  isDefault?: boolean;
}

export interface SynthesizeOptions {
  text: string;
  voice?: string;
  speed?: number;
  format?: string;
  tier?: string;
}

export interface VoicesResponse {
  data: VoiceInfo[];
}

export interface ProviderHealth {
  available: boolean;
}

export interface HealthResponse {
  data: {
    stt: ProviderHealth;
    tts: ProviderHealth;
  };
}

/**
 * Fetch available TTS voices
 * Optionally filter by tier (default, premium, fallback)
 */
export async function getVoices(tier?: SpeechTier): Promise<VoicesResponse> {
  const endpoint = tier ? `/api/speech/voices?tier=${tier}` : "/api/speech/voices";
  return apiGet<VoicesResponse>(endpoint);
}

/**
 * Fetch health status of speech providers (STT and TTS)
 */
export async function getHealthStatus(): Promise<HealthResponse> {
  return apiGet<HealthResponse>("/api/speech/health");
}

/**
 * Synthesize text to speech audio
 * Returns the audio as a Blob since the API returns binary audio data
 */
export async function synthesizeSpeech(options: SynthesizeOptions): Promise<Blob> {
  const url = `${API_BASE_URL}/api/speech/synthesize`;

  const response = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    credentials: "include",
    body: JSON.stringify(options),
  });

  if (!response.ok) {
    const errorText = await response.text().catch(() => "Unknown error");
    throw new Error(`Speech synthesis failed: ${errorText}`);
  }

  return response.blob();
}
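A minimal sketch of using this client outside the hooks, combining `getVoices` and `synthesizeSpeech`; the `speak` helper is illustrative, not part of this diff:

```ts
import { synthesizeSpeech, getVoices } from "@/lib/api/speech";

// Pick the first default-tier voice, synthesize, and play the resulting blob.
async function speak(text: string): Promise<void> {
  const { data: voices } = await getVoices("default");
  const blob = await synthesizeSpeech({
    text,
    ...(voices[0] ? { voice: voices[0].id } : {}),
  });

  const url = URL.createObjectURL(blob);
  const audio = new Audio(url);
  // Revoke the blob URL only once playback has finished.
  audio.addEventListener("ended", () => URL.revokeObjectURL(url), { once: true });
  await audio.play();
}
```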
113
docker-compose.speech.yml
Normal file
@@ -0,0 +1,113 @@
# ==============================================
# Speech Services - Docker Compose Dev Overlay
# ==============================================
#
# Adds STT and TTS services for local development.
#
# Usage:
#   Basic (STT + default TTS):
#     docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d
#
#   With premium TTS (requires GPU):
#     docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d
#
#   Or use Makefile targets:
#     make speech-up    # Basic speech services
#     make speech-down  # Stop speech services
#     make speech-logs  # View speech service logs
# ==============================================

services:
  # ======================
  # Speaches (STT + basic TTS)
  # ======================
  speaches:
    image: ghcr.io/speaches-ai/speaches:latest
    container_name: mosaic-speaches
    restart: unless-stopped
    environment:
      WHISPER__MODEL: ${SPEACHES_WHISPER_MODEL:-Systran/faster-whisper-large-v3-turbo}
    ports:
      - "${SPEACHES_PORT:-8090}:8000"
    volumes:
      - speaches_models:/root/.cache/huggingface
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
    networks:
      - mosaic-internal
    labels:
      - "com.mosaic.service=speech-stt"
      - "com.mosaic.description=Speaches STT (Whisper) and basic TTS"

  # ======================
  # Kokoro TTS (Default TTS)
  # ======================
  kokoro-tts:
    image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
    container_name: mosaic-kokoro-tts
    restart: unless-stopped
    ports:
      - "${KOKORO_TTS_PORT:-8880}:8880"
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8880/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
    networks:
      - mosaic-internal
    labels:
      - "com.mosaic.service=speech-tts"
      - "com.mosaic.description=Kokoro FastAPI TTS engine"

  # ======================
  # Chatterbox TTS (Premium TTS - Optional)
  # ======================
  # Only starts with: --profile premium-tts
  # Requires NVIDIA GPU with docker nvidia runtime
  chatterbox-tts:
    image: devnen/chatterbox-tts-server:latest
    container_name: mosaic-chatterbox-tts
    restart: unless-stopped
    ports:
      - "${CHATTERBOX_TTS_PORT:-8881}:8000"
    profiles:
      - premium-tts
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 180s
    networks:
      - mosaic-internal
    labels:
      - "com.mosaic.service=speech-tts-premium"
      - "com.mosaic.description=Chatterbox premium TTS with voice cloning (GPU)"

# ======================
# Volumes
# ======================
volumes:
  speaches_models:
    name: mosaic-speaches-models
    driver: local

# ======================
# Networks
# ======================
networks:
  mosaic-internal:
    external: true
    name: mosaic-internal
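A quick smoke check against the dev overlay's published host ports (8090 for Speaches and 8880 for Kokoro, per the defaults above), runnable as a standalone script with Node 24's built-in fetch; the probe script itself is illustrative, not part of this diff:

```ts
// Probe the same /health endpoints the compose healthchecks rely on.
const targets = [
  { name: "speaches", url: "http://localhost:8090/health" },
  { name: "kokoro-tts", url: "http://localhost:8880/health" },
];

for (const { name, url } of targets) {
  try {
    const res = await fetch(url);
    console.log(`${name}: ${res.ok ? "healthy" : `unhealthy (HTTP ${res.status})`}`);
  } catch (err) {
    console.log(`${name}: unreachable (${err instanceof Error ? err.message : String(err)})`);
  }
}
```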
164
docker/docker-compose.sample.speech.yml
Normal file
@@ -0,0 +1,164 @@
# ==============================================
# Speech Services - Sample Swarm Deployment
# ==============================================
#
# Standalone speech services deployment for use with Mosaic Stack.
# This is SEPARATE infrastructure — not part of the Mosaic Stack itself.
# Mosaic connects to it via SPEACHES_URL and TTS_URL environment variables.
#
# Provides:
#   - Speaches: Speech-to-Text (Whisper) + basic TTS fallback
#   - Kokoro TTS: Default high-quality text-to-speech
#   - Chatterbox TTS: Premium TTS with voice cloning (optional, requires GPU)
#
# Usage (Docker Swarm via Portainer):
#   1. Create a new stack in Portainer
#   2. Paste this file or point to the repo
#   3. Set environment variables in Portainer's env var section
#   4. Deploy the stack
#
# Usage (Docker Swarm CLI):
#   1. Create .env file with variables below
#   2. docker stack deploy -c docker-compose.sample.speech.yml speech
#
# Required Environment Variables:
#   STT_DOMAIN=stt.example.com   # Domain for Speaches (STT + basic TTS)
#   TTS_DOMAIN=tts.example.com   # Domain for Kokoro TTS (default TTS)
#
# Optional Environment Variables:
#   WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo   # Whisper model for STT
#   CHATTERBOX_TTS_DOMAIN=tts-premium.example.com         # Domain for Chatterbox (premium TTS)
#   TRAEFIK_ENTRYPOINT=websecure                          # Traefik entrypoint name
#   TRAEFIK_CERTRESOLVER=letsencrypt                      # Traefik cert resolver
#   TRAEFIK_DOCKER_NETWORK=traefik-public                 # Traefik network name
#   TRAEFIK_TLS_ENABLED=true                              # Enable TLS on Traefik routers
#
# Connecting to Mosaic Stack:
#   Add to your Mosaic Stack .env:
#     SPEACHES_URL=http://speaches:8000     (if same Docker network)
#     SPEACHES_URL=https://stt.example.com  (if external)
#     TTS_URL=http://kokoro-tts:8880        (if same Docker network)
#     TTS_URL=https://tts.example.com       (if external)
#
# GPU Requirements (Chatterbox only):
#   - NVIDIA GPU with CUDA support
#   - nvidia-container-toolkit installed on Docker host
#   - Docker runtime configured for GPU access
#   - Note: Docker Swarm requires "generic resources" for GPU scheduling.
#     See: https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm
#
# ==============================================

services:
  # ======================
  # Speaches (STT + basic TTS)
  # ======================
  # Primary speech-to-text service using Whisper.
  # Also provides basic TTS as a fallback.
  speaches:
    image: ghcr.io/speaches-ai/speaches:latest
    environment:
      WHISPER__MODEL: ${WHISPER_MODEL:-Systran/faster-whisper-large-v3-turbo}
    volumes:
      - speaches-models:/root/.cache/huggingface
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
    networks:
      - internal
      - traefik-public
    deploy:
      restart_policy:
        condition: on-failure
        delay: 10s
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.speech-stt.rule=Host(`${STT_DOMAIN}`)"
        - "traefik.http.routers.speech-stt.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
        - "traefik.http.routers.speech-stt.tls=${TRAEFIK_TLS_ENABLED:-true}"
        - "traefik.http.routers.speech-stt.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}"
        - "traefik.http.services.speech-stt.loadbalancer.server.port=8000"
        - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}"

  # ======================
  # Kokoro TTS (Default TTS)
  # ======================
  # High-quality text-to-speech engine. Always deployed alongside Speaches.
  kokoro-tts:
    image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8880/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s
    networks:
      - internal
      - traefik-public
    deploy:
      restart_policy:
        condition: on-failure
        delay: 10s
      labels:
        - "traefik.enable=true"
        - "traefik.http.routers.speech-tts.rule=Host(`${TTS_DOMAIN}`)"
        - "traefik.http.routers.speech-tts.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
        - "traefik.http.routers.speech-tts.tls=${TRAEFIK_TLS_ENABLED:-true}"
        - "traefik.http.routers.speech-tts.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}"
        - "traefik.http.services.speech-tts.loadbalancer.server.port=8880"
        - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}"

  # ======================
  # Chatterbox TTS (Premium TTS - Optional)
  # ======================
  # Premium TTS with voice cloning capabilities. Requires NVIDIA GPU.
  #
  # To enable: Uncomment this service and set CHATTERBOX_TTS_DOMAIN.
  #
  # For Docker Swarm GPU scheduling, configure generic resources on the node:
  #   /etc/docker/daemon.json:
  #     { "runtimes": { "nvidia": { ... } },
  #       "node-generic-resources": ["NVIDIA-GPU=0"] }
  #
  # chatterbox-tts:
  #   image: devnen/chatterbox-tts-server:latest
  #   healthcheck:
  #     test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
  #     interval: 30s
  #     timeout: 10s
  #     retries: 5
  #     start_period: 180s
  #   networks:
  #     - internal
  #     - traefik-public
  #   deploy:
  #     restart_policy:
  #       condition: on-failure
  #       delay: 10s
  #     resources:
  #       reservations:
  #         generic_resources:
  #           - discrete_resource_spec:
  #               kind: "NVIDIA-GPU"
  #               value: 1
  #     labels:
  #       - "traefik.enable=true"
  #       - "traefik.http.routers.speech-tts-premium.rule=Host(`${CHATTERBOX_TTS_DOMAIN}`)"
  #       - "traefik.http.routers.speech-tts-premium.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
  #       - "traefik.http.routers.speech-tts-premium.tls=${TRAEFIK_TLS_ENABLED:-true}"
  #       - "traefik.http.routers.speech-tts-premium.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}"
  #       - "traefik.http.services.speech-tts-premium.loadbalancer.server.port=8000"
  #       - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}"

volumes:
  speaches-models:

networks:
  internal:
    driver: overlay
  traefik-public:
    external: true
    name: ${TRAEFIK_DOCKER_NETWORK:-traefik-public}
929
docs/SPEECH.md
Normal file
@@ -0,0 +1,929 @@

# Speech Services

Mosaic Stack provides integrated speech-to-text (STT) and text-to-speech (TTS) services through a provider abstraction layer. Speech services are optional and modular -- each component can be independently enabled, disabled, or pointed at external infrastructure.

## Table of Contents

- [Architecture Overview](#architecture-overview)
- [Provider Abstraction](#provider-abstraction)
- [TTS Tier System and Fallback Chain](#tts-tier-system-and-fallback-chain)
- [API Endpoint Reference](#api-endpoint-reference)
- [WebSocket Streaming Protocol](#websocket-streaming-protocol)
- [Environment Variable Reference](#environment-variable-reference)
- [Provider Configuration](#provider-configuration)
- [Voice Cloning Setup (Chatterbox)](#voice-cloning-setup-chatterbox)
- [Docker Compose Setup](#docker-compose-setup)
- [GPU VRAM Budget](#gpu-vram-budget)
- [Frontend Integration](#frontend-integration)

---

## Architecture Overview

```
+-------------------+
| SpeechController  |
| (REST endpoints)  |
+--------+----------+
         |
+--------+--------------------+
|        SpeechService        |
|    (provider selection,     |
|   fallback orchestration)   |
+------+---------------+------+
       |               |
+------+-------+  +----+----------------+
| STT Provider |  |   TTS Providers     |
| (Speaches)   |  | Map<Tier, Provider> |
+------+-------+  +--+-------+-------+--+
       |             |       |       |
+------+-------+ +---+-----+ +------+---+ +---+------+
|   Speaches   | |  Kokoro | |Chatterbox| |  Piper   |
|  (Whisper)   | |(default)| |(premium) | |(fallback)|
+--------------+ +---------+ +----------+ +----------+

+---------------------+
|    SpeechGateway    |
| (WebSocket /speech) |
+----------+----------+
           |
  Uses SpeechService.transcribe()
```

The speech module (`apps/api/src/speech/`) is a self-contained NestJS module consisting of:

| Component  | File                   | Purpose                                    |
| ---------- | ---------------------- | ------------------------------------------ |
| Module     | `speech.module.ts`     | Registers providers, controllers, gateway  |
| Config     | `speech.config.ts`     | Environment validation and typed config    |
| Service    | `speech.service.ts`    | High-level speech operations with fallback |
| Controller | `speech.controller.ts` | REST API endpoints                         |
| Gateway    | `speech.gateway.ts`    | WebSocket streaming transcription          |
| Constants  | `speech.constants.ts`  | NestJS injection tokens                    |

### Key Design Decisions

1. **OpenAI-compatible APIs**: All providers (Speaches, Kokoro, Chatterbox, Piper/OpenedAI) expose OpenAI-compatible endpoints. The official OpenAI SDK is used as the HTTP client with a custom `baseURL`.

2. **Provider abstraction**: STT and TTS providers implement well-defined interfaces (`ISTTProvider`, `ITTSProvider`). New providers can be added without modifying the service layer.

3. **Conditional registration**: Providers are only instantiated when their corresponding `*_ENABLED` flag is `true`. The STT provider uses NestJS `@Optional()` injection.

4. **Fail-fast validation**: Configuration is validated at module initialization. If a service is enabled but its URL is missing, the application fails on startup with a descriptive error. (Decisions 3 and 4 are sketched below.)

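A minimal sketch of decisions 3 and 4 together; the injection token, factory shape, import path, and `SpeachesSttProvider` constructor signature here are assumptions for illustration, not the actual `speech.module.ts` code:

```typescript
// Illustrative sketch only: conditional provider registration plus
// fail-fast URL validation at startup. Names and paths are assumptions.
import type { Provider } from "@nestjs/common";
import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; // assumed path

const isOn = (v?: string) => v === "true" || v === "1";

function requireUrl(enabledFlag: string, urlVar: string): string {
  const url = process.env[urlVar];
  if (isOn(process.env[enabledFlag]) && !url) {
    // Decision 4: fail fast at startup with a descriptive error.
    throw new Error(
      `${enabledFlag}=true but required variable ${urlVar} is missing or empty.`,
    );
  }
  return url ?? "";
}

// Decision 3: only instantiate the provider when its flag is on.
// Consumers inject the token with @Optional() and tolerate null.
export const sttProvider: Provider = {
  provide: "STT_PROVIDER",
  useFactory: () =>
    isOn(process.env.STT_ENABLED)
      ? new SpeachesSttProvider(requireUrl("STT_ENABLED", "STT_BASE_URL"))
      : null,
};
```
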
---

## Provider Abstraction

### STT Provider Interface

```typescript
interface ISTTProvider {
  readonly name: string;
  transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult>;
  isHealthy(): Promise<boolean>;
}
```

Currently implemented by `SpeachesSttProvider`, which connects to a Speaches (faster-whisper) server.

### TTS Provider Interface

```typescript
interface ITTSProvider {
  readonly name: string;
  readonly tier: SpeechTier;
  synthesize(text: string, options?: SynthesizeOptions): Promise<SynthesisResult>;
  listVoices(): Promise<VoiceInfo[]>;
  isHealthy(): Promise<boolean>;
}
```

All TTS providers extend `BaseTTSProvider`, an abstract class that implements the common OpenAI-compatible synthesis logic. Concrete providers only need to set `name` and `tier`, and can optionally override `listVoices()` or `synthesize()`; a sketch follows.

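As a sketch of what a concrete provider looks like (the import paths and `VoiceInfo` field names here are assumptions, not the real files):

```typescript
// Hypothetical sketch of a concrete TTS provider. Import paths and the
// VoiceInfo shape are assumptions for illustration.
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, VoiceInfo } from "../interfaces/speech-types";

export class ExampleTtsProvider extends BaseTTSProvider {
  readonly name = "example";
  readonly tier: SpeechTier = "default";

  // Optional override: serve a static voice list instead of querying the engine.
  async listVoices(): Promise<VoiceInfo[]> {
    return [
      {
        id: "af_heart",
        name: "Heart (American Female)",
        language: "en-US",
        tier: this.tier,
        isDefault: true,
      },
    ];
  }
}
```
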
### Provider Registration

Providers are created by the TTS provider factory (`providers/tts-provider.factory.ts`) based on configuration:

| Tier       | Provider Class          | Engine                    | Requirements |
| ---------- | ----------------------- | ------------------------- | ------------ |
| `default`  | `KokoroTtsProvider`     | Kokoro-FastAPI            | CPU only     |
| `premium`  | `ChatterboxTTSProvider` | Chatterbox TTS Server     | NVIDIA GPU   |
| `fallback` | `PiperTtsProvider`      | Piper via OpenedAI Speech | CPU only     |

---

## TTS Tier System and Fallback Chain

TTS uses a tiered architecture with automatic fallback:

```
Request with tier="premium"
        |
        v
[premium]  Chatterbox available? --yes--> Use Chatterbox
        |                                      |
        no                               (success/fail)
        |
        v
[default]  Kokoro available? ------yes--> Use Kokoro
        |                                      |
        no                               (success/fail)
        |
        v
[fallback] Piper available? -------yes--> Use Piper
        |                                      |
        no                               (success/fail)
        |
        v
ServiceUnavailableException
```

**Fallback order:** `premium` -> `default` -> `fallback`

The fallback chain starts from the requested tier and proceeds downward. A tier is only attempted if:

1. It is enabled in configuration (`TTS_ENABLED`, `TTS_PREMIUM_ENABLED`, `TTS_FALLBACK_ENABLED`)
2. A provider is registered for that tier

If no tier is specified in the request, `default` is used as the starting point. The sketch below shows this walk.

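A minimal sketch of the fallback walk; the shapes and names are illustrative, not the actual `speech.service.ts` implementation:

```typescript
// Illustrative fallback walk: start at the requested tier, proceed downward,
// skip tiers without a registered provider, and fall through on failure.
type SpeechTier = "premium" | "default" | "fallback";
const TIER_ORDER: SpeechTier[] = ["premium", "default", "fallback"];

async function synthesizeWithFallback(
  providers: Map<SpeechTier, { synthesize(text: string): Promise<Buffer> }>,
  text: string,
  requested: SpeechTier = "default",
): Promise<Buffer> {
  const start = TIER_ORDER.indexOf(requested);
  for (const tier of TIER_ORDER.slice(start)) {
    const provider = providers.get(tier); // absent when the tier is disabled
    if (!provider) continue;
    try {
      return await provider.synthesize(text);
    } catch {
      // Swallow the error and try the next tier down the chain.
    }
  }
  throw new Error("No TTS provider available (surfaced as ServiceUnavailableException)");
}
```
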
---

## API Endpoint Reference

All speech endpoints are under `/api/speech/` and require authentication (Bearer token) plus workspace context (`x-workspace-id` header).

### POST /api/speech/transcribe

Transcribe an uploaded audio file to text.

**Authentication:** Bearer token + workspace membership
**Content-Type:** `multipart/form-data`

**Form Fields:**

| Field         | Type   | Required | Description                                            |
| ------------- | ------ | -------- | ------------------------------------------------------ |
| `file`        | File   | Yes      | Audio file (max 25 MB)                                 |
| `language`    | string | No       | Language code (e.g., "en", "fr"). Default: from config |
| `model`       | string | No       | Whisper model override. Default: from config           |
| `prompt`      | string | No       | Prompt to guide transcription (max 1000 chars)         |
| `temperature` | number | No       | Temperature 0.0-1.0. Lower = more deterministic        |

**Accepted Audio Formats:**
`audio/wav`, `audio/mp3`, `audio/mpeg`, `audio/webm`, `audio/ogg`, `audio/flac`, `audio/x-m4a`

**Response:**

```json
{
  "data": {
    "text": "Hello, this is a transcription test.",
    "language": "en",
    "durationSeconds": 3.5,
    "confidence": 0.95,
    "segments": [
      {
        "text": "Hello, this is a transcription test.",
        "start": 0.0,
        "end": 3.5,
        "confidence": 0.95
      }
    ]
  }
}
```

**Example:**

```bash
curl -X POST http://localhost:3001/api/speech/transcribe \
  -H "Authorization: Bearer YOUR_TOKEN" \
  -H "x-workspace-id: WORKSPACE_ID" \
  -F "file=@recording.wav" \
  -F "language=en"
```

### POST /api/speech/synthesize

Synthesize text to audio using TTS providers.

**Authentication:** Bearer token + workspace membership
**Content-Type:** `application/json`

**Request Body:**

| Field    | Type   | Required | Description                                                 |
| -------- | ------ | -------- | ----------------------------------------------------------- |
| `text`   | string | Yes      | Text to synthesize (max 4096 chars)                         |
| `voice`  | string | No       | Voice ID. Default: from config (e.g., "af_heart")           |
| `speed`  | number | No       | Speed multiplier 0.5-2.0. Default: 1.0                      |
| `format` | string | No       | Output format: mp3, wav, opus, flac, aac, pcm. Default: mp3 |
| `tier`   | string | No       | Provider tier: default, premium, fallback. Default: default |

**Response:** Binary audio data with appropriate `Content-Type` header.

| Format | Content-Type |
| ------ | ------------ |
| mp3    | `audio/mpeg` |
| wav    | `audio/wav`  |
| opus   | `audio/opus` |
| flac   | `audio/flac` |
| aac    | `audio/aac`  |
| pcm    | `audio/pcm`  |

**Example:**

```bash
curl -X POST http://localhost:3001/api/speech/synthesize \
  -H "Authorization: Bearer YOUR_TOKEN" \
  -H "x-workspace-id: WORKSPACE_ID" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello world", "voice": "af_heart", "format": "mp3"}' \
  --output speech.mp3
```

### GET /api/speech/voices

List available TTS voices across all tiers.

**Authentication:** Bearer token + workspace access
**Query Parameters:**

| Parameter | Type   | Required | Description                                |
| --------- | ------ | -------- | ------------------------------------------ |
| `tier`    | string | No       | Filter by tier: default, premium, fallback |

**Response:**

```json
{
  "data": [
    {
      "id": "af_heart",
      "name": "Heart (American Female)",
      "language": "en-US",
      "tier": "default",
      "isDefault": true
    },
    {
      "id": "am_adam",
      "name": "Adam (American Male)",
      "language": "en-US",
      "tier": "default",
      "isDefault": false
    }
  ]
}
```

**Example:**

```bash
curl -X GET 'http://localhost:3001/api/speech/voices?tier=default' \
  -H "Authorization: Bearer YOUR_TOKEN" \
  -H "x-workspace-id: WORKSPACE_ID"
```

### GET /api/speech/health

Check availability of STT and TTS providers.

**Authentication:** Bearer token + workspace access

**Response:**

```json
{
  "data": {
    "stt": { "available": true },
    "tts": { "available": true }
  }
}
```

---

## WebSocket Streaming Protocol

The speech module provides a WebSocket gateway at namespace `/speech` for real-time streaming transcription. Audio chunks are accumulated on the server and transcribed when the session is stopped.

### Connection

Connect to the `/speech` namespace with authentication:

```typescript
import { io } from "socket.io-client";

const socket = io("http://localhost:3001/speech", {
  auth: { token: "YOUR_SESSION_TOKEN" },
});
```

**Authentication methods** (checked in order):

1. `auth.token` in handshake
2. `query.token` in handshake URL
3. `Authorization: Bearer <token>` header

Connection is rejected if:

- No valid token is provided
- Session verification fails
- User has no workspace membership

**Connection timeout:** 5 seconds for authentication.

### Protocol Flow

```
Client                            Server
  |                                 |
  |--- connect (with token) ------->|
  |                                 | (authenticate, check workspace)
  |<--- connected ------------------|
  |                                 |
  |--- start-transcription -------->| { language?: "en" }
  |<--- transcription-started ------| { sessionId, language }
  |                                 |
  |--- audio-chunk ---------------->| (Buffer/Uint8Array)
  |--- audio-chunk ---------------->| (Buffer/Uint8Array)
  |--- audio-chunk ---------------->| (Buffer/Uint8Array)
  |                                 |
  |--- stop-transcription --------->|
  |                                 | (concatenate chunks, transcribe)
  |<--- transcription-final --------| { text, language, durationSeconds, ... }
  |                                 |
```

### Client Events (emit)

| Event                 | Payload                  | Description                              |
| --------------------- | ------------------------ | ---------------------------------------- |
| `start-transcription` | `{ language?: string }`  | Begin a new transcription session        |
| `audio-chunk`         | `Buffer` or `Uint8Array` | Send audio data chunk                    |
| `stop-transcription`  | (none)                   | Stop recording and trigger transcription |

### Server Events (listen)

| Event                   | Payload                                                     | Description                |
| ----------------------- | ----------------------------------------------------------- | -------------------------- |
| `transcription-started` | `{ sessionId, language }`                                   | Session created            |
| `transcription-final`   | `{ text, language, durationSeconds, confidence, segments }` | Transcription result       |
| `transcription-error`   | `{ message }`                                               | Error during transcription |

### Session Management

- One active transcription session per client connection
- Starting a new session replaces any existing session
- Sessions are cleaned up on client disconnect
- Audio chunks are accumulated in memory
- Total accumulated size is capped by `SPEECH_MAX_UPLOAD_SIZE` (default: 25 MB); the sketch after this list shows that cap

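An illustrative sketch of the per-session accumulation and size cap; the field and function names are assumptions, and the real logic lives in `speech.gateway.ts`:

```typescript
// Sketch of accumulating audio chunks in memory with a hard size cap.
const MAX_UPLOAD_SIZE = Number(process.env.SPEECH_MAX_UPLOAD_SIZE ?? 25_000_000);

interface TranscriptionSession {
  chunks: Buffer[];
  totalBytes: number;
}

function appendChunk(session: TranscriptionSession, chunk: Buffer): void {
  if (session.totalBytes + chunk.length > MAX_UPLOAD_SIZE) {
    // Mirrors the documented behavior: audio beyond the cap is rejected.
    throw new Error(`Audio exceeds SPEECH_MAX_UPLOAD_SIZE (${MAX_UPLOAD_SIZE} bytes)`);
  }
  session.chunks.push(chunk);
  session.totalBytes += chunk.length;
}
```
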
### Example Client Usage

```typescript
import { io } from "socket.io-client";

const socket = io("http://localhost:3001/speech", {
  auth: { token: sessionToken },
});

// Start recording
socket.emit("start-transcription", { language: "en" });

socket.on("transcription-started", ({ sessionId }) => {
  console.log("Session started:", sessionId);
});

// Stream audio chunks from MediaRecorder
mediaRecorder.ondataavailable = (event) => {
  if (event.data.size > 0) {
    event.data.arrayBuffer().then((buffer) => {
      socket.emit("audio-chunk", new Uint8Array(buffer));
    });
  }
};

// Stop and get result
socket.emit("stop-transcription");

socket.on("transcription-final", (result) => {
  console.log("Transcription:", result.text);
  console.log("Duration:", result.durationSeconds, "seconds");
});

socket.on("transcription-error", ({ message }) => {
  console.error("Transcription error:", message);
});
```

---

## Environment Variable Reference

### Speech-to-Text (STT)

| Variable       | Default                                 | Description                                          |
| -------------- | --------------------------------------- | ---------------------------------------------------- |
| `STT_ENABLED`  | `false`                                 | Enable speech-to-text transcription                  |
| `STT_BASE_URL` | `http://speaches:8000/v1`               | Speaches server URL (required when STT_ENABLED=true) |
| `STT_MODEL`    | `Systran/faster-whisper-large-v3-turbo` | Whisper model for transcription                      |
| `STT_LANGUAGE` | `en`                                    | Default language code                                |

### Text-to-Speech (TTS) - Default Engine (Kokoro)

| Variable             | Default                     | Description                                         |
| -------------------- | --------------------------- | --------------------------------------------------- |
| `TTS_ENABLED`        | `false`                     | Enable default TTS engine                           |
| `TTS_DEFAULT_URL`    | `http://kokoro-tts:8880/v1` | Kokoro-FastAPI URL (required when TTS_ENABLED=true) |
| `TTS_DEFAULT_VOICE`  | `af_heart`                  | Default Kokoro voice ID                             |
| `TTS_DEFAULT_FORMAT` | `mp3`                       | Default audio output format                         |

### Text-to-Speech (TTS) - Premium Engine (Chatterbox)

| Variable              | Default                         | Description                                                 |
| --------------------- | ------------------------------- | ----------------------------------------------------------- |
| `TTS_PREMIUM_ENABLED` | `false`                         | Enable premium TTS engine                                   |
| `TTS_PREMIUM_URL`     | `http://chatterbox-tts:8881/v1` | Chatterbox TTS URL (required when TTS_PREMIUM_ENABLED=true) |

### Text-to-Speech (TTS) - Fallback Engine (Piper/OpenedAI)

| Variable               | Default                          | Description                                                   |
| ---------------------- | -------------------------------- | ------------------------------------------------------------- |
| `TTS_FALLBACK_ENABLED` | `false`                          | Enable fallback TTS engine                                    |
| `TTS_FALLBACK_URL`     | `http://openedai-speech:8000/v1` | OpenedAI Speech URL (required when TTS_FALLBACK_ENABLED=true) |

### Service Limits

| Variable                      | Default    | Description                                    |
| ----------------------------- | ---------- | ---------------------------------------------- |
| `SPEECH_MAX_UPLOAD_SIZE`      | `25000000` | Maximum upload file size in bytes (25 MB)      |
| `SPEECH_MAX_DURATION_SECONDS` | `600`      | Maximum audio duration in seconds (10 minutes) |
| `SPEECH_MAX_TEXT_LENGTH`      | `4096`     | Maximum text length for TTS in characters      |

### Conditional Validation

When a service is enabled, its URL variable is required. If missing, the application fails at startup with a message like:

```
STT is enabled (STT_ENABLED=true) but required environment variables are missing or empty: STT_BASE_URL.
Either set these variables or disable by setting STT_ENABLED=false.
```

Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values default to `false`, as in the helper sketched below.

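Equivalently, as a one-line helper (the function name is illustrative):

```typescript
// Sketch of the documented boolean parsing rule.
function parseBooleanFlag(value: string | undefined): boolean {
  return value === "true" || value === "1"; // unset/empty/anything else -> false
}

parseBooleanFlag(process.env.STT_ENABLED); // "true" -> true, "1" -> true, "" -> false
```
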
---

## Provider Configuration

### Kokoro (Default Tier)

**Engine:** [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI)
**License:** Apache 2.0
**Requirements:** CPU only
**Docker Image:** `ghcr.io/remsky/kokoro-fastapi:latest-cpu`

**Capabilities:**

- 53 built-in voices across 8 languages
- Speed control: 0.25x to 4.0x
- Output formats: mp3, wav, opus, flac
- Voice metadata derived from ID prefix (language, gender, accent)

**Voice ID Format:** `{lang}{gender}_{name}`

- First character: language/accent (a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese)
- Second character: gender (f=Female, m=Male)

A sketch of this derivation appears below.

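The mapping mirrors the bullets above; the helper name and return shape are illustrative:

```typescript
// Sketch: derive voice metadata from a Kokoro voice ID prefix.
const LANGUAGES: Record<string, string> = {
  a: "en-US", b: "en-GB", e: "es", f: "fr", h: "hi", j: "ja", p: "pt", z: "zh",
};

function parseKokoroVoiceId(id: string): { language: string; gender: "Female" | "Male" } {
  const language = LANGUAGES[id[0]] ?? "unknown";
  const gender = id[1] === "f" ? "Female" : "Male";
  return { language, gender };
}

parseKokoroVoiceId("af_heart"); // { language: "en-US", gender: "Female" }
```
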
**Example voices:**

| Voice ID     | Name    | Language | Gender |
| ------------ | ------- | -------- | ------ |
| `af_heart`   | Heart   | en-US    | Female |
| `am_adam`    | Adam    | en-US    | Male   |
| `bf_alice`   | Alice   | en-GB    | Female |
| `bm_daniel`  | Daniel  | en-GB    | Male   |
| `ef_dora`    | Dora    | es       | Female |
| `ff_camille` | Camille | fr       | Female |
| `jf_alpha`   | Alpha   | ja       | Female |
| `zf_xiaobei` | Xiaobei | zh       | Female |

### Chatterbox (Premium Tier)

**Engine:** [Chatterbox TTS Server](https://github.com/devnen/chatterbox-tts-server)
**License:** Proprietary
**Requirements:** NVIDIA GPU with CUDA
**Docker Image:** `devnen/chatterbox-tts-server:latest`

**Capabilities:**

- Voice cloning via reference audio sample
- Emotion exaggeration control (0.0 - 1.0)
- Cross-language voice transfer (23 languages)
- Higher quality synthesis than default tier

**Supported Languages:**
en, fr, de, es, it, pt, nl, pl, ru, uk, ja, zh, ko, ar, hi, tr, sv, da, fi, no, cs, el, ro

**Extended Options (Chatterbox-specific):**

| Option                | Type   | Description                                               |
| --------------------- | ------ | --------------------------------------------------------- |
| `referenceAudio`      | Buffer | Audio sample for voice cloning (5-30 seconds recommended) |
| `emotionExaggeration` | number | Emotion intensity 0.0-1.0 (clamped)                       |

These are passed as extra body parameters to the OpenAI-compatible endpoint. Reference audio is base64-encoded before sending; a hypothetical sketch follows.

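A hypothetical sketch of such a call with the OpenAI SDK. The extra field names (`reference_audio`, `emotion_exaggeration`) and the model/voice values are assumptions for illustration only; the real wire-level parameter names are defined by the Chatterbox server:

```typescript
// Sketch: ride provider-specific extras along on an OpenAI-compatible call.
import OpenAI from "openai";

async function synthesizeWithCloning(text: string, referenceAudio: Buffer) {
  const client = new OpenAI({
    baseURL: process.env.TTS_PREMIUM_URL, // e.g. http://chatterbox-tts:8881/v1
    apiKey: "not-needed",                 // local server ignores the key
  });

  const body = {
    model: "chatterbox",                                // hypothetical model name
    voice: "cloned",                                    // hypothetical voice id
    input: text,
    reference_audio: referenceAudio.toString("base64"), // hypothetical extra param
    emotion_exaggeration: 0.5,                          // hypothetical extra param
  };

  // Unknown params pass through into the JSON body; cast because the extras
  // are not part of the SDK's typed SpeechCreateParams.
  const response = await client.audio.speech.create(body as any);
  return Buffer.from(await response.arrayBuffer());
}
```
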
### Piper (Fallback Tier)

**Engine:** [Piper](https://github.com/rhasspy/piper) via [OpenedAI Speech](https://github.com/matatonic/openedai-speech)
**License:** GPL (OpenedAI Speech)
**Requirements:** CPU only (runs on Raspberry Pi)
**Docker Image:** Use OpenedAI Speech image

**Capabilities:**

- 100+ voices across 40+ languages
- 6 standard OpenAI voice names (mapped to Piper voices)
- Output formats: mp3, wav, opus, flac
- Ultra-lightweight, designed for low-resource environments

**Standard Voice Mapping:**

| OpenAI Voice | Piper Voice          | Gender | Description           |
| ------------ | -------------------- | ------ | --------------------- |
| `alloy`      | en_US-amy-medium     | Female | Warm, balanced        |
| `echo`       | en_US-ryan-medium    | Male   | Clear, articulate     |
| `fable`      | en_GB-alan-medium    | Male   | British narrator      |
| `onyx`       | en_US-danny-low      | Male   | Deep, resonant        |
| `nova`       | en_US-lessac-medium  | Female | Expressive, versatile |
| `shimmer`    | en_US-kristin-medium | Female | Bright, energetic     |

### Speaches (STT)

**Engine:** [Speaches](https://github.com/speaches-ai/speaches) (faster-whisper backend)
**License:** MIT
**Requirements:** CPU (GPU optional for faster inference)
**Docker Image:** `ghcr.io/speaches-ai/speaches:latest`

**Capabilities:**

- OpenAI-compatible `/v1/audio/transcriptions` endpoint
- Whisper models via faster-whisper
- Verbose JSON response with segments and timestamps
- Language detection

**Default model:** `Systran/faster-whisper-large-v3-turbo`

The sketch below shows a transcription call against this endpoint.

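As a sketch, a call against the transcription endpoint using the official OpenAI SDK with a custom `baseURL`, as described under Key Design Decisions; the helper name is illustrative:

```typescript
// Sketch: OpenAI-compatible transcription call against a Speaches server.
import OpenAI, { toFile } from "openai";

async function transcribe(audio: Buffer): Promise<string> {
  const client = new OpenAI({
    baseURL: process.env.STT_BASE_URL, // e.g. http://speaches:8000/v1
    apiKey: "not-needed",              // local server ignores the key
  });

  const result = await client.audio.transcriptions.create({
    file: await toFile(audio, "audio.wav"),
    model: process.env.STT_MODEL ?? "Systran/faster-whisper-large-v3-turbo",
    response_format: "verbose_json", // segments + timestamps, per the docs above
  });

  return result.text;
}
```
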
---

## Voice Cloning Setup (Chatterbox)

Voice cloning is available through the Chatterbox premium TTS provider.

### Prerequisites

1. NVIDIA GPU with CUDA support
2. `nvidia-container-toolkit` installed on the Docker host
3. Docker runtime configured for GPU access
4. TTS premium tier enabled (`TTS_PREMIUM_ENABLED=true`)

### Basic Voice Cloning

Provide a reference audio sample (WAV or MP3, 5-30 seconds) when calling synthesize:

```typescript
import { SpeechService } from "./speech.service";
import type { ChatterboxSynthesizeOptions } from "./interfaces/speech-types";

const options: ChatterboxSynthesizeOptions = {
  tier: "premium",
  referenceAudio: myAudioBuffer, // 5-30 second audio sample
  emotionExaggeration: 0.5, // 0.0 = neutral, 1.0 = maximum emotion
};

const result = await speechService.synthesize("Hello, this is my cloned voice!", options);
```

### Voice Cloning Tips

- **Audio quality:** Use clean recordings without background noise
- **Duration:** 5-30 seconds works best; shorter clips may produce lower quality
- **Format:** WAV provides the best quality; MP3 is also accepted
- **Emotion:** Start with 0.5 (moderate) and adjust from there
- **Cross-language:** You can clone a voice in one language and synthesize in another

---

## Docker Compose Setup

### Development (Local)

Speech services are defined in a separate overlay file `docker-compose.speech.yml`. This keeps them optional and separate from core services.

**Start basic speech services (STT + default TTS):**

```bash
# Using docker compose directly
docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d

# Using Makefile
make speech-up
```

**Start with premium TTS (requires NVIDIA GPU):**

```bash
docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d
```

**Stop speech services:**

```bash
# Using docker compose directly
docker compose -f docker-compose.yml -f docker-compose.speech.yml down --remove-orphans

# Using Makefile
make speech-down
```

**View logs:**

```bash
make speech-logs
```

### Development Services

| Service        | Container             | Port                            | Image                                      |
| -------------- | --------------------- | ------------------------------- | ------------------------------------------ |
| Speaches (STT) | mosaic-speaches       | 8090 (host) -> 8000 (container) | `ghcr.io/speaches-ai/speaches:latest`      |
| Kokoro TTS     | mosaic-kokoro-tts     | 8880 (host) -> 8880 (container) | `ghcr.io/remsky/kokoro-fastapi:latest-cpu` |
| Chatterbox TTS | mosaic-chatterbox-tts | 8881 (host) -> 8000 (container) | `devnen/chatterbox-tts-server:latest`      |

### Production (Docker Swarm)

For production deployments, use `docker/docker-compose.sample.speech.yml`. This file is designed for Docker Swarm with Traefik integration.

**Required environment variables:**

```bash
STT_DOMAIN=stt.example.com
TTS_DOMAIN=tts.example.com
```

**Optional environment variables:**

```bash
WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo
CHATTERBOX_TTS_DOMAIN=tts-premium.example.com
TRAEFIK_ENTRYPOINT=websecure
TRAEFIK_CERTRESOLVER=letsencrypt
TRAEFIK_DOCKER_NETWORK=traefik-public
TRAEFIK_TLS_ENABLED=true
```

**Deploy:**

```bash
docker stack deploy -c docker/docker-compose.sample.speech.yml speech
```

**Connecting to Mosaic Stack:** Set the speech URLs in your Mosaic Stack `.env`:

```bash
# Same Docker network
STT_BASE_URL=http://speaches:8000/v1
TTS_DEFAULT_URL=http://kokoro-tts:8880/v1

# External / different network
STT_BASE_URL=https://stt.example.com/v1
TTS_DEFAULT_URL=https://tts.example.com/v1
```

### Health Checks

All speech containers include health checks:

| Service        | Endpoint                       | Interval | Start Period |
| -------------- | ------------------------------ | -------- | ------------ |
| Speaches       | `http://localhost:8000/health` | 30s      | 120s         |
| Kokoro TTS     | `http://localhost:8880/health` | 30s      | 120s         |
| Chatterbox TTS | `http://localhost:8000/health` | 30s      | 180s         |

Chatterbox has a longer start period (180s) because GPU model loading takes additional time.

---

## GPU VRAM Budget

Only Chatterbox requires GPU resources. The other providers (Speaches, Kokoro, Piper) are CPU-only.

### Chatterbox VRAM Requirements

| Component               | Approximate VRAM   |
| ----------------------- | ------------------ |
| Chatterbox TTS model    | ~2-4 GB            |
| Voice cloning inference | ~1-2 GB additional |
| **Total recommended**   | **4-6 GB**         |

### Shared GPU Considerations

If running multiple GPU services (e.g., Ollama for LLM + Chatterbox for TTS):

| Service              | VRAM Usage  | Notes                             |
| -------------------- | ----------- | --------------------------------- |
| Ollama (7B model)    | ~4-6 GB     | Depends on model size             |
| Ollama (13B model)   | ~8-10 GB    | Larger models need more           |
| Chatterbox TTS       | ~4-6 GB     | Voice cloning is memory-intensive |
| **Combined minimum** | **8-12 GB** | For 7B LLM + Chatterbox           |

**Recommendations:**

- 8 GB VRAM: Adequate for small LLM + Chatterbox (may need to alternate)
- 12 GB VRAM: Comfortable for 7B LLM + Chatterbox simultaneously
- 24 GB VRAM: Supports larger LLMs + Chatterbox with headroom

If VRAM is limited, consider:

1. Disabling Chatterbox (`TTS_PREMIUM_ENABLED=false`) and using Kokoro (CPU) as default
2. Using the fallback chain so Kokoro handles requests when Chatterbox is busy
3. Running Chatterbox on a separate GPU host

### Docker Swarm GPU Scheduling

For Docker Swarm deployments with GPU, configure generic resources on the node:

```json
// /etc/docker/daemon.json
{
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime"
    }
  },
  "node-generic-resources": ["NVIDIA-GPU=0"]
}
```

See the [Docker GPU Swarm documentation](https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm) for details.

---

## Frontend Integration

Speech services are consumed from the frontend through the REST API and WebSocket gateway.

### REST API Usage

**Transcribe audio:**

```typescript
async function transcribeAudio(file: File, token: string, workspaceId: string) {
  const formData = new FormData();
  formData.append("file", file);
  formData.append("language", "en");

  const response = await fetch("/api/speech/transcribe", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${token}`,
      "x-workspace-id": workspaceId,
    },
    body: formData,
  });

  const { data } = await response.json();
  return data.text;
}
```

**Synthesize speech:**

```typescript
async function synthesizeSpeech(
  text: string,
  token: string,
  workspaceId: string,
  voice = "af_heart"
) {
  const response = await fetch("/api/speech/synthesize", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${token}`,
      "x-workspace-id": workspaceId,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ text, voice, format: "mp3" }),
  });

  const audioBlob = await response.blob();
  const audioUrl = URL.createObjectURL(audioBlob);
  const audio = new Audio(audioUrl);
  audio.play();
}
```

**List voices:**

```typescript
async function listVoices(token: string, workspaceId: string, tier?: string) {
  const url = tier ? `/api/speech/voices?tier=${tier}` : "/api/speech/voices";

  const response = await fetch(url, {
    headers: {
      Authorization: `Bearer ${token}`,
      "x-workspace-id": workspaceId,
    },
  });

  const { data } = await response.json();
  return data; // VoiceInfo[]
}
```

### WebSocket Streaming Usage

For real-time transcription using the browser's MediaRecorder API:

```typescript
import { io } from "socket.io-client";

function createSpeechSocket(token: string) {
  const socket = io("/speech", {
    auth: { token },
  });

  let mediaRecorder: MediaRecorder | null = null;

  async function startRecording() {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    mediaRecorder = new MediaRecorder(stream, {
      mimeType: "audio/webm;codecs=opus",
    });

    socket.emit("start-transcription", { language: "en" });

    mediaRecorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        event.data.arrayBuffer().then((buffer) => {
          socket.emit("audio-chunk", new Uint8Array(buffer));
        });
      }
    };

    mediaRecorder.start(250); // Send chunks every 250ms
  }

  async function stopRecording(): Promise<string> {
    return new Promise((resolve, reject) => {
      socket.once("transcription-final", (result) => {
        resolve(result.text);
      });

      socket.once("transcription-error", ({ message }) => {
        reject(new Error(message));
      });

      if (mediaRecorder) {
        // Emit stop-transcription from onstop so the recorder's final
        // dataavailable chunk is flushed before the server finalizes.
        mediaRecorder.onstop = () => socket.emit("stop-transcription");
        mediaRecorder.stop();
        mediaRecorder.stream.getTracks().forEach((track) => track.stop());
        mediaRecorder = null;
      } else {
        socket.emit("stop-transcription");
      }
    });
  }

  return { socket, startRecording, stopRecording };
}
```

### Check Speech Availability

Before showing speech UI elements, check provider availability:

```typescript
async function checkSpeechHealth(token: string, workspaceId: string) {
  const response = await fetch("/api/speech/health", {
    headers: {
      Authorization: `Bearer ${token}`,
      "x-workspace-id": workspaceId,
    },
  });

  const { data } = await response.json();
  return {
    canTranscribe: data.stt.available,
    canSynthesize: data.tts.available,
  };
}
```

128
docs/tasks.md
@@ -24,69 +24,10 @@

**Orchestrator:** Claude Code
**Started:** 2026-02-12
**Branch:** develop
**Reports:** docs/reports/ci/mosaic-stack-360-\*.log
**Branch:** fix/ci-\*
**Epic:** #360

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| CI-SEC-001 | done | Update OpenBao Docker image to fix CRITICAL CVE-2025-68121 + 4 HIGH CVEs | #363 | docker | fix/ci-security | | CI-SEC-003 | worker-1 | 2026-02-12T12:40Z | 2026-02-12T12:42Z | 10K | 8K |
| CI-SEC-002 | done | Update Postgres Docker image/gosu to fix CRITICAL CVE-2025-68121 + 5 HIGH CVEs | #363 | docker | fix/ci-security | | CI-SEC-003 | worker-2 | 2026-02-12T12:40Z | 2026-02-12T12:44Z | 10K | 25K |
| CI-SEC-003 | done | Phase 1 verification: validate Docker image security fixes | #363 | docker | fix/ci-security | CI-SEC-001,CI-SEC-002 | CI-PIPE-001 | orch | 2026-02-12T12:45Z | 2026-02-12T12:47Z | 5K | 2K |
| CI-PIPE-001 | done | Fix .woodpecker/api.yml lint step to depend on prisma-generate (fixes 3,919 ESLint errors) | #364 | ci | fix/ci-pipeline | CI-SEC-003 | CI-PIPE-002 | worker-3 | 2026-02-12T12:48Z | 2026-02-12T12:50Z | 3K | 8K |
| CI-PIPE-002 | done | Phase 2 verification: validate CI pipeline fix | #364 | ci | fix/ci-pipeline | CI-PIPE-001 | CI-CQ-001 | orch | 2026-02-12T12:50Z | 2026-02-12T12:51Z | 3K | 1K |
| CI-CQ-001 | done | Fix ruff check errors in coordinator (20 errors: StrEnum, imports, line length) | #365 | coordinator | fix/ci-coordinator | CI-PIPE-002 | CI-CQ-002 | worker-4 | 2026-02-12T12:52Z | 2026-02-12T12:57Z | 8K | 25K |
| CI-CQ-002 | done | Fix mypy error in coordinator src/main.py:144 (add_exception_handler type) | #365 | coordinator | fix/ci-coordinator | CI-CQ-001 | CI-CQ-003 | worker-4 | 2026-02-12T12:52Z | 2026-02-12T12:57Z | 5K | (batched) |
| CI-CQ-003 | done | Upgrade pip in coordinator Dockerfile and document bandit B104 finding | #365 | coordinator | fix/ci-coordinator | CI-CQ-002 | CI-CQ-004 | worker-4 | 2026-02-12T12:52Z | 2026-02-12T12:57Z | 5K | (batched) |
| CI-CQ-004 | done | Phase 3 verification: validate all coordinator fixes | #365 | coordinator | fix/ci-coordinator | CI-CQ-003 | | orch | 2026-02-12T12:58Z | 2026-02-12T12:58Z | 5K | 1K |

## Pipeline #361 Follow-up Fixes

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| CI-FIX-001 | done | Fix Postgres Docker build: use COPY --from=tianon/gosu instead of go install | #363 | docker | develop | | CI-FIX-004 | worker-5 | 2026-02-12T16:10Z | 2026-02-12T16:15Z | 5K | 4K |
| CI-FIX-002 | done | Add build-shared step to API pipeline (fixes lint + typecheck: @mosaic/shared not found) | #364 | ci | develop | | CI-FIX-004 | worker-6 | 2026-02-12T16:10Z | 2026-02-12T16:17Z | 8K | 12K |
| CI-FIX-003 | done | Fix coordinator CI: use bandit.yaml config, upgrade pip in CI venv install step | #365 | coordinator | develop | | CI-FIX-004 | worker-6 | 2026-02-12T16:10Z | 2026-02-12T16:17Z | 5K | (batched) |
| CI-FIX-004 | done | Verification: all pipeline #361 fixes validated | | all | develop | CI-FIX-001,CI-FIX-002,CI-FIX-003 | | orch | 2026-02-12T16:18Z | 2026-02-12T16:20Z | 3K | 1K |

## Pipeline #362 Follow-up Fixes

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| CI-FIX2-001 | done | Fix Postgres Dockerfile: remove setuid bit (chmod +sx → chmod +x) — gosu rejects setuid | #363 | docker | develop | | CI-FIX2-004 | worker-7 | 2026-02-12T16:30Z | 2026-02-12T16:32Z | 3K | 2K |
| CI-FIX2-002 | done | Fix Trivy coordinator: upgrade setuptools>=80.9 and wheel>=0.46.2 to fix 5 HIGH CVEs | #365 | coordinator | develop | | CI-FIX2-004 | worker-8 | 2026-02-12T16:30Z | 2026-02-12T16:32Z | 5K | 3K |
| CI-FIX2-003 | done | Exclude 4 pre-existing integration test files from CI test step (M4/M5 debt, no DB migrations) | #364 | ci | develop | | CI-FIX2-004 | worker-9 | 2026-02-12T16:30Z | 2026-02-12T16:32Z | 5K | 3K |
| CI-FIX2-004 | done | Verification: validate all pipeline #362 fixes | | all | develop | CI-FIX2-001,CI-FIX2-002,CI-FIX2-003 | | orch | 2026-02-12T16:33Z | 2026-02-12T16:34Z | 3K | 2K |

## Pipeline #363 Follow-up Fixes

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| CI-FIX3-001 | done | Create .trivyignore for upstream CVEs (Go stdlib in openbao/gosu, npm bundled pkgs in node:20-alpine) | | ci | develop | | CI-FIX3-002 | orch | 2026-02-12T17:00Z | 2026-02-12T17:02Z | 5K | 3K |
| CI-FIX3-002 | done | Update all Trivy CI steps (6 steps across 5 pipelines) to use --ignorefile .trivyignore | | ci | develop | CI-FIX3-001 | CI-FIX3-003 | orch | 2026-02-12T17:02Z | 2026-02-12T17:04Z | 5K | 3K |
| CI-FIX3-003 | done | Verification: validate all pipeline #363 fixes | | all | develop | CI-FIX3-001,CI-FIX3-002 | | orch | 2026-02-12T17:04Z | 2026-02-12T17:05Z | 3K | 1K |

## Pipeline #363 CVE Mitigation (proper fixes, not just suppression)

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| CI-MIT-001 | done | Build gosu from source with Go 1.26 (eliminates 6 Go stdlib CVEs in postgres image) | #363 | docker | develop | | CI-MIT-003 | worker-10 | 2026-02-12T17:10Z | 2026-02-12T17:12Z | 8K | 5K |
| CI-MIT-002 | done | Remove npm from 3 Node.js production images (eliminates 5 npm bundled CVEs) | | apps | develop | | CI-MIT-003 | worker-11 | 2026-02-12T17:10Z | 2026-02-12T17:12Z | 5K | 5K |
| CI-MIT-003 | done | Trim .trivyignore to OpenBao-only (5 CVEs: 4 false positives + 1 upstream Go stdlib) | | ci | develop | CI-MIT-001,CI-MIT-002 | CI-MIT-004 | orch | 2026-02-12T17:13Z | 2026-02-12T17:14Z | 3K | 2K |
| CI-MIT-004 | done | Verification: 11 of 16 CVEs eliminated at source, 5 remaining documented in .trivyignore | | all | develop | CI-MIT-001,CI-MIT-002,CI-MIT-003 | | orch | 2026-02-12T17:14Z | 2026-02-12T17:15Z | 3K | 1K |

## Pipeline #365 Follow-up Fixes

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| CI-FIX5-001 | done | Add build-shared step to web.yml (fixes lint/typecheck/test: @mosaic/shared not found) | #364 | ci | develop | | CI-FIX5-003 | worker-12 | 2026-02-12T18:00Z | 2026-02-12T18:02Z | 5K | 3K |
| CI-FIX5-002 | done | Remove compiled test files from orchestrator production image (Trivy secret scan false positives) | #365 | orchestrator | develop | | CI-FIX5-003 | worker-13 | 2026-02-12T18:00Z | 2026-02-12T18:02Z | 5K | 3K |
| CI-FIX5-003 | done | Verification: validate all pipeline #365 fixes | | all | develop | CI-FIX5-001,CI-FIX5-002 | | orch | 2026-02-12T18:03Z | 2026-02-12T18:04Z | 3K | 1K |

## Pipeline #366 Fixes

**Branch:** fix/ci-366
**Reports:** docs/reports/ci/mosaic-stack-366-\*.log
**Root causes:** (1) web.yml build-shared missing @mosaic/ui build, (2) Dockerfile find -o without parens, (3) untyped event handlers

### CI Fix Round 6

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -152,3 +93,66 @@

- #387 already completed in commit 6e20fc5
- #377 is the EPIC issue — closed after all reviews remediated
- 187 tests passing after remediation (41 matrix, 20 streaming, 10 room, 26 integration, 27 herald, 25 discord, + others)

---

## M13-SpeechServices (0.0.13) — TTS & STT Integration

**Orchestrator:** Claude Code
**Started:** 2026-02-15
**Branch:** feature/m13-speech-services
**Milestone:** M13-SpeechServices (0.0.13)
**Epic:** #388

### Phase 1: Foundation (Config + Module + Providers)

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-CFG-001 | done | #401: Speech services environment variables and ConfigModule integration | #401 | api | feature/m13-speech-services | | SP-MOD-001,SP-DOC-001 | worker-1 | 2026-02-15T06:00Z | 2026-02-15T06:07Z | 15K | 15K | 51 tests, 4cc43be |
| SP-MOD-001 | done | #389: Create SpeechModule with provider abstraction layer | #389 | api | feature/m13-speech-services | SP-CFG-001 | SP-STT-001,SP-TTS-001,SP-MID-001 | worker-2 | 2026-02-15T06:08Z | 2026-02-15T06:14Z | 25K | 25K | 27 tests, c40373f |

### Phase 2: Providers (STT + TTS)

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-STT-001 | done | #390: Implement STT provider with Speaches/faster-whisper integration | #390 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-WS-001 | worker-4 | 2026-02-15T06:15Z | 2026-02-15T06:25Z | 20K | 50K | 27 tests, 3ae9e53 |
| SP-TTS-001 | done | #391: Implement tiered TTS provider architecture | #391 | api | feature/m13-speech-services | SP-MOD-001 | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-EP-002 | worker-5 | 2026-02-15T06:15Z | 2026-02-15T06:25Z | 20K | 35K | 30 tests, b5edb4f |
| SP-TTS-002 | done | #393: Implement Kokoro-FastAPI TTS provider (default tier) | #393 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | worker-6 | 2026-02-15T06:26Z | 2026-02-15T06:33Z | 15K | 25K | 48 tests, 79b1d81 |
| SP-TTS-003 | done | #394: Implement Chatterbox TTS provider (premium tier, voice cloning) | #394 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | worker-7 | 2026-02-15T06:26Z | 2026-02-15T06:34Z | 15K | 25K | 26 tests, d37c78f |
| SP-TTS-004 | done | #395: Implement Piper TTS provider via OpenedAI Speech (fallback tier) | #395 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | worker-8 | 2026-02-15T06:35Z | 2026-02-15T06:44Z | 12K | 15K | 37 tests, 6c46556 |

### Phase 3: Middleware + REST Endpoints

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-MID-001 | done | #398: Audio format validation and preprocessing middleware | #398 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-EP-002 | worker-9 | 2026-02-15T06:35Z | 2026-02-15T06:42Z | 15K | 25K | 36 tests, 7b4fda6 |
| SP-EP-001 | done | #392: Create /api/speech/transcribe REST endpoint | #392 | api | feature/m13-speech-services | SP-STT-001,SP-MID-001 | SP-WS-001,SP-FE-001 | worker-10 | 2026-02-15T06:45Z | 2026-02-15T06:52Z | 20K | 25K | 10 tests, 527262a |
| SP-EP-002 | done | #396: Create /api/speech/synthesize REST endpoint | #396 | api | feature/m13-speech-services | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-MID-001 | SP-FE-002 | worker-11 | 2026-02-15T06:45Z | 2026-02-15T06:53Z | 20K | 35K | 17 tests, 527262a |

### Phase 4: WebSocket Streaming

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-WS-001 | done | #397: Implement WebSocket streaming transcription endpoint | #397 | api | feature/m13-speech-services | SP-STT-001,SP-EP-001 | SP-FE-001 | worker-12 | 2026-02-15T06:54Z | 2026-02-15T07:00Z | 20K | 30K | 29 tests, 28c9e6f |

### Phase 5: Docker/DevOps

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-DOC-001 | done | #399: Docker Compose dev overlay for speech services | #399 | devops | feature/m13-speech-services | SP-CFG-001 | SP-DOC-002 | worker-3 | 2026-02-15T06:08Z | 2026-02-15T06:10Z | 10K | 15K | 52553c8 |
| SP-DOC-002 | done | #400: Docker Compose swarm/prod deployment for speech services | #400 | devops | feature/m13-speech-services | SP-DOC-001 | | worker-13 | 2026-02-15T06:54Z | 2026-02-15T06:56Z | 10K | 8K | b3d6d73 |

### Phase 6: Frontend

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-FE-001 | done | #402: Frontend voice input component (microphone capture + transcription) | #402 | web | feature/m13-speech-services | SP-EP-001,SP-WS-001 | SP-FE-003 | worker-14 | 2026-02-15T07:01Z | 2026-02-15T07:12Z | 25K | 50K | 34 tests, 74d6c10 |
| SP-FE-002 | done | #403: Frontend audio playback component for TTS output | #403 | web | feature/m13-speech-services | SP-EP-002 | SP-FE-003 | worker-15 | 2026-02-15T07:01Z | 2026-02-15T07:11Z | 20K | 50K | 32 tests, 74d6c10 |
| SP-FE-003 | done | #404: Frontend speech settings page (provider selection, voice config) | #404 | web | feature/m13-speech-services | SP-FE-001,SP-FE-002 | SP-E2E-001 | worker-16 | 2026-02-15T07:13Z | 2026-02-15T07:22Z | 20K | 35K | 30 tests, bc86947 |

### Phase 7: Testing + Documentation

| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| SP-E2E-001 | done | #405: E2E integration tests for speech services | #405 | api | feature/m13-speech-services | SP-EP-001,SP-EP-002,SP-WS-001,SP-FE-003 | SP-DOCS-001 | worker-17 | 2026-02-15T07:23Z | 2026-02-15T07:32Z | 25K | 35K | 30 tests, d2c7602 |
| SP-DOCS-001 | done | #406: Documentation - Speech services architecture, API, and deployment | #406 | docs | feature/m13-speech-services | SP-E2E-001 | | worker-18 | 2026-02-15T07:23Z | 2026-02-15T07:29Z | 15K | 35K | 24065aa |