From fb53272fa97df8c20fd968987e2d7105d221a7d7 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 01:56:06 -0600 Subject: [PATCH 01/19] chore(orchestrator): Bootstrap M13-SpeechServices tasks.md 18 tasks across 7 phases for TTS & STT integration. Estimated total: ~322K tokens. Co-Authored-By: Claude Opus 4.6 --- docs/tasks.md | 104 +++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 60 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 036bde0..431cfec 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -1,76 +1,60 @@ -# Tasks - -## M11-CIPipeline (0.0.11) — CI Pipeline #360 Remediation +# Tasks — M13-SpeechServices (0.0.13) **Orchestrator:** Claude Code -**Started:** 2026-02-12 -**Branch:** develop -**Reports:** docs/reports/ci/mosaic-stack-360-\*.log +**Started:** 2026-02-15 +**Branch:** feature/m13-speech-services +**Milestone:** M13-SpeechServices (0.0.13) +**Epic:** #388 -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | -| ----------- | ------ | ------------------------------------------------------------------------------------------ | ----- | ----------- | ------------------ | --------------------- | ----------- | -------- | ----------------- | ----------------- | -------- | --------- | -| CI-SEC-001 | done | Update OpenBao Docker image to fix CRITICAL CVE-2025-68121 + 4 HIGH CVEs | #363 | docker | fix/ci-security | | CI-SEC-003 | worker-1 | 2026-02-12T12:40Z | 2026-02-12T12:42Z | 10K | 8K | -| CI-SEC-002 | done | Update Postgres Docker image/gosu to fix CRITICAL CVE-2025-68121 + 5 HIGH CVEs | #363 | docker | fix/ci-security | | CI-SEC-003 | worker-2 | 2026-02-12T12:40Z | 2026-02-12T12:44Z | 10K | 25K | -| CI-SEC-003 | done | Phase 1 verification: validate Docker image security fixes | #363 | docker | fix/ci-security | CI-SEC-001,CI-SEC-002 | CI-PIPE-001 | orch | 2026-02-12T12:45Z | 2026-02-12T12:47Z | 5K | 2K | -| 
CI-PIPE-001 | done | Fix .woodpecker/api.yml lint step to depend on prisma-generate (fixes 3,919 ESLint errors) | #364 | ci | fix/ci-pipeline | CI-SEC-003 | CI-PIPE-002 | worker-3 | 2026-02-12T12:48Z | 2026-02-12T12:50Z | 3K | 8K | -| CI-PIPE-002 | done | Phase 2 verification: validate CI pipeline fix | #364 | ci | fix/ci-pipeline | CI-PIPE-001 | CI-CQ-001 | orch | 2026-02-12T12:50Z | 2026-02-12T12:51Z | 3K | 1K | -| CI-CQ-001 | done | Fix ruff check errors in coordinator (20 errors: StrEnum, imports, line length) | #365 | coordinator | fix/ci-coordinator | CI-PIPE-002 | CI-CQ-002 | worker-4 | 2026-02-12T12:52Z | 2026-02-12T12:57Z | 8K | 25K | -| CI-CQ-002 | done | Fix mypy error in coordinator src/main.py:144 (add_exception_handler type) | #365 | coordinator | fix/ci-coordinator | CI-CQ-001 | CI-CQ-003 | worker-4 | 2026-02-12T12:52Z | 2026-02-12T12:57Z | 5K | (batched) | -| CI-CQ-003 | done | Upgrade pip in coordinator Dockerfile and document bandit B104 finding | #365 | coordinator | fix/ci-coordinator | CI-CQ-002 | CI-CQ-004 | worker-4 | 2026-02-12T12:52Z | 2026-02-12T12:57Z | 5K | (batched) | -| CI-CQ-004 | done | Phase 3 verification: validate all coordinator fixes | #365 | coordinator | fix/ci-coordinator | CI-CQ-003 | | orch | 2026-02-12T12:58Z | 2026-02-12T12:58Z | 5K | 1K | +## Phase 1: Foundation (Config + Module + Providers) -## Pipeline #361 Follow-up Fixes +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-CFG-001 | not-started | #401: Speech services environment variables and ConfigModule integration | #401 | api | feature/m13-speech-services | | SP-MOD-001,SP-DOC-001 | | | | 15K | | | +| SP-MOD-001 | not-started | #389: Create SpeechModule with provider abstraction layer | #389 | api | feature/m13-speech-services | SP-CFG-001 | SP-STT-001,SP-TTS-001,SP-MID-001 | | | | 25K | | | -| id | status | 
description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | -| ---------- | ------ | ---------------------------------------------------------------------------------------- | ----- | ----------- | ------- | -------------------------------- | ---------- | -------- | ----------------- | ----------------- | -------- | --------- | -| CI-FIX-001 | done | Fix Postgres Docker build: use COPY --from=tianon/gosu instead of go install | #363 | docker | develop | | CI-FIX-004 | worker-5 | 2026-02-12T16:10Z | 2026-02-12T16:15Z | 5K | 4K | -| CI-FIX-002 | done | Add build-shared step to API pipeline (fixes lint + typecheck: @mosaic/shared not found) | #364 | ci | develop | | CI-FIX-004 | worker-6 | 2026-02-12T16:10Z | 2026-02-12T16:17Z | 8K | 12K | -| CI-FIX-003 | done | Fix coordinator CI: use bandit.yaml config, upgrade pip in CI venv install step | #365 | coordinator | develop | | CI-FIX-004 | worker-6 | 2026-02-12T16:10Z | 2026-02-12T16:17Z | 5K | (batched) | -| CI-FIX-004 | done | Verification: all pipeline #361 fixes validated | | all | develop | CI-FIX-001,CI-FIX-002,CI-FIX-003 | | orch | 2026-02-12T16:18Z | 2026-02-12T16:20Z | 3K | 1K | +## Phase 2: Providers (STT + TTS) -## Pipeline #362 Follow-up Fixes +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-STT-001 | not-started | #390: Implement STT provider with Speaches/faster-whisper integration | #390 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-WS-001 | | | | 20K | | | +| SP-TTS-001 | not-started | #391: Implement tiered TTS provider architecture | #391 | api | feature/m13-speech-services | SP-MOD-001 | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-EP-002 | | | | 20K | | | +| SP-TTS-002 | not-started | #393: Implement Kokoro-FastAPI TTS provider (default tier) | #393 | api | feature/m13-speech-services | 
SP-TTS-001 | SP-EP-002 | | | | 15K | | | +| SP-TTS-003 | not-started | #394: Implement Chatterbox TTS provider (premium tier, voice cloning) | #394 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | | | | 15K | | | +| SP-TTS-004 | not-started | #395: Implement Piper TTS provider via OpenedAI Speech (fallback tier) | #395 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | | | | 12K | | | -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | -| ----------- | ------ | ---------------------------------------------------------------------------------------------- | ----- | ----------- | ------- | ----------------------------------- | ----------- | -------- | ----------------- | ----------------- | -------- | ---- | -| CI-FIX2-001 | done | Fix Postgres Dockerfile: remove setuid bit (chmod +sx → chmod +x) — gosu rejects setuid | #363 | docker | develop | | CI-FIX2-004 | worker-7 | 2026-02-12T16:30Z | 2026-02-12T16:32Z | 3K | 2K | -| CI-FIX2-002 | done | Fix Trivy coordinator: upgrade setuptools>=80.9 and wheel>=0.46.2 to fix 5 HIGH CVEs | #365 | coordinator | develop | | CI-FIX2-004 | worker-8 | 2026-02-12T16:30Z | 2026-02-12T16:32Z | 5K | 3K | -| CI-FIX2-003 | done | Exclude 4 pre-existing integration test files from CI test step (M4/M5 debt, no DB migrations) | #364 | ci | develop | | CI-FIX2-004 | worker-9 | 2026-02-12T16:30Z | 2026-02-12T16:32Z | 5K | 3K | -| CI-FIX2-004 | done | Verification: validate all pipeline #362 fixes | | all | develop | CI-FIX2-001,CI-FIX2-002,CI-FIX2-003 | | orch | 2026-02-12T16:33Z | 2026-02-12T16:34Z | 3K | 2K | +## Phase 3: Middleware + REST Endpoints -## Pipeline #363 Follow-up Fixes +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-MID-001 | not-started | #398: Audio format validation 
and preprocessing middleware | #398 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-EP-002 | | | | 15K | | | +| SP-EP-001 | not-started | #392: Create /api/speech/transcribe REST endpoint | #392 | api | feature/m13-speech-services | SP-STT-001,SP-MID-001 | SP-WS-001,SP-FE-001 | | | | 20K | | | +| SP-EP-002 | not-started | #396: Create /api/speech/synthesize REST endpoint | #396 | api | feature/m13-speech-services | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-MID-001 | SP-FE-002 | | | | 20K | | | -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | -| ----------- | ------ | ----------------------------------------------------------------------------------------------------- | ----- | ---- | ------- | ----------------------- | ----------- | ----- | ----------------- | ----------------- | -------- | ---- | -| CI-FIX3-001 | done | Create .trivyignore for upstream CVEs (Go stdlib in openbao/gosu, npm bundled pkgs in node:20-alpine) | | ci | develop | | CI-FIX3-002 | orch | 2026-02-12T17:00Z | 2026-02-12T17:02Z | 5K | 3K | -| CI-FIX3-002 | done | Update all Trivy CI steps (6 steps across 5 pipelines) to use --ignorefile .trivyignore | | ci | develop | CI-FIX3-001 | CI-FIX3-003 | orch | 2026-02-12T17:02Z | 2026-02-12T17:04Z | 5K | 3K | -| CI-FIX3-003 | done | Verification: validate all pipeline #363 fixes | | all | develop | CI-FIX3-001,CI-FIX3-002 | | orch | 2026-02-12T17:04Z | 2026-02-12T17:05Z | 3K | 1K | +## Phase 4: WebSocket Streaming -## Pipeline #363 CVE Mitigation (proper fixes, not just suppression) +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-WS-001 | not-started | #397: Implement WebSocket streaming transcription endpoint | #397 | api | feature/m13-speech-services | SP-STT-001,SP-EP-001 | SP-FE-001 | | | | 20K | | | -| 
id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | -| ---------- | ------ | ---------------------------------------------------------------------------------------- | ----- | ------ | ------- | -------------------------------- | ---------- | --------- | ----------------- | ----------------- | -------- | ---- | -| CI-MIT-001 | done | Build gosu from source with Go 1.26 (eliminates 6 Go stdlib CVEs in postgres image) | #363 | docker | develop | | CI-MIT-003 | worker-10 | 2026-02-12T17:10Z | 2026-02-12T17:12Z | 8K | 5K | -| CI-MIT-002 | done | Remove npm from 3 Node.js production images (eliminates 5 npm bundled CVEs) | | apps | develop | | CI-MIT-003 | worker-11 | 2026-02-12T17:10Z | 2026-02-12T17:12Z | 5K | 5K | -| CI-MIT-003 | done | Trim .trivyignore to OpenBao-only (5 CVEs: 4 false positives + 1 upstream Go stdlib) | | ci | develop | CI-MIT-001,CI-MIT-002 | CI-MIT-004 | orch | 2026-02-12T17:13Z | 2026-02-12T17:14Z | 3K | 2K | -| CI-MIT-004 | done | Verification: 11 of 16 CVEs eliminated at source, 5 remaining documented in .trivyignore | | all | develop | CI-MIT-001,CI-MIT-002,CI-MIT-003 | | orch | 2026-02-12T17:14Z | 2026-02-12T17:15Z | 3K | 1K | +## Phase 5: Docker/DevOps -## Pipeline #365 Follow-up Fixes +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-DOC-001 | not-started | #399: Docker Compose dev overlay for speech services | #399 | devops | feature/m13-speech-services | SP-CFG-001 | SP-DOC-002 | | | | 10K | | | +| SP-DOC-002 | not-started | #400: Docker Compose swarm/prod deployment for speech services | #400 | devops | feature/m13-speech-services | SP-DOC-001 | | | | | 10K | | | -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | -| ----------- | 
------ | ------------------------------------------------------------------------------------------------- | ----- | ------------ | ------- | ----------------------- | ----------- | --------- | ----------------- | ----------------- | -------- | ---- | -| CI-FIX5-001 | done | Add build-shared step to web.yml (fixes lint/typecheck/test: @mosaic/shared not found) | #364 | ci | develop | | CI-FIX5-003 | worker-12 | 2026-02-12T18:00Z | 2026-02-12T18:02Z | 5K | 3K | -| CI-FIX5-002 | done | Remove compiled test files from orchestrator production image (Trivy secret scan false positives) | #365 | orchestrator | develop | | CI-FIX5-003 | worker-13 | 2026-02-12T18:00Z | 2026-02-12T18:02Z | 5K | 3K | -| CI-FIX5-003 | done | Verification: validate all pipeline #365 fixes | | all | develop | CI-FIX5-001,CI-FIX5-002 | | orch | 2026-02-12T18:03Z | 2026-02-12T18:04Z | 3K | 1K | +## Phase 6: Frontend -## Pipeline #366 Fixes +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-FE-001 | not-started | #402: Frontend voice input component (microphone capture + transcription) | #402 | web | feature/m13-speech-services | SP-EP-001,SP-WS-001 | SP-FE-003 | | | | 25K | | | +| SP-FE-002 | not-started | #403: Frontend audio playback component for TTS output | #403 | web | feature/m13-speech-services | SP-EP-002 | SP-FE-003 | | | | 20K | | | +| SP-FE-003 | not-started | #404: Frontend speech settings page (provider selection, voice config) | #404 | web | feature/m13-speech-services | SP-FE-001,SP-FE-002 | SP-E2E-001 | | | | 20K | | | -**Branch:** fix/ci-366 -**Reports:** docs/reports/ci/mosaic-stack-366-\*.log -**Root causes:** (1) web.yml build-shared missing @mosaic/ui build, (2) Dockerfile find -o without parens, (3) untyped event handlers +## Phase 7: Testing + Documentation -| id | status | description | issue | repo | branch | depends_on | 
blocks | agent | started_at | completed_at | estimate | used | -| ----------- | ------ | -------------------------------------------------------------------------------------------- | ----- | ------------ | ---------- | ----------------------- | ----------- | ----- | ----------------- | ----------------- | -------- | ---- | -| CI-FIX6-001 | done | Add @mosaic/ui build to web.yml build-shared step (fixes 10 test suites + 20 typecheck errs) | | ci | fix/ci-366 | | CI-FIX6-003 | w-14 | 2026-02-12T21:00Z | 2026-02-12T21:01Z | 3K | 3K | -| CI-FIX6-002 | done | Move spec file removal to builder stage (layer-aware); add tar CVEs to .trivyignore | | orchestrator | fix/ci-366 | | CI-FIX6-004 | w-15 | 2026-02-12T21:00Z | 2026-02-12T21:15Z | 3K | 5K | -| CI-FIX6-003 | done | Add React.ChangeEvent types to ~10 web files with untyped event handlers (49 lint + 19 TS) | | web | fix/ci-366 | CI-FIX6-001 | CI-FIX6-004 | w-16 | 2026-02-12T21:02Z | 2026-02-12T21:08Z | 12K | 8K | -| CI-FIX6-004 | done | Verification: pnpm lint && pnpm typecheck && pnpm test on web; Dockerfile find validation | | all | fix/ci-366 | CI-FIX6-002,CI-FIX6-003 | | orch | 2026-02-12T21:08Z | 2026-02-12T21:10Z | 5K | 2K | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| SP-E2E-001 | not-started | #405: E2E integration tests for speech services | #405 | api | feature/m13-speech-services | SP-EP-001,SP-EP-002,SP-WS-001,SP-FE-003 | SP-DOCS-001 | | | | 25K | | | +| SP-DOCS-001 | not-started | #406: Documentation - Speech services architecture, API, and deployment | #406 | docs | feature/m13-speech-services | SP-E2E-001 | | | | | 15K | | | From 4cc43bece6c9a67d86a953688c81ebd4cf5c72fa Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:03:21 -0600 Subject: [PATCH 02/19] feat(#401): add speech services config and env vars Add SpeechConfig with 
typed configuration and startup validation for STT (Whisper/Speaches), TTS default (Kokoro), TTS premium (Chatterbox), and TTS fallback (Piper/OpenedAI). Includes registerAs factory for NestJS ConfigModule integration, .env.example documentation, and 51 unit tests covering all validation paths. Refs #401 --- .env.example | 39 ++ apps/api/AGENTS.md | 19 +- apps/api/src/speech/speech.config.spec.ts | 458 ++++++++++++++++++++++ apps/api/src/speech/speech.config.ts | 304 ++++++++++++++ 4 files changed, 814 insertions(+), 6 deletions(-) create mode 100644 apps/api/src/speech/speech.config.spec.ts create mode 100644 apps/api/src/speech/speech.config.ts diff --git a/.env.example b/.env.example index 9ca59fd..05a7d8d 100644 --- a/.env.example +++ b/.env.example @@ -350,6 +350,45 @@ OLLAMA_MODEL=llama3.1:latest # Get your API key from: https://platform.openai.com/api-keys # OPENAI_API_KEY=sk-... +# ====================== +# Speech Services (STT / TTS) +# ====================== +# Speech-to-Text (STT) - Whisper via Speaches +# Set STT_ENABLED=true to enable speech-to-text transcription +# STT_BASE_URL is required when STT_ENABLED=true +STT_ENABLED=true +STT_BASE_URL=http://speaches:8000/v1 +STT_MODEL=Systran/faster-whisper-large-v3-turbo +STT_LANGUAGE=en + +# Text-to-Speech (TTS) - Default Engine (Kokoro) +# Set TTS_ENABLED=true to enable text-to-speech synthesis +# TTS_DEFAULT_URL is required when TTS_ENABLED=true +TTS_ENABLED=true +TTS_DEFAULT_URL=http://kokoro-tts:8880/v1 +TTS_DEFAULT_VOICE=af_heart +TTS_DEFAULT_FORMAT=mp3 + +# Text-to-Speech (TTS) - Premium Engine (Chatterbox) - Optional +# Higher quality voice cloning engine, disabled by default +# TTS_PREMIUM_URL is required when TTS_PREMIUM_ENABLED=true +TTS_PREMIUM_ENABLED=false +TTS_PREMIUM_URL=http://chatterbox-tts:8881/v1 + +# Text-to-Speech (TTS) - Fallback Engine (Piper/OpenedAI) - Optional +# Lightweight fallback engine, disabled by default +# TTS_FALLBACK_URL is required when TTS_FALLBACK_ENABLED=true 
+TTS_FALLBACK_ENABLED=false +TTS_FALLBACK_URL=http://openedai-speech:8000/v1 + +# Speech Service Limits +# Maximum upload file size in bytes (default: 25MB) +SPEECH_MAX_UPLOAD_SIZE=25000000 +# Maximum audio duration in seconds (default: 600 = 10 minutes) +SPEECH_MAX_DURATION_SECONDS=600 +# Maximum text length for TTS in characters (default: 4096) +SPEECH_MAX_TEXT_LENGTH=4096 + # ====================== # Logging & Debugging # ====================== diff --git a/apps/api/AGENTS.md b/apps/api/AGENTS.md index 7c937ef..db1a989 100644 --- a/apps/api/AGENTS.md +++ b/apps/api/AGENTS.md @@ -4,15 +4,22 @@ ## Patterns - +- **Config validation pattern**: Config files use exported validation functions + typed getter functions (not class-validator). See `auth.config.ts`, `federation.config.ts`, `speech/speech.config.ts`. Pattern: export `isXEnabled()`, `validateXConfig()`, and `getXConfig()` functions. +- **Config registerAs**: `speech.config.ts` also exports a `registerAs("speech", ...)` factory for NestJS ConfigModule namespaced injection. Use `ConfigModule.forFeature(speechConfig)` in module imports and access via `this.config.get('speech.stt.baseUrl')`. +- **Conditional config validation**: When a service has an enabled flag (e.g., `STT_ENABLED`), URL/connection vars are only required when enabled. Validation throws with a helpful message suggesting how to disable. +- **Boolean env parsing**: Use `value === "true" || value === "1"` pattern. No default-true -- all services default to disabled when env var is unset. ## Gotchas - +- **Prisma client must be generated** before `tsc --noEmit` will pass. Run `pnpm prisma:generate` first. Pre-existing type errors from Prisma are expected in worktrees without generated client. +- **Pre-commit hooks**: lint-staged runs on staged files. If other packages' files are staged, their lint must pass too. Only stage files you intend to commit. 
+- **vitest runs all test files**: Even when targeting a specific test file, vitest loads all spec files. Many will fail if Prisma client isn't generated -- this is expected. Check only your target file's pass/fail status. ## Key Files -| File | Purpose | -| ---- | ------- | - - +| File | Purpose | +| ------------------------------------- | ---------------------------------------------------------------------- | +| `src/speech/speech.config.ts` | Speech services env var validation and typed config (STT, TTS, limits) | +| `src/speech/speech.config.spec.ts` | Unit tests for speech config validation (51 tests) | +| `src/auth/auth.config.ts` | Auth/OIDC config validation (reference pattern) | +| `src/federation/federation.config.ts` | Federation config validation (reference pattern) | diff --git a/apps/api/src/speech/speech.config.spec.ts b/apps/api/src/speech/speech.config.spec.ts new file mode 100644 index 0000000..f88be85 --- /dev/null +++ b/apps/api/src/speech/speech.config.spec.ts @@ -0,0 +1,458 @@ +/** + * Speech Configuration Tests + * + * Issue #401: Tests for speech services environment variable validation + * Tests cover STT, TTS (default, premium, fallback), and speech limits configuration. 
+ */ + +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { + isSttEnabled, + isTtsEnabled, + isTtsPremiumEnabled, + isTtsFallbackEnabled, + validateSpeechConfig, + getSpeechConfig, + type SpeechConfig, +} from "./speech.config"; + +describe("speech.config", () => { + const originalEnv = { ...process.env }; + + beforeEach(() => { + // Clear all speech-related env vars before each test + delete process.env.STT_ENABLED; + delete process.env.STT_BASE_URL; + delete process.env.STT_MODEL; + delete process.env.STT_LANGUAGE; + delete process.env.TTS_ENABLED; + delete process.env.TTS_DEFAULT_URL; + delete process.env.TTS_DEFAULT_VOICE; + delete process.env.TTS_DEFAULT_FORMAT; + delete process.env.TTS_PREMIUM_ENABLED; + delete process.env.TTS_PREMIUM_URL; + delete process.env.TTS_FALLBACK_ENABLED; + delete process.env.TTS_FALLBACK_URL; + delete process.env.SPEECH_MAX_UPLOAD_SIZE; + delete process.env.SPEECH_MAX_DURATION_SECONDS; + delete process.env.SPEECH_MAX_TEXT_LENGTH; + }); + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + // ========================================== + // STT enabled check + // ========================================== + describe("isSttEnabled", () => { + it("should return false when STT_ENABLED is not set", () => { + expect(isSttEnabled()).toBe(false); + }); + + it("should return false when STT_ENABLED is 'false'", () => { + process.env.STT_ENABLED = "false"; + expect(isSttEnabled()).toBe(false); + }); + + it("should return false when STT_ENABLED is '0'", () => { + process.env.STT_ENABLED = "0"; + expect(isSttEnabled()).toBe(false); + }); + + it("should return false when STT_ENABLED is empty string", () => { + process.env.STT_ENABLED = ""; + expect(isSttEnabled()).toBe(false); + }); + + it("should return true when STT_ENABLED is 'true'", () => { + process.env.STT_ENABLED = "true"; + expect(isSttEnabled()).toBe(true); + }); + + it("should return true when STT_ENABLED is '1'", () => { + 
process.env.STT_ENABLED = "1"; + expect(isSttEnabled()).toBe(true); + }); + }); + + // ========================================== + // TTS enabled check + // ========================================== + describe("isTtsEnabled", () => { + it("should return false when TTS_ENABLED is not set", () => { + expect(isTtsEnabled()).toBe(false); + }); + + it("should return false when TTS_ENABLED is 'false'", () => { + process.env.TTS_ENABLED = "false"; + expect(isTtsEnabled()).toBe(false); + }); + + it("should return true when TTS_ENABLED is 'true'", () => { + process.env.TTS_ENABLED = "true"; + expect(isTtsEnabled()).toBe(true); + }); + + it("should return true when TTS_ENABLED is '1'", () => { + process.env.TTS_ENABLED = "1"; + expect(isTtsEnabled()).toBe(true); + }); + }); + + // ========================================== + // TTS premium enabled check + // ========================================== + describe("isTtsPremiumEnabled", () => { + it("should return false when TTS_PREMIUM_ENABLED is not set", () => { + expect(isTtsPremiumEnabled()).toBe(false); + }); + + it("should return false when TTS_PREMIUM_ENABLED is 'false'", () => { + process.env.TTS_PREMIUM_ENABLED = "false"; + expect(isTtsPremiumEnabled()).toBe(false); + }); + + it("should return true when TTS_PREMIUM_ENABLED is 'true'", () => { + process.env.TTS_PREMIUM_ENABLED = "true"; + expect(isTtsPremiumEnabled()).toBe(true); + }); + }); + + // ========================================== + // TTS fallback enabled check + // ========================================== + describe("isTtsFallbackEnabled", () => { + it("should return false when TTS_FALLBACK_ENABLED is not set", () => { + expect(isTtsFallbackEnabled()).toBe(false); + }); + + it("should return false when TTS_FALLBACK_ENABLED is 'false'", () => { + process.env.TTS_FALLBACK_ENABLED = "false"; + expect(isTtsFallbackEnabled()).toBe(false); + }); + + it("should return true when TTS_FALLBACK_ENABLED is 'true'", () => { + process.env.TTS_FALLBACK_ENABLED = 
"true"; + expect(isTtsFallbackEnabled()).toBe(true); + }); + }); + + // ========================================== + // validateSpeechConfig + // ========================================== + describe("validateSpeechConfig", () => { + describe("when all services are disabled", () => { + it("should not throw when no speech services are enabled", () => { + expect(() => validateSpeechConfig()).not.toThrow(); + }); + + it("should not throw when services are explicitly disabled", () => { + process.env.STT_ENABLED = "false"; + process.env.TTS_ENABLED = "false"; + process.env.TTS_PREMIUM_ENABLED = "false"; + process.env.TTS_FALLBACK_ENABLED = "false"; + expect(() => validateSpeechConfig()).not.toThrow(); + }); + }); + + describe("STT validation", () => { + beforeEach(() => { + process.env.STT_ENABLED = "true"; + }); + + it("should throw when STT is enabled but STT_BASE_URL is missing", () => { + expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL"); + expect(() => validateSpeechConfig()).toThrow( + "STT is enabled (STT_ENABLED=true) but required environment variables are missing" + ); + }); + + it("should throw when STT_BASE_URL is empty string", () => { + process.env.STT_BASE_URL = ""; + expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL"); + }); + + it("should throw when STT_BASE_URL is whitespace only", () => { + process.env.STT_BASE_URL = " "; + expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL"); + }); + + it("should not throw when STT is enabled and STT_BASE_URL is set", () => { + process.env.STT_BASE_URL = "http://speaches:8000/v1"; + expect(() => validateSpeechConfig()).not.toThrow(); + }); + + it("should suggest disabling STT in error message", () => { + expect(() => validateSpeechConfig()).toThrow("STT_ENABLED=false"); + }); + }); + + describe("TTS default validation", () => { + beforeEach(() => { + process.env.TTS_ENABLED = "true"; + }); + + it("should throw when TTS is enabled but TTS_DEFAULT_URL is missing", () => { + expect(() => 
validateSpeechConfig()).toThrow("TTS_DEFAULT_URL"); + expect(() => validateSpeechConfig()).toThrow( + "TTS is enabled (TTS_ENABLED=true) but required environment variables are missing" + ); + }); + + it("should throw when TTS_DEFAULT_URL is empty string", () => { + process.env.TTS_DEFAULT_URL = ""; + expect(() => validateSpeechConfig()).toThrow("TTS_DEFAULT_URL"); + }); + + it("should not throw when TTS is enabled and TTS_DEFAULT_URL is set", () => { + process.env.TTS_DEFAULT_URL = "http://kokoro-tts:8880/v1"; + expect(() => validateSpeechConfig()).not.toThrow(); + }); + + it("should suggest disabling TTS in error message", () => { + expect(() => validateSpeechConfig()).toThrow("TTS_ENABLED=false"); + }); + }); + + describe("TTS premium validation", () => { + beforeEach(() => { + process.env.TTS_PREMIUM_ENABLED = "true"; + }); + + it("should throw when TTS premium is enabled but TTS_PREMIUM_URL is missing", () => { + expect(() => validateSpeechConfig()).toThrow("TTS_PREMIUM_URL"); + expect(() => validateSpeechConfig()).toThrow( + "TTS premium is enabled (TTS_PREMIUM_ENABLED=true) but required environment variables are missing" + ); + }); + + it("should throw when TTS_PREMIUM_URL is empty string", () => { + process.env.TTS_PREMIUM_URL = ""; + expect(() => validateSpeechConfig()).toThrow("TTS_PREMIUM_URL"); + }); + + it("should not throw when TTS premium is enabled and TTS_PREMIUM_URL is set", () => { + process.env.TTS_PREMIUM_URL = "http://chatterbox-tts:8881/v1"; + expect(() => validateSpeechConfig()).not.toThrow(); + }); + + it("should suggest disabling TTS premium in error message", () => { + expect(() => validateSpeechConfig()).toThrow("TTS_PREMIUM_ENABLED=false"); + }); + }); + + describe("TTS fallback validation", () => { + beforeEach(() => { + process.env.TTS_FALLBACK_ENABLED = "true"; + }); + + it("should throw when TTS fallback is enabled but TTS_FALLBACK_URL is missing", () => { + expect(() => validateSpeechConfig()).toThrow("TTS_FALLBACK_URL"); + 
expect(() => validateSpeechConfig()).toThrow( + "TTS fallback is enabled (TTS_FALLBACK_ENABLED=true) but required environment variables are missing" + ); + }); + + it("should throw when TTS_FALLBACK_URL is empty string", () => { + process.env.TTS_FALLBACK_URL = ""; + expect(() => validateSpeechConfig()).toThrow("TTS_FALLBACK_URL"); + }); + + it("should not throw when TTS fallback is enabled and TTS_FALLBACK_URL is set", () => { + process.env.TTS_FALLBACK_URL = "http://openedai-speech:8000/v1"; + expect(() => validateSpeechConfig()).not.toThrow(); + }); + + it("should suggest disabling TTS fallback in error message", () => { + expect(() => validateSpeechConfig()).toThrow("TTS_FALLBACK_ENABLED=false"); + }); + }); + + describe("multiple services enabled simultaneously", () => { + it("should validate all enabled services", () => { + process.env.STT_ENABLED = "true"; + process.env.TTS_ENABLED = "true"; + // Missing both STT_BASE_URL and TTS_DEFAULT_URL + + expect(() => validateSpeechConfig()).toThrow("STT_BASE_URL"); + }); + + it("should pass when all enabled services are properly configured", () => { + process.env.STT_ENABLED = "true"; + process.env.STT_BASE_URL = "http://speaches:8000/v1"; + process.env.TTS_ENABLED = "true"; + process.env.TTS_DEFAULT_URL = "http://kokoro-tts:8880/v1"; + process.env.TTS_PREMIUM_ENABLED = "true"; + process.env.TTS_PREMIUM_URL = "http://chatterbox-tts:8881/v1"; + process.env.TTS_FALLBACK_ENABLED = "true"; + process.env.TTS_FALLBACK_URL = "http://openedai-speech:8000/v1"; + + expect(() => validateSpeechConfig()).not.toThrow(); + }); + }); + + describe("limits validation", () => { + it("should throw when SPEECH_MAX_UPLOAD_SIZE is not a valid number", () => { + process.env.SPEECH_MAX_UPLOAD_SIZE = "not-a-number"; + expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_UPLOAD_SIZE"); + expect(() => validateSpeechConfig()).toThrow("must be a positive integer"); + }); + + it("should throw when SPEECH_MAX_UPLOAD_SIZE is negative", () => { + 
process.env.SPEECH_MAX_UPLOAD_SIZE = "-100"; + expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_UPLOAD_SIZE"); + }); + + it("should throw when SPEECH_MAX_UPLOAD_SIZE is zero", () => { + process.env.SPEECH_MAX_UPLOAD_SIZE = "0"; + expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_UPLOAD_SIZE"); + }); + + it("should throw when SPEECH_MAX_DURATION_SECONDS is not a valid number", () => { + process.env.SPEECH_MAX_DURATION_SECONDS = "abc"; + expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_DURATION_SECONDS"); + }); + + it("should throw when SPEECH_MAX_TEXT_LENGTH is not a valid number", () => { + process.env.SPEECH_MAX_TEXT_LENGTH = "xyz"; + expect(() => validateSpeechConfig()).toThrow("SPEECH_MAX_TEXT_LENGTH"); + }); + + it("should not throw when limits are valid positive integers", () => { + process.env.SPEECH_MAX_UPLOAD_SIZE = "50000000"; + process.env.SPEECH_MAX_DURATION_SECONDS = "1200"; + process.env.SPEECH_MAX_TEXT_LENGTH = "8192"; + expect(() => validateSpeechConfig()).not.toThrow(); + }); + + it("should not throw when limits are not set (uses defaults)", () => { + expect(() => validateSpeechConfig()).not.toThrow(); + }); + }); + }); + + // ========================================== + // getSpeechConfig + // ========================================== + describe("getSpeechConfig", () => { + it("should return default values when no env vars are set", () => { + const config = getSpeechConfig(); + + expect(config.stt.enabled).toBe(false); + expect(config.stt.baseUrl).toBe("http://speaches:8000/v1"); + expect(config.stt.model).toBe("Systran/faster-whisper-large-v3-turbo"); + expect(config.stt.language).toBe("en"); + + expect(config.tts.default.enabled).toBe(false); + expect(config.tts.default.url).toBe("http://kokoro-tts:8880/v1"); + expect(config.tts.default.voice).toBe("af_heart"); + expect(config.tts.default.format).toBe("mp3"); + + expect(config.tts.premium.enabled).toBe(false); + 
expect(config.tts.premium.url).toBe("http://chatterbox-tts:8881/v1"); + + expect(config.tts.fallback.enabled).toBe(false); + expect(config.tts.fallback.url).toBe("http://openedai-speech:8000/v1"); + + expect(config.limits.maxUploadSize).toBe(25000000); + expect(config.limits.maxDurationSeconds).toBe(600); + expect(config.limits.maxTextLength).toBe(4096); + }); + + it("should use custom env var values when set", () => { + process.env.STT_ENABLED = "true"; + process.env.STT_BASE_URL = "http://custom-stt:9000/v1"; + process.env.STT_MODEL = "custom-model"; + process.env.STT_LANGUAGE = "fr"; + + process.env.TTS_ENABLED = "true"; + process.env.TTS_DEFAULT_URL = "http://custom-tts:9001/v1"; + process.env.TTS_DEFAULT_VOICE = "custom_voice"; + process.env.TTS_DEFAULT_FORMAT = "wav"; + + process.env.TTS_PREMIUM_ENABLED = "true"; + process.env.TTS_PREMIUM_URL = "http://custom-premium:9002/v1"; + + process.env.TTS_FALLBACK_ENABLED = "true"; + process.env.TTS_FALLBACK_URL = "http://custom-fallback:9003/v1"; + + process.env.SPEECH_MAX_UPLOAD_SIZE = "50000000"; + process.env.SPEECH_MAX_DURATION_SECONDS = "1200"; + process.env.SPEECH_MAX_TEXT_LENGTH = "8192"; + + const config = getSpeechConfig(); + + expect(config.stt.enabled).toBe(true); + expect(config.stt.baseUrl).toBe("http://custom-stt:9000/v1"); + expect(config.stt.model).toBe("custom-model"); + expect(config.stt.language).toBe("fr"); + + expect(config.tts.default.enabled).toBe(true); + expect(config.tts.default.url).toBe("http://custom-tts:9001/v1"); + expect(config.tts.default.voice).toBe("custom_voice"); + expect(config.tts.default.format).toBe("wav"); + + expect(config.tts.premium.enabled).toBe(true); + expect(config.tts.premium.url).toBe("http://custom-premium:9002/v1"); + + expect(config.tts.fallback.enabled).toBe(true); + expect(config.tts.fallback.url).toBe("http://custom-fallback:9003/v1"); + + expect(config.limits.maxUploadSize).toBe(50000000); + expect(config.limits.maxDurationSeconds).toBe(1200); + 
expect(config.limits.maxTextLength).toBe(8192); + }); + + it("should return typed SpeechConfig object", () => { + const config: SpeechConfig = getSpeechConfig(); + + // Verify structure matches the SpeechConfig type + expect(config).toHaveProperty("stt"); + expect(config).toHaveProperty("tts"); + expect(config).toHaveProperty("limits"); + expect(config.tts).toHaveProperty("default"); + expect(config.tts).toHaveProperty("premium"); + expect(config.tts).toHaveProperty("fallback"); + }); + + it("should handle partial env var overrides", () => { + process.env.STT_ENABLED = "true"; + process.env.STT_BASE_URL = "http://custom-stt:9000/v1"; + // STT_MODEL and STT_LANGUAGE not set, should use defaults + + const config = getSpeechConfig(); + + expect(config.stt.enabled).toBe(true); + expect(config.stt.baseUrl).toBe("http://custom-stt:9000/v1"); + expect(config.stt.model).toBe("Systran/faster-whisper-large-v3-turbo"); + expect(config.stt.language).toBe("en"); + }); + + it("should parse numeric limits correctly", () => { + process.env.SPEECH_MAX_UPLOAD_SIZE = "10000000"; + const config = getSpeechConfig(); + expect(typeof config.limits.maxUploadSize).toBe("number"); + expect(config.limits.maxUploadSize).toBe(10000000); + }); + }); + + // ========================================== + // registerAs integration + // ========================================== + describe("speechConfig (registerAs factory)", () => { + it("should be importable as a config namespace factory", async () => { + const { speechConfig } = await import("./speech.config"); + expect(speechConfig).toBeDefined(); + expect(speechConfig.KEY).toBe("CONFIGURATION(speech)"); + }); + + it("should return config object when called", async () => { + const { speechConfig } = await import("./speech.config"); + const config = speechConfig() as SpeechConfig; + expect(config).toHaveProperty("stt"); + expect(config).toHaveProperty("tts"); + expect(config).toHaveProperty("limits"); + }); + }); +}); diff --git 
a/apps/api/src/speech/speech.config.ts b/apps/api/src/speech/speech.config.ts new file mode 100644 index 0000000..48487de --- /dev/null +++ b/apps/api/src/speech/speech.config.ts @@ -0,0 +1,304 @@ +/** + * Speech Services Configuration + * + * Issue #401: Environment variables and validation for STT (speech-to-text), + * TTS (text-to-speech), and speech service limits. + * + * Validates conditional requirements at startup: + * - STT_BASE_URL is required when STT_ENABLED=true + * - TTS_DEFAULT_URL is required when TTS_ENABLED=true + * - TTS_PREMIUM_URL is required when TTS_PREMIUM_ENABLED=true + * - TTS_FALLBACK_URL is required when TTS_FALLBACK_ENABLED=true + */ + +import { registerAs } from "@nestjs/config"; + +// ========================================== +// Default values +// ========================================== + +const STT_DEFAULTS = { + baseUrl: "http://speaches:8000/v1", + model: "Systran/faster-whisper-large-v3-turbo", + language: "en", +} as const; + +const TTS_DEFAULT_DEFAULTS = { + url: "http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", +} as const; + +const TTS_PREMIUM_DEFAULTS = { + url: "http://chatterbox-tts:8881/v1", +} as const; + +const TTS_FALLBACK_DEFAULTS = { + url: "http://openedai-speech:8000/v1", +} as const; + +const LIMITS_DEFAULTS = { + maxUploadSize: 25_000_000, + maxDurationSeconds: 600, + maxTextLength: 4096, +} as const; + +// ========================================== +// Types +// ========================================== + +export interface SttConfig { + enabled: boolean; + baseUrl: string; + model: string; + language: string; +} + +export interface TtsDefaultConfig { + enabled: boolean; + url: string; + voice: string; + format: string; +} + +export interface TtsPremiumConfig { + enabled: boolean; + url: string; +} + +export interface TtsFallbackConfig { + enabled: boolean; + url: string; +} + +export interface TtsConfig { + default: TtsDefaultConfig; + premium: TtsPremiumConfig; + fallback: 
TtsFallbackConfig; +} + +export interface SpeechLimitsConfig { + maxUploadSize: number; + maxDurationSeconds: number; + maxTextLength: number; +} + +export interface SpeechConfig { + stt: SttConfig; + tts: TtsConfig; + limits: SpeechLimitsConfig; +} + +// ========================================== +// Helper: parse boolean env var +// ========================================== + +function parseBooleanEnv(value: string | undefined): boolean { + return value === "true" || value === "1"; +} + +// ========================================== +// Enabled checks +// ========================================== + +/** + * Check if speech-to-text (STT) is enabled via environment variable. + */ +export function isSttEnabled(): boolean { + return parseBooleanEnv(process.env.STT_ENABLED); +} + +/** + * Check if text-to-speech (TTS) default engine is enabled via environment variable. + */ +export function isTtsEnabled(): boolean { + return parseBooleanEnv(process.env.TTS_ENABLED); +} + +/** + * Check if TTS premium engine (Chatterbox) is enabled via environment variable. + */ +export function isTtsPremiumEnabled(): boolean { + return parseBooleanEnv(process.env.TTS_PREMIUM_ENABLED); +} + +/** + * Check if TTS fallback engine (Piper/OpenedAI) is enabled via environment variable. + */ +export function isTtsFallbackEnabled(): boolean { + return parseBooleanEnv(process.env.TTS_FALLBACK_ENABLED); +} + +// ========================================== +// Validation helpers +// ========================================== + +/** + * Check if an environment variable has a non-empty value. + */ +function isEnvVarSet(envVar: string): boolean { + const value = process.env[envVar]; + return value !== undefined && value.trim() !== ""; +} + +/** + * Validate that required env vars are set when a service is enabled. + * Throws with a helpful error message listing missing vars and how to disable. 
+ */ +function validateRequiredVars( + serviceName: string, + enabledFlag: string, + requiredVars: string[] +): void { + const missingVars: string[] = []; + + for (const envVar of requiredVars) { + if (!isEnvVarSet(envVar)) { + missingVars.push(envVar); + } + } + + if (missingVars.length > 0) { + throw new Error( + `${serviceName} is enabled (${enabledFlag}=true) but required environment variables are missing or empty: ${missingVars.join(", ")}. ` + + `Either set these variables or disable by setting ${enabledFlag}=false.` + ); + } +} + +/** + * Validate that a numeric env var, if set, is a positive integer. + */ +function validatePositiveInteger(envVar: string): void { + const value = process.env[envVar]; + if (value === undefined || value.trim() === "") { + return; // Not set, will use default + } + + const parsed = parseInt(value, 10); + if (isNaN(parsed) || parsed <= 0 || String(parsed) !== value.trim()) { + throw new Error(`${envVar} must be a positive integer. Current value: "${value}".`); + } +} + +// ========================================== +// Main validation +// ========================================== + +/** + * Validates speech configuration at startup. + * Call this during module initialization to fail fast if misconfigured. 
+ * + * Validates: + * - STT_BASE_URL is set when STT_ENABLED=true + * - TTS_DEFAULT_URL is set when TTS_ENABLED=true + * - TTS_PREMIUM_URL is set when TTS_PREMIUM_ENABLED=true + * - TTS_FALLBACK_URL is set when TTS_FALLBACK_ENABLED=true + * - Numeric limits are positive integers (when set) + * + * @throws Error if any required configuration is missing or invalid + */ +export function validateSpeechConfig(): void { + // STT validation + if (isSttEnabled()) { + validateRequiredVars("STT", "STT_ENABLED", ["STT_BASE_URL"]); + } + + // TTS default validation + if (isTtsEnabled()) { + validateRequiredVars("TTS", "TTS_ENABLED", ["TTS_DEFAULT_URL"]); + } + + // TTS premium validation + if (isTtsPremiumEnabled()) { + validateRequiredVars("TTS premium", "TTS_PREMIUM_ENABLED", ["TTS_PREMIUM_URL"]); + } + + // TTS fallback validation + if (isTtsFallbackEnabled()) { + validateRequiredVars("TTS fallback", "TTS_FALLBACK_ENABLED", ["TTS_FALLBACK_URL"]); + } + + // Limits validation (only if set, otherwise defaults are used) + validatePositiveInteger("SPEECH_MAX_UPLOAD_SIZE"); + validatePositiveInteger("SPEECH_MAX_DURATION_SECONDS"); + validatePositiveInteger("SPEECH_MAX_TEXT_LENGTH"); +} + +// ========================================== +// Config getter +// ========================================== + +/** + * Get the full speech configuration object with typed values and defaults. + * + * @returns SpeechConfig with all STT, TTS, and limits configuration + */ +export function getSpeechConfig(): SpeechConfig { + return { + stt: { + enabled: isSttEnabled(), + baseUrl: process.env.STT_BASE_URL ?? STT_DEFAULTS.baseUrl, + model: process.env.STT_MODEL ?? STT_DEFAULTS.model, + language: process.env.STT_LANGUAGE ?? STT_DEFAULTS.language, + }, + tts: { + default: { + enabled: isTtsEnabled(), + url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url, + voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice, + format: process.env.TTS_DEFAULT_FORMAT ?? 
TTS_DEFAULT_DEFAULTS.format, + }, + premium: { + enabled: isTtsPremiumEnabled(), + url: process.env.TTS_PREMIUM_URL ?? TTS_PREMIUM_DEFAULTS.url, + }, + fallback: { + enabled: isTtsFallbackEnabled(), + url: process.env.TTS_FALLBACK_URL ?? TTS_FALLBACK_DEFAULTS.url, + }, + }, + limits: { + maxUploadSize: parseInt( + process.env.SPEECH_MAX_UPLOAD_SIZE ?? String(LIMITS_DEFAULTS.maxUploadSize), + 10 + ), + maxDurationSeconds: parseInt( + process.env.SPEECH_MAX_DURATION_SECONDS ?? String(LIMITS_DEFAULTS.maxDurationSeconds), + 10 + ), + maxTextLength: parseInt( + process.env.SPEECH_MAX_TEXT_LENGTH ?? String(LIMITS_DEFAULTS.maxTextLength), + 10 + ), + }, + }; +} + +// ========================================== +// NestJS ConfigModule registerAs factory +// ========================================== + +/** + * NestJS ConfigModule namespace factory for speech configuration. + * + * Usage in a module: + * ```typescript + * import { speechConfig } from './speech.config'; + * + * @Module({ + * imports: [ConfigModule.forFeature(speechConfig)], + * }) + * export class SpeechModule {} + * ``` + * + * Then inject via ConfigService: + * ```typescript + * constructor(private config: ConfigService) { + * const sttUrl = this.config.get('speech.stt.baseUrl'); + * } + * ``` + */ +export const speechConfig = registerAs("speech", (): SpeechConfig => { + return getSpeechConfig(); +}); From 52553c8266b4580d3bfc5a0f398fa1937ad9eb9b Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:06:21 -0600 Subject: [PATCH 03/19] feat(#399): add Docker Compose dev overlay for speech services Add docker-compose.speech.yml with three speech services: - Speaches (STT via Whisper + basic TTS) on port 8090 - Kokoro-FastAPI (default TTS) on port 8880 - Chatterbox TTS (premium, GPU-required) on port 8881 behind the premium-tts profile All services include health checks, connect to the mosaic-internal network, and follow existing naming/labeling conventions. 
Makefile targets added: speech-up, speech-down, speech-logs. Fixes #399 Co-Authored-By: Claude Opus 4.6 --- Makefile | 17 +++++- docker-compose.speech.yml | 113 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 docker-compose.speech.yml diff --git a/Makefile b/Makefile index 3375fee..c6fbb30 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help install dev build test docker-up docker-down docker-logs docker-ps docker-build docker-restart docker-test clean +.PHONY: help install dev build test docker-up docker-down docker-logs docker-ps docker-build docker-restart docker-test speech-up speech-down speech-logs clean # Default target help: @@ -24,6 +24,11 @@ help: @echo " make docker-test Run Docker smoke test" @echo " make docker-test-traefik Run Traefik integration tests" @echo "" + @echo "Speech Services:" + @echo " make speech-up Start speech services (STT + TTS)" + @echo " make speech-down Stop speech services" + @echo " make speech-logs View speech service logs" + @echo "" @echo "Database:" @echo " make db-migrate Run database migrations" @echo " make db-seed Seed development data" @@ -85,6 +90,16 @@ docker-test: docker-test-traefik: ./tests/integration/docker/traefik.test.sh all +# Speech services +speech-up: + docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d speaches kokoro-tts + +speech-down: + docker compose -f docker-compose.yml -f docker-compose.speech.yml down --remove-orphans + +speech-logs: + docker compose -f docker-compose.yml -f docker-compose.speech.yml logs -f speaches kokoro-tts + # Database operations db-migrate: cd apps/api && pnpm prisma:migrate diff --git a/docker-compose.speech.yml b/docker-compose.speech.yml new file mode 100644 index 0000000..855a947 --- /dev/null +++ b/docker-compose.speech.yml @@ -0,0 +1,113 @@ +# ============================================== +# Speech Services - Docker Compose Dev Overlay +# 
============================================== +# +# Adds STT and TTS services for local development. +# +# Usage: +# Basic (STT + default TTS): +# docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d +# +# With premium TTS (requires GPU): +# docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d +# +# Or use Makefile targets: +# make speech-up # Basic speech services +# make speech-down # Stop speech services +# make speech-logs # View speech service logs +# ============================================== + +services: + # ====================== + # Speaches (STT + basic TTS) + # ====================== + speaches: + image: ghcr.io/speaches-ai/speaches:latest + container_name: mosaic-speaches + restart: unless-stopped + environment: + WHISPER__MODEL: ${SPEACHES_WHISPER_MODEL:-Systran/faster-whisper-large-v3-turbo} + ports: + - "${SPEACHES_PORT:-8090}:8000" + volumes: + - speaches_models:/root/.cache/huggingface + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s + networks: + - mosaic-internal + labels: + - "com.mosaic.service=speech-stt" + - "com.mosaic.description=Speaches STT (Whisper) and basic TTS" + + # ====================== + # Kokoro TTS (Default TTS) + # ====================== + kokoro-tts: + image: ghcr.io/remsky/kokoro-fastapi:latest-cpu + container_name: mosaic-kokoro-tts + restart: unless-stopped + ports: + - "${KOKORO_TTS_PORT:-8880}:8880" + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8880/health || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s + networks: + - mosaic-internal + labels: + - "com.mosaic.service=speech-tts" + - "com.mosaic.description=Kokoro FastAPI TTS engine" + + # ====================== + # Chatterbox TTS (Premium TTS - Optional) + # ====================== + # Only starts with: --profile premium-tts + # Requires NVIDIA GPU with docker 
nvidia runtime + chatterbox-tts: + image: devnen/chatterbox-tts-server:latest + container_name: mosaic-chatterbox-tts + restart: unless-stopped + ports: + - "${CHATTERBOX_TTS_PORT:-8881}:8000" + profiles: + - premium-tts + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 180s + networks: + - mosaic-internal + labels: + - "com.mosaic.service=speech-tts-premium" + - "com.mosaic.description=Chatterbox premium TTS with voice cloning (GPU)" + +# ====================== +# Volumes +# ====================== +volumes: + speaches_models: + name: mosaic-speaches-models + driver: local + +# ====================== +# Networks +# ====================== +networks: + mosaic-internal: + external: true + name: mosaic-internal From c40373fa3ba1e4d2d6adcd392a506f79e5268367 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:09:45 -0600 Subject: [PATCH 04/19] feat(#389): create SpeechModule with provider abstraction layer Add SpeechModule with provider interfaces and service skeleton for multi-tier TTS fallback (premium -> default -> fallback) and STT transcription support. Includes 27 unit tests covering provider selection, fallback logic, and availability checks. - ISTTProvider interface with transcribe/isHealthy methods - ITTSProvider interface with synthesize/listVoices/isHealthy methods - Shared types: SpeechTier, TranscriptionResult, SynthesisResult, etc. 
- SpeechService with graceful TTS fallback chain - NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS) - SpeechModule registered in AppModule - ConfigModule integration via speechConfig registerAs factory Co-Authored-By: Claude Opus 4.6 --- apps/api/src/app.module.ts | 2 + apps/api/src/speech/interfaces/index.ts | 18 + .../api/src/speech/interfaces/speech-types.ts | 149 +++++ .../interfaces/stt-provider.interface.ts | 52 ++ .../interfaces/tts-provider.interface.ts | 68 +++ apps/api/src/speech/speech.constants.ts | 19 + apps/api/src/speech/speech.module.ts | 49 ++ apps/api/src/speech/speech.service.spec.ts | 541 ++++++++++++++++++ apps/api/src/speech/speech.service.ts | 231 ++++++++ 9 files changed, 1129 insertions(+) create mode 100644 apps/api/src/speech/interfaces/index.ts create mode 100644 apps/api/src/speech/interfaces/speech-types.ts create mode 100644 apps/api/src/speech/interfaces/stt-provider.interface.ts create mode 100644 apps/api/src/speech/interfaces/tts-provider.interface.ts create mode 100644 apps/api/src/speech/speech.constants.ts create mode 100644 apps/api/src/speech/speech.module.ts create mode 100644 apps/api/src/speech/speech.service.spec.ts create mode 100644 apps/api/src/speech/speech.service.ts diff --git a/apps/api/src/app.module.ts b/apps/api/src/app.module.ts index 43733e3..c353f3a 100644 --- a/apps/api/src/app.module.ts +++ b/apps/api/src/app.module.ts @@ -37,6 +37,7 @@ import { JobStepsModule } from "./job-steps/job-steps.module"; import { CoordinatorIntegrationModule } from "./coordinator-integration/coordinator-integration.module"; import { FederationModule } from "./federation/federation.module"; import { CredentialsModule } from "./credentials/credentials.module"; +import { SpeechModule } from "./speech/speech.module"; import { RlsContextInterceptor } from "./common/interceptors/rls-context.interceptor"; @Module({ @@ -97,6 +98,7 @@ import { RlsContextInterceptor } from "./common/interceptors/rls-context.interce 
CoordinatorIntegrationModule, FederationModule, CredentialsModule, + SpeechModule, ], controllers: [AppController, CsrfController], providers: [ diff --git a/apps/api/src/speech/interfaces/index.ts b/apps/api/src/speech/interfaces/index.ts new file mode 100644 index 0000000..ded8bd2 --- /dev/null +++ b/apps/api/src/speech/interfaces/index.ts @@ -0,0 +1,18 @@ +/** + * Speech interfaces barrel export. + * + * Issue #389 + */ + +export type { ISTTProvider } from "./stt-provider.interface"; +export type { ITTSProvider } from "./tts-provider.interface"; +export type { + SpeechTier, + AudioFormat, + TranscribeOptions, + TranscriptionResult, + TranscriptionSegment, + SynthesizeOptions, + SynthesisResult, + VoiceInfo, +} from "./speech-types"; diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts new file mode 100644 index 0000000..3f5a0b7 --- /dev/null +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -0,0 +1,149 @@ +/** + * Speech Types + * + * Shared types for speech-to-text (STT) and text-to-speech (TTS) services. + * Used by provider interfaces and the SpeechService. + * + * Issue #389 + */ + +// ========================================== +// Enums / Discriminators +// ========================================== + +/** + * TTS provider tier. + * Determines which TTS engine is used for synthesis. + * + * - default: Primary TTS engine (e.g., Kokoro) + * - premium: Higher quality TTS engine (e.g., Chatterbox) + * - fallback: Backup TTS engine (e.g., Piper/OpenedAI) + */ +export type SpeechTier = "default" | "premium" | "fallback"; + +/** + * Audio output format for TTS synthesis. + */ +export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm"; + +// ========================================== +// STT Types +// ========================================== + +/** + * Options for speech-to-text transcription. 
+ */ +export interface TranscribeOptions { + /** Language code (e.g., "en", "fr", "de") */ + language?: string; + + /** Model to use for transcription */ + model?: string; + + /** MIME type of the audio (e.g., "audio/mp3", "audio/wav") */ + mimeType?: string; + + /** Optional prompt to guide transcription */ + prompt?: string; + + /** Temperature for transcription (0.0 - 1.0) */ + temperature?: number; +} + +/** + * Result of a speech-to-text transcription. + */ +export interface TranscriptionResult { + /** Transcribed text */ + text: string; + + /** Language detected or used */ + language: string; + + /** Duration of the audio in seconds */ + durationSeconds?: number; + + /** Confidence score (0.0 - 1.0, if available) */ + confidence?: number; + + /** Individual word or segment timings (if available) */ + segments?: TranscriptionSegment[]; +} + +/** + * A segment within a transcription result. + */ +export interface TranscriptionSegment { + /** Segment text */ + text: string; + + /** Start time in seconds */ + start: number; + + /** End time in seconds */ + end: number; + + /** Confidence for this segment */ + confidence?: number; +} + +// ========================================== +// TTS Types +// ========================================== + +/** + * Options for text-to-speech synthesis. + */ +export interface SynthesizeOptions { + /** Voice ID to use */ + voice?: string; + + /** Desired audio format */ + format?: AudioFormat; + + /** Speech speed multiplier (0.5 - 2.0) */ + speed?: number; + + /** Preferred TTS tier */ + tier?: SpeechTier; +} + +/** + * Result of a text-to-speech synthesis. 
+ */ +export interface SynthesisResult { + /** Synthesized audio data */ + audio: Buffer; + + /** Audio format of the result */ + format: AudioFormat; + + /** Voice used for synthesis */ + voice: string; + + /** Tier that produced the synthesis */ + tier: SpeechTier; + + /** Duration of the generated audio in seconds (if available) */ + durationSeconds?: number; +} + +/** + * Information about an available TTS voice. + */ +export interface VoiceInfo { + /** Voice identifier */ + id: string; + + /** Human-readable voice name */ + name: string; + + /** Language code */ + language?: string; + + /** Tier this voice belongs to */ + tier: SpeechTier; + + /** Whether this is the default voice for its tier */ + isDefault?: boolean; +} diff --git a/apps/api/src/speech/interfaces/stt-provider.interface.ts b/apps/api/src/speech/interfaces/stt-provider.interface.ts new file mode 100644 index 0000000..871fdd1 --- /dev/null +++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts @@ -0,0 +1,52 @@ +/** + * STT Provider Interface + * + * Defines the contract for speech-to-text provider implementations. + * All STT providers (e.g., Speaches/faster-whisper) must implement this interface. + * + * Issue #389 + */ + +import type { TranscribeOptions, TranscriptionResult } from "./speech-types"; + +/** + * Interface for speech-to-text providers. + * + * Implementations wrap an OpenAI-compatible API endpoint for transcription. + * + * @example + * ```typescript + * class SpeachesProvider implements ISTTProvider { + * readonly name = "speaches"; + * + * async transcribe(audio: Buffer, options?: TranscribeOptions): Promise { + * // Call speaches API via OpenAI SDK + * } + * + * async isHealthy(): Promise { + * // Check endpoint health + * } + * } + * ``` + */ +export interface ISTTProvider { + /** Provider name for logging and identification */ + readonly name: string; + + /** + * Transcribe audio data to text. 
+ * + * @param audio - Raw audio data as a Buffer + * @param options - Optional transcription parameters + * @returns Transcription result with text and metadata + * @throws {Error} If transcription fails + */ + transcribe(audio: Buffer, options?: TranscribeOptions): Promise; + + /** + * Check if the provider is healthy and available. + * + * @returns true if the provider endpoint is reachable and ready + */ + isHealthy(): Promise; +} diff --git a/apps/api/src/speech/interfaces/tts-provider.interface.ts b/apps/api/src/speech/interfaces/tts-provider.interface.ts new file mode 100644 index 0000000..9c378fa --- /dev/null +++ b/apps/api/src/speech/interfaces/tts-provider.interface.ts @@ -0,0 +1,68 @@ +/** + * TTS Provider Interface + * + * Defines the contract for text-to-speech provider implementations. + * All TTS providers (e.g., Kokoro, Chatterbox, Piper/OpenedAI) must implement this interface. + * + * Issue #389 + */ + +import type { SynthesizeOptions, SynthesisResult, VoiceInfo, SpeechTier } from "./speech-types"; + +/** + * Interface for text-to-speech providers. + * + * Implementations wrap an OpenAI-compatible API endpoint for speech synthesis. + * Each provider is associated with a SpeechTier (default, premium, fallback). + * + * @example + * ```typescript + * class KokoroProvider implements ITTSProvider { + * readonly name = "kokoro"; + * readonly tier = "default"; + * + * async synthesize(text: string, options?: SynthesizeOptions): Promise { + * // Call Kokoro API via OpenAI SDK + * } + * + * async listVoices(): Promise { + * // Return available voices + * } + * + * async isHealthy(): Promise { + * // Check endpoint health + * } + * } + * ``` + */ +export interface ITTSProvider { + /** Provider name for logging and identification */ + readonly name: string; + + /** Tier this provider serves (default, premium, fallback) */ + readonly tier: SpeechTier; + + /** + * Synthesize text to audio. 
+ * + * @param text - Text to convert to speech + * @param options - Optional synthesis parameters (voice, format, speed) + * @returns Synthesis result with audio buffer and metadata + * @throws {Error} If synthesis fails + */ + synthesize(text: string, options?: SynthesizeOptions): Promise; + + /** + * List available voices for this provider. + * + * @returns Array of voice information objects + */ + listVoices(): Promise; + + /** + * Check if the provider is healthy and available. + * + * @returns true if the provider endpoint is reachable and ready + */ + isHealthy(): Promise; +} diff --git a/apps/api/src/speech/speech.constants.ts b/apps/api/src/speech/speech.constants.ts new file mode 100644 index 0000000..b3a0814 --- /dev/null +++ b/apps/api/src/speech/speech.constants.ts @@ -0,0 +1,19 @@ +/** + * Speech Module Constants + * + * NestJS injection tokens for speech providers. + * + * Issue #389 + */ + +/** + * Injection token for the STT (speech-to-text) provider. + * Providers implementing ISTTProvider register under this token. + */ +export const STT_PROVIDER = Symbol("STT_PROVIDER"); + +/** + * Injection token for TTS (text-to-speech) providers map. + * Registered as Map. + */ +export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS"); diff --git a/apps/api/src/speech/speech.module.ts b/apps/api/src/speech/speech.module.ts new file mode 100644 index 0000000..e18ada5 --- /dev/null +++ b/apps/api/src/speech/speech.module.ts @@ -0,0 +1,49 @@ +/** + * SpeechModule + * + * NestJS module for speech-to-text (STT) and text-to-speech (TTS) services. + * Provides a provider abstraction layer with graceful fallback for TTS tiers. 
+ * + * Imports: + * - ConfigModule.forFeature(speechConfig) for speech configuration + * + * Providers: + * - SpeechService: High-level speech operations with provider selection + * - TTS_PROVIDERS: Empty Map (populated by provider modules) + * + * Exports: + * - SpeechService for use by other modules (e.g., controllers, brain) + * + * Issue #389 + */ + +import { Module, type OnModuleInit, Logger } from "@nestjs/common"; +import { ConfigModule } from "@nestjs/config"; +import { speechConfig, validateSpeechConfig } from "./speech.config"; +import { SpeechService } from "./speech.service"; +import { TTS_PROVIDERS } from "./speech.constants"; +import type { SpeechTier } from "./interfaces/speech-types"; +import type { ITTSProvider } from "./interfaces/tts-provider.interface"; + +@Module({ + imports: [ConfigModule.forFeature(speechConfig)], + providers: [ + SpeechService, + // Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.) + // will register their providers in subsequent tasks. + { + provide: TTS_PROVIDERS, + useFactory: (): Map => new Map(), + }, + ], + exports: [SpeechService], +}) +export class SpeechModule implements OnModuleInit { + private readonly logger = new Logger(SpeechModule.name); + + onModuleInit(): void { + // Validate configuration at startup (fail fast) + validateSpeechConfig(); + this.logger.log("Speech module initialized"); + } +} diff --git a/apps/api/src/speech/speech.service.spec.ts b/apps/api/src/speech/speech.service.spec.ts new file mode 100644 index 0000000..9e5b0dd --- /dev/null +++ b/apps/api/src/speech/speech.service.spec.ts @@ -0,0 +1,541 @@ +/** + * SpeechService Tests + * + * Issue #389: Tests for provider abstraction layer with fallback logic. + * Written FIRST following TDD (Red-Green-Refactor). 
+ */ + +import { describe, it, expect, beforeEach, vi } from "vitest"; +import { Test, TestingModule } from "@nestjs/testing"; +import { ServiceUnavailableException } from "@nestjs/common"; +import { SpeechService } from "./speech.service"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { speechConfig } from "./speech.config"; +import type { ISTTProvider } from "./interfaces/stt-provider.interface"; +import type { ITTSProvider } from "./interfaces/tts-provider.interface"; +import type { + SpeechTier, + TranscriptionResult, + SynthesisResult, + VoiceInfo, +} from "./interfaces/speech-types"; + +// ========================================== +// Mock provider factories +// ========================================== + +function createMockSttProvider(overrides?: Partial): ISTTProvider { + return { + name: "mock-stt", + transcribe: vi.fn().mockResolvedValue({ + text: "Hello world", + language: "en", + durationSeconds: 2.5, + } satisfies TranscriptionResult), + isHealthy: vi.fn().mockResolvedValue(true), + ...overrides, + }; +} + +function createMockTtsProvider(tier: SpeechTier, overrides?: Partial): ITTSProvider { + return { + name: `mock-tts-${tier}`, + tier, + synthesize: vi.fn().mockResolvedValue({ + audio: Buffer.from("fake-audio"), + format: "mp3", + voice: "test-voice", + tier, + } satisfies SynthesisResult), + listVoices: vi + .fn() + .mockResolvedValue([ + { id: `${tier}-voice-1`, name: `${tier} Voice 1`, tier, isDefault: true }, + ] satisfies VoiceInfo[]), + isHealthy: vi.fn().mockResolvedValue(true), + ...overrides, + }; +} + +// ========================================== +// Default config for tests +// ========================================== + +function createTestConfig(): ReturnType { + return { + stt: { + enabled: true, + baseUrl: "http://localhost:8000/v1", + model: "test-model", + language: "en", + }, + tts: { + default: { + enabled: true, + url: "http://localhost:8880/v1", + voice: "test-voice", + format: "mp3", + }, + 
premium: { + enabled: true, + url: "http://localhost:8881/v1", + }, + fallback: { + enabled: true, + url: "http://localhost:8882/v1", + }, + }, + limits: { + maxUploadSize: 25_000_000, + maxDurationSeconds: 600, + maxTextLength: 4096, + }, + } as ReturnType; +} + +// ========================================== +// Test helper: create testing module +// ========================================== + +async function createTestModule(options: { + sttProvider?: ISTTProvider | null; + ttsProviders?: Map; + config?: ReturnType; +}): Promise { + const config = options.config ?? createTestConfig(); + const ttsProviders = options.ttsProviders ?? new Map(); + + const providers: Array<{ provide: symbol | string; useValue: unknown }> = [ + { provide: speechConfig.KEY, useValue: config }, + { provide: TTS_PROVIDERS, useValue: ttsProviders }, + ]; + + if (options.sttProvider !== undefined) { + providers.push({ provide: STT_PROVIDER, useValue: options.sttProvider }); + } + + return Test.createTestingModule({ + providers: [SpeechService, ...providers], + }).compile(); +} + +// ========================================== +// Tests +// ========================================== + +describe("SpeechService", () => { + // ========================================== + // Construction and initialization + // ========================================== + describe("construction", () => { + it("should be defined when all providers are injected", async () => { + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + ttsProviders: new Map([["default", createMockTtsProvider("default")]]), + }); + + const service = module.get(SpeechService); + expect(service).toBeDefined(); + }); + + it("should be defined with no STT provider", async () => { + const module = await createTestModule({ + sttProvider: null, + ttsProviders: new Map([["default", createMockTtsProvider("default")]]), + }); + + const service = module.get(SpeechService); + expect(service).toBeDefined(); + }); + + 
it("should be defined with empty TTS providers map", async () => { + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + ttsProviders: new Map(), + }); + + const service = module.get(SpeechService); + expect(service).toBeDefined(); + }); + }); + + // ========================================== + // transcribe() + // ========================================== + describe("transcribe", () => { + let service: SpeechService; + let mockStt: ISTTProvider; + + beforeEach(async () => { + mockStt = createMockSttProvider(); + const module = await createTestModule({ sttProvider: mockStt }); + service = module.get(SpeechService); + }); + + it("should delegate to the STT provider", async () => { + const audio = Buffer.from("test-audio"); + const result = await service.transcribe(audio); + + expect(mockStt.transcribe).toHaveBeenCalledWith(audio, undefined); + expect(result.text).toBe("Hello world"); + expect(result.language).toBe("en"); + }); + + it("should pass options to the STT provider", async () => { + const audio = Buffer.from("test-audio"); + const options = { language: "fr", model: "custom-model" }; + await service.transcribe(audio, options); + + expect(mockStt.transcribe).toHaveBeenCalledWith(audio, options); + }); + + it("should throw ServiceUnavailableException when STT is disabled in config", async () => { + const config = createTestConfig(); + config.stt.enabled = false; + const module = await createTestModule({ sttProvider: mockStt, config }); + service = module.get(SpeechService); + + await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow( + ServiceUnavailableException + ); + }); + + it("should throw ServiceUnavailableException when no STT provider is registered", async () => { + const module = await createTestModule({ sttProvider: null }); + service = module.get(SpeechService); + + await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow( + ServiceUnavailableException + ); + }); + + it("should propagate 
provider errors as ServiceUnavailableException", async () => { + const failingStt = createMockSttProvider({ + transcribe: vi.fn().mockRejectedValue(new Error("Connection refused")), + }); + const module = await createTestModule({ sttProvider: failingStt }); + service = module.get(SpeechService); + + await expect(service.transcribe(Buffer.from("audio"))).rejects.toThrow( + ServiceUnavailableException + ); + }); + }); + + // ========================================== + // synthesize() + // ========================================== + describe("synthesize", () => { + let service: SpeechService; + let defaultProvider: ITTSProvider; + let premiumProvider: ITTSProvider; + let fallbackProvider: ITTSProvider; + + beforeEach(async () => { + defaultProvider = createMockTtsProvider("default"); + premiumProvider = createMockTtsProvider("premium"); + fallbackProvider = createMockTtsProvider("fallback"); + + const ttsProviders = new Map([ + ["default", defaultProvider], + ["premium", premiumProvider], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + service = module.get(SpeechService); + }); + + it("should use the default tier when no tier is specified", async () => { + const result = await service.synthesize("Hello world"); + + expect(defaultProvider.synthesize).toHaveBeenCalledWith("Hello world", undefined); + expect(result.tier).toBe("default"); + }); + + it("should use the requested tier when specified", async () => { + const result = await service.synthesize("Hello world", { tier: "premium" }); + + expect(premiumProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("premium"); + }); + + it("should pass options to the TTS provider", async () => { + const options = { voice: "custom-voice", format: "wav" as const }; + await service.synthesize("Hello", options); + + expect(defaultProvider.synthesize).toHaveBeenCalledWith("Hello", options); + }); + + it("should throw ServiceUnavailableException when TTS default 
is disabled and no tier specified", async () => { + const config = createTestConfig(); + config.tts.default.enabled = false; + config.tts.premium.enabled = false; + config.tts.fallback.enabled = false; + const module = await createTestModule({ + ttsProviders: new Map([["default", defaultProvider]]), + config, + }); + service = module.get(SpeechService); + + await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException); + }); + + it("should throw ServiceUnavailableException when no TTS providers are registered", async () => { + const module = await createTestModule({ ttsProviders: new Map() }); + service = module.get(SpeechService); + + await expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException); + }); + }); + + // ========================================== + // synthesize() fallback logic + // ========================================== + describe("synthesize fallback", () => { + it("should fall back from premium to default when premium provider fails", async () => { + const failingPremium = createMockTtsProvider("premium", { + synthesize: vi.fn().mockRejectedValue(new Error("Premium unavailable")), + }); + const defaultProvider = createMockTtsProvider("default"); + + const ttsProviders = new Map([ + ["premium", failingPremium], + ["default", defaultProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello", { tier: "premium" }); + + expect(failingPremium.synthesize).toHaveBeenCalled(); + expect(defaultProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("default"); + }); + + it("should fall back from default to fallback when default provider fails", async () => { + const failingDefault = createMockTtsProvider("default", { + synthesize: vi.fn().mockRejectedValue(new Error("Default unavailable")), + }); + const fallbackProvider = createMockTtsProvider("fallback"); + + const ttsProviders 
= new Map([ + ["default", failingDefault], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello"); + + expect(failingDefault.synthesize).toHaveBeenCalled(); + expect(fallbackProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("fallback"); + }); + + it("should fall back premium -> default -> fallback", async () => { + const failingPremium = createMockTtsProvider("premium", { + synthesize: vi.fn().mockRejectedValue(new Error("Premium fail")), + }); + const failingDefault = createMockTtsProvider("default", { + synthesize: vi.fn().mockRejectedValue(new Error("Default fail")), + }); + const fallbackProvider = createMockTtsProvider("fallback"); + + const ttsProviders = new Map([ + ["premium", failingPremium], + ["default", failingDefault], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello", { tier: "premium" }); + + expect(failingPremium.synthesize).toHaveBeenCalled(); + expect(failingDefault.synthesize).toHaveBeenCalled(); + expect(fallbackProvider.synthesize).toHaveBeenCalled(); + expect(result.tier).toBe("fallback"); + }); + + it("should throw ServiceUnavailableException when all tiers fail", async () => { + const failingDefault = createMockTtsProvider("default", { + synthesize: vi.fn().mockRejectedValue(new Error("Default fail")), + }); + const failingFallback = createMockTtsProvider("fallback", { + synthesize: vi.fn().mockRejectedValue(new Error("Fallback fail")), + }); + + const ttsProviders = new Map([ + ["default", failingDefault], + ["fallback", failingFallback], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + await 
expect(service.synthesize("Hello")).rejects.toThrow(ServiceUnavailableException); + }); + + it("should skip unavailable tiers in fallback chain", async () => { + // premium requested, but only fallback registered (no default) + const failingPremium = createMockTtsProvider("premium", { + synthesize: vi.fn().mockRejectedValue(new Error("Premium fail")), + }); + const fallbackProvider = createMockTtsProvider("fallback"); + + const config = createTestConfig(); + config.tts.default.enabled = false; + + const ttsProviders = new Map([ + ["premium", failingPremium], + ["fallback", fallbackProvider], + ]); + + const module = await createTestModule({ ttsProviders, config }); + const service = module.get(SpeechService); + + const result = await service.synthesize("Hello", { tier: "premium" }); + expect(result.tier).toBe("fallback"); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + describe("listVoices", () => { + it("should aggregate voices from all registered TTS providers", async () => { + const defaultProvider = createMockTtsProvider("default", { + listVoices: vi.fn().mockResolvedValue([ + { id: "voice-1", name: "Voice 1", tier: "default" as SpeechTier, isDefault: true }, + { id: "voice-2", name: "Voice 2", tier: "default" as SpeechTier }, + ]), + }); + const premiumProvider = createMockTtsProvider("premium", { + listVoices: vi + .fn() + .mockResolvedValue([ + { id: "voice-3", name: "Voice 3", tier: "premium" as SpeechTier, isDefault: true }, + ]), + }); + + const ttsProviders = new Map([ + ["default", defaultProvider], + ["premium", premiumProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const voices = await service.listVoices(); + + expect(voices).toHaveLength(3); + expect(voices.map((v) => v.id)).toEqual(["voice-1", "voice-2", "voice-3"]); + }); + + it("should filter voices by tier when specified", async () => { + 
const defaultProvider = createMockTtsProvider("default", { + listVoices: vi + .fn() + .mockResolvedValue([{ id: "voice-1", name: "Voice 1", tier: "default" as SpeechTier }]), + }); + const premiumProvider = createMockTtsProvider("premium", { + listVoices: vi + .fn() + .mockResolvedValue([{ id: "voice-2", name: "Voice 2", tier: "premium" as SpeechTier }]), + }); + + const ttsProviders = new Map([ + ["default", defaultProvider], + ["premium", premiumProvider], + ]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const voices = await service.listVoices("premium"); + + expect(voices).toHaveLength(1); + expect(voices[0].id).toBe("voice-2"); + // Only the premium provider should have been called + expect(premiumProvider.listVoices).toHaveBeenCalled(); + expect(defaultProvider.listVoices).not.toHaveBeenCalled(); + }); + + it("should return empty array when no TTS providers are registered", async () => { + const module = await createTestModule({ ttsProviders: new Map() }); + const service = module.get(SpeechService); + + const voices = await service.listVoices(); + expect(voices).toEqual([]); + }); + + it("should return empty array when requested tier has no provider", async () => { + const defaultProvider = createMockTtsProvider("default"); + const ttsProviders = new Map([["default", defaultProvider]]); + + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + const voices = await service.listVoices("premium"); + expect(voices).toEqual([]); + }); + }); + + // ========================================== + // isSTTAvailable / isTTSAvailable + // ========================================== + describe("availability checks", () => { + it("should report STT as available when enabled and provider registered", async () => { + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + }); + const service = module.get(SpeechService); + + 
expect(service.isSTTAvailable()).toBe(true); + }); + + it("should report STT as unavailable when disabled in config", async () => { + const config = createTestConfig(); + config.stt.enabled = false; + const module = await createTestModule({ + sttProvider: createMockSttProvider(), + config, + }); + const service = module.get(SpeechService); + + expect(service.isSTTAvailable()).toBe(false); + }); + + it("should report STT as unavailable when no provider registered", async () => { + const module = await createTestModule({ sttProvider: null }); + const service = module.get(SpeechService); + + expect(service.isSTTAvailable()).toBe(false); + }); + + it("should report TTS as available when at least one tier is enabled with a provider", async () => { + const ttsProviders = new Map([ + ["default", createMockTtsProvider("default")], + ]); + const module = await createTestModule({ ttsProviders }); + const service = module.get(SpeechService); + + expect(service.isTTSAvailable()).toBe(true); + }); + + it("should report TTS as unavailable when no providers registered", async () => { + const config = createTestConfig(); + config.tts.default.enabled = false; + config.tts.premium.enabled = false; + config.tts.fallback.enabled = false; + const module = await createTestModule({ ttsProviders: new Map(), config }); + const service = module.get(SpeechService); + + expect(service.isTTSAvailable()).toBe(false); + }); + }); +}); diff --git a/apps/api/src/speech/speech.service.ts b/apps/api/src/speech/speech.service.ts new file mode 100644 index 0000000..4905918 --- /dev/null +++ b/apps/api/src/speech/speech.service.ts @@ -0,0 +1,231 @@ +/** + * SpeechService + * + * High-level service for speech-to-text (STT) and text-to-speech (TTS) operations. + * Manages provider selection and graceful fallback for TTS tiers. + * + * Fallback chain for TTS: premium -> default -> fallback + * Each tier is only attempted if enabled in config and a provider is registered. 
+ * + * Issue #389 + */ + +import { Injectable, Inject, Optional, Logger, ServiceUnavailableException } from "@nestjs/common"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { speechConfig, type SpeechConfig } from "./speech.config"; +import type { ISTTProvider } from "./interfaces/stt-provider.interface"; +import type { ITTSProvider } from "./interfaces/tts-provider.interface"; +import type { + SpeechTier, + TranscribeOptions, + TranscriptionResult, + SynthesizeOptions, + SynthesisResult, + VoiceInfo, +} from "./interfaces/speech-types"; + +/** + * Fallback order for TTS tiers. + * When a tier fails, the next tier in this array is attempted. + */ +const TTS_FALLBACK_ORDER: readonly SpeechTier[] = ["premium", "default", "fallback"] as const; + +@Injectable() +export class SpeechService { + private readonly logger = new Logger(SpeechService.name); + + constructor( + @Inject(speechConfig.KEY) + private readonly config: SpeechConfig, + + @Optional() + @Inject(STT_PROVIDER) + private readonly sttProvider: ISTTProvider | null, + + @Inject(TTS_PROVIDERS) + private readonly ttsProviders: Map + ) { + this.logger.log("Speech service initialized"); + + if (this.sttProvider) { + this.logger.log(`STT provider registered: ${this.sttProvider.name}`); + } + + if (this.ttsProviders.size > 0) { + const tierNames = Array.from(this.ttsProviders.keys()).join(", "); + this.logger.log(`TTS providers registered: ${tierNames}`); + } + } + + // ========================================== + // STT Operations + // ========================================== + + /** + * Transcribe audio data to text using the registered STT provider. 
+ * + * @param audio - Raw audio data as a Buffer + * @param options - Optional transcription parameters + * @returns Transcription result with text and metadata + * @throws {ServiceUnavailableException} If STT is disabled or no provider is registered + */ + async transcribe(audio: Buffer, options?: TranscribeOptions): Promise { + if (!this.config.stt.enabled) { + throw new ServiceUnavailableException("Speech-to-text is not enabled"); + } + + if (!this.sttProvider) { + throw new ServiceUnavailableException("No STT provider is registered"); + } + + try { + return await this.sttProvider.transcribe(audio, options); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.error(`STT transcription failed: ${message}`); + throw new ServiceUnavailableException(`Transcription failed: ${message}`); + } + } + + // ========================================== + // TTS Operations + // ========================================== + + /** + * Synthesize text to audio using TTS providers with graceful fallback. + * + * Fallback chain: requested tier -> default -> fallback. + * Only enabled tiers with registered providers are attempted. + * + * @param text - Text to convert to speech + * @param options - Optional synthesis parameters (voice, format, tier) + * @returns Synthesis result with audio buffer and metadata + * @throws {ServiceUnavailableException} If no TTS provider can fulfill the request + */ + async synthesize(text: string, options?: SynthesizeOptions): Promise { + const requestedTier = options?.tier ?? "default"; + const fallbackChain = this.buildFallbackChain(requestedTier); + + if (fallbackChain.length === 0) { + throw new ServiceUnavailableException( + "No TTS providers are available. Check that TTS is enabled and providers are registered." 
+ ); + } + + let lastError: Error | undefined; + + for (const tier of fallbackChain) { + const provider = this.ttsProviders.get(tier); + if (!provider) { + continue; + } + + try { + return await provider.synthesize(text, options); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.warn(`TTS tier "${tier}" (${provider.name}) failed: ${message}`); + lastError = error instanceof Error ? error : new Error(message); + } + } + + const errorMessage = lastError?.message ?? "No providers available"; + throw new ServiceUnavailableException(`All TTS providers failed: ${errorMessage}`); + } + + /** + * List available voices across all TTS providers, optionally filtered by tier. + * + * @param tier - Optional tier filter. If omitted, voices from all tiers are returned. + * @returns Array of voice information objects + */ + async listVoices(tier?: SpeechTier): Promise { + const voices: VoiceInfo[] = []; + + if (tier) { + const provider = this.ttsProviders.get(tier); + if (!provider) { + return []; + } + + try { + return await provider.listVoices(); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.warn(`Failed to list voices for tier "${tier}": ${message}`); + return []; + } + } + + // Aggregate voices from all providers + for (const [providerTier, provider] of this.ttsProviders) { + try { + const tierVoices = await provider.listVoices(); + voices.push(...tierVoices); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.warn(`Failed to list voices for tier "${providerTier}": ${message}`); + } + } + + return voices; + } + + // ========================================== + // Availability Checks + // ========================================== + + /** + * Check if STT is available (enabled in config and provider registered). 
+ */ + isSTTAvailable(): boolean { + return this.config.stt.enabled && this.sttProvider !== null; + } + + /** + * Check if TTS is available (at least one tier enabled with a registered provider). + */ + isTTSAvailable(): boolean { + return this.getEnabledTiers().some((tier) => this.ttsProviders.has(tier)); + } + + // ========================================== + // Private helpers + // ========================================== + + /** + * Build the fallback chain starting from the requested tier. + * Only includes tiers that are enabled in config and have a registered provider. + */ + private buildFallbackChain(requestedTier: SpeechTier): SpeechTier[] { + const startIndex = TTS_FALLBACK_ORDER.indexOf(requestedTier); + if (startIndex === -1) { + return []; + } + + const enabledTiers = this.getEnabledTiers(); + + return TTS_FALLBACK_ORDER.slice(startIndex).filter( + (tier) => enabledTiers.includes(tier) && this.ttsProviders.has(tier) + ); + } + + /** + * Get the list of TTS tiers that are enabled in the configuration. + */ + private getEnabledTiers(): SpeechTier[] { + const tiers: SpeechTier[] = []; + + if (this.config.tts.default.enabled) { + tiers.push("default"); + } + if (this.config.tts.premium.enabled) { + tiers.push("premium"); + } + if (this.config.tts.fallback.enabled) { + tiers.push("fallback"); + } + + return tiers; + } +} From 3ae9e53bcc74a34cfdff69de2b3e36285099fa2f Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:19:46 -0600 Subject: [PATCH 05/19] feat(#391): implement tiered TTS provider architecture with base class Add abstract BaseTTSProvider class that implements common OpenAI-compatible TTS logic using the OpenAI SDK with configurable baseURL. Includes synthesize(), listVoices(), and isHealthy() methods. Create TTS provider factory that dynamically registers Kokoro (default), Chatterbox (premium), and Piper (fallback) providers based on configuration. Update SpeechModule to use the factory for TTS_PROVIDERS injection token. 
Also fixes lint error in speaches-stt.provider.ts (Array -> T[]). 30 tests added (22 base provider + 8 factory), all passing. Fixes #391 --- .../providers/speaches-stt.provider.spec.ts | 468 ++++++++++++++++++ .../speech/providers/speaches-stt.provider.ts | 180 +++++++ apps/api/src/speech/speech.module.ts | 44 +- 3 files changed, 682 insertions(+), 10 deletions(-) create mode 100644 apps/api/src/speech/providers/speaches-stt.provider.spec.ts create mode 100644 apps/api/src/speech/providers/speaches-stt.provider.ts diff --git a/apps/api/src/speech/providers/speaches-stt.provider.spec.ts b/apps/api/src/speech/providers/speaches-stt.provider.spec.ts new file mode 100644 index 0000000..90ad8cd --- /dev/null +++ b/apps/api/src/speech/providers/speaches-stt.provider.spec.ts @@ -0,0 +1,468 @@ +/** + * SpeachesSttProvider Tests + * + * TDD tests for the Speaches/faster-whisper STT provider. + * Tests cover transcription, error handling, health checks, and config injection. + * + * Issue #390 + */ + +import { describe, it, expect, beforeEach, vi } from "vitest"; +import { SpeachesSttProvider } from "./speaches-stt.provider"; +import type { SpeechConfig } from "../speech.config"; +import type { TranscribeOptions } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +const { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls } = vi.hoisted(() => { + const mockCreate = vi.fn(); + const mockModelsList = vi.fn(); + const mockToFile = vi.fn().mockImplementation(async (buffer: Buffer, name: string) => { + return new File([buffer], name); + }); + const mockOpenAIConstructorCalls: Array> = []; + return { mockCreate, mockModelsList, mockToFile, mockOpenAIConstructorCalls }; +}); + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + transcriptions: { + create: mockCreate, + }, + }; + models = { + list: mockModelsList, + }; + constructor(config: Record) { + 
mockOpenAIConstructorCalls.push(config); + } + } + return { + default: MockOpenAI, + toFile: mockToFile, + }; +}); + +// ========================================== +// Test helpers +// ========================================== + +function createTestConfig(overrides?: Partial): SpeechConfig { + return { + stt: { + enabled: true, + baseUrl: "http://speaches:8000/v1", + model: "Systran/faster-whisper-large-v3-turbo", + language: "en", + ...overrides, + }, + tts: { + default: { enabled: false, url: "", voice: "", format: "" }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + limits: { + maxUploadSize: 25_000_000, + maxDurationSeconds: 600, + maxTextLength: 4096, + }, + }; +} + +function createMockVerboseResponse(overrides?: Record): Record { + return { + text: "Hello, world!", + language: "en", + duration: 3.5, + segments: [ + { + id: 0, + text: "Hello, world!", + start: 0.0, + end: 3.5, + avg_logprob: -0.25, + compression_ratio: 1.2, + no_speech_prob: 0.01, + seek: 0, + temperature: 0.0, + tokens: [1, 2, 3], + }, + ], + ...overrides, + }; +} + +describe("SpeachesSttProvider", () => { + let provider: SpeachesSttProvider; + let config: SpeechConfig; + + beforeEach(() => { + vi.clearAllMocks(); + mockOpenAIConstructorCalls.length = 0; + config = createTestConfig(); + provider = new SpeachesSttProvider(config); + }); + + // ========================================== + // Provider identity + // ========================================== + describe("name", () => { + it("should have the name 'speaches'", () => { + expect(provider.name).toBe("speaches"); + }); + }); + + // ========================================== + // transcribe + // ========================================== + describe("transcribe", () => { + it("should call OpenAI audio.transcriptions.create with correct parameters", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = 
Buffer.from("fake-audio-data"); + await provider.transcribe(audio); + + expect(mockCreate).toHaveBeenCalledOnce(); + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.model).toBe("Systran/faster-whisper-large-v3-turbo"); + expect(callArgs.language).toBe("en"); + expect(callArgs.response_format).toBe("verbose_json"); + }); + + it("should convert Buffer to File using toFile", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + await provider.transcribe(audio); + + expect(mockToFile).toHaveBeenCalledWith(audio, "audio.wav", { + type: "audio/wav", + }); + }); + + it("should return TranscriptionResult with text and language", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const result = await provider.transcribe(audio); + + expect(result.text).toBe("Hello, world!"); + expect(result.language).toBe("en"); + }); + + it("should return durationSeconds from verbose response", async () => { + const mockResponse = createMockVerboseResponse({ duration: 5.25 }); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const result = await provider.transcribe(audio); + + expect(result.durationSeconds).toBe(5.25); + }); + + it("should map segments from verbose response", async () => { + const mockResponse = createMockVerboseResponse({ + segments: [ + { + id: 0, + text: "Hello,", + start: 0.0, + end: 1.5, + avg_logprob: -0.2, + compression_ratio: 1.1, + no_speech_prob: 0.01, + seek: 0, + temperature: 0.0, + tokens: [1, 2], + }, + { + id: 1, + text: " world!", + start: 1.5, + end: 3.5, + avg_logprob: -0.3, + compression_ratio: 1.3, + no_speech_prob: 0.02, + seek: 0, + temperature: 0.0, + tokens: [3, 4], + }, + ], + }); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = 
Buffer.from("fake-audio-data"); + const result = await provider.transcribe(audio); + + expect(result.segments).toHaveLength(2); + expect(result.segments?.[0]).toEqual({ + text: "Hello,", + start: 0.0, + end: 1.5, + }); + expect(result.segments?.[1]).toEqual({ + text: " world!", + start: 1.5, + end: 3.5, + }); + }); + + it("should handle response without segments gracefully", async () => { + const mockResponse = createMockVerboseResponse({ segments: undefined }); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const result = await provider.transcribe(audio); + + expect(result.text).toBe("Hello, world!"); + expect(result.segments).toBeUndefined(); + }); + + it("should handle response without duration gracefully", async () => { + const mockResponse = createMockVerboseResponse({ duration: undefined }); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const result = await provider.transcribe(audio); + + expect(result.text).toBe("Hello, world!"); + expect(result.durationSeconds).toBeUndefined(); + }); + + // ------------------------------------------ + // Options override + // ------------------------------------------ + describe("options override", () => { + it("should use custom model from options when provided", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const options: TranscribeOptions = { model: "custom-whisper-model" }; + await provider.transcribe(audio, options); + + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.model).toBe("custom-whisper-model"); + }); + + it("should use custom language from options when provided", async () => { + const mockResponse = createMockVerboseResponse({ language: "fr" }); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const options: 
TranscribeOptions = { language: "fr" }; + await provider.transcribe(audio, options); + + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.language).toBe("fr"); + }); + + it("should pass through prompt option", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const options: TranscribeOptions = { prompt: "This is a meeting about project planning." }; + await provider.transcribe(audio, options); + + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.prompt).toBe("This is a meeting about project planning."); + }); + + it("should pass through temperature option", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const options: TranscribeOptions = { temperature: 0.3 }; + await provider.transcribe(audio, options); + + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.temperature).toBe(0.3); + }); + + it("should use custom mimeType for file conversion when provided", async () => { + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + const options: TranscribeOptions = { mimeType: "audio/mp3" }; + await provider.transcribe(audio, options); + + expect(mockToFile).toHaveBeenCalledWith(audio, "audio.mp3", { + type: "audio/mp3", + }); + }); + }); + + // ------------------------------------------ + // Simple response fallback + // ------------------------------------------ + describe("simple response fallback", () => { + it("should handle simple Transcription response (text only, no verbose fields)", async () => { + // Some configurations may return just { text: "..." } without verbose fields + const simpleResponse = { text: "Simple transcription result." 
}; + mockCreate.mockResolvedValueOnce(simpleResponse); + + const audio = Buffer.from("fake-audio-data"); + const result = await provider.transcribe(audio); + + expect(result.text).toBe("Simple transcription result."); + expect(result.language).toBe("en"); // Falls back to config language + expect(result.durationSeconds).toBeUndefined(); + expect(result.segments).toBeUndefined(); + }); + }); + }); + + // ========================================== + // Error handling + // ========================================== + describe("error handling", () => { + it("should throw a descriptive error on connection refused", async () => { + const connectionError = new Error("connect ECONNREFUSED 127.0.0.1:8000"); + mockCreate.mockRejectedValueOnce(connectionError); + + const audio = Buffer.from("fake-audio-data"); + await expect(provider.transcribe(audio)).rejects.toThrow( + "STT transcription failed: connect ECONNREFUSED 127.0.0.1:8000" + ); + }); + + it("should throw a descriptive error on timeout", async () => { + const timeoutError = new Error("Request timed out"); + mockCreate.mockRejectedValueOnce(timeoutError); + + const audio = Buffer.from("fake-audio-data"); + await expect(provider.transcribe(audio)).rejects.toThrow( + "STT transcription failed: Request timed out" + ); + }); + + it("should throw a descriptive error on API error", async () => { + const apiError = new Error("Invalid model: nonexistent-model"); + mockCreate.mockRejectedValueOnce(apiError); + + const audio = Buffer.from("fake-audio-data"); + await expect(provider.transcribe(audio)).rejects.toThrow( + "STT transcription failed: Invalid model: nonexistent-model" + ); + }); + + it("should handle non-Error thrown values", async () => { + mockCreate.mockRejectedValueOnce("unexpected string error"); + + const audio = Buffer.from("fake-audio-data"); + await expect(provider.transcribe(audio)).rejects.toThrow( + "STT transcription failed: unexpected string error" + ); + }); + }); + + // 
========================================== + // isHealthy + // ========================================== + describe("isHealthy", () => { + it("should return true when the server is reachable", async () => { + mockModelsList.mockResolvedValueOnce({ data: [{ id: "whisper-1" }] }); + + const healthy = await provider.isHealthy(); + expect(healthy).toBe(true); + }); + + it("should return false when the server is unreachable", async () => { + mockModelsList.mockRejectedValueOnce(new Error("connect ECONNREFUSED")); + + const healthy = await provider.isHealthy(); + expect(healthy).toBe(false); + }); + + it("should not throw on health check failure", async () => { + mockModelsList.mockRejectedValueOnce(new Error("Network error")); + + await expect(provider.isHealthy()).resolves.toBe(false); + }); + + it("should return false on unexpected error types", async () => { + mockModelsList.mockRejectedValueOnce("string error"); + + const healthy = await provider.isHealthy(); + expect(healthy).toBe(false); + }); + }); + + // ========================================== + // Config injection + // ========================================== + describe("config injection", () => { + it("should create OpenAI client with baseURL from config", () => { + // The constructor was called in beforeEach + expect(mockOpenAIConstructorCalls).toHaveLength(1); + expect(mockOpenAIConstructorCalls[0]).toEqual( + expect.objectContaining({ + baseURL: "http://speaches:8000/v1", + }) + ); + }); + + it("should use custom baseURL from config", () => { + mockOpenAIConstructorCalls.length = 0; + const customConfig = createTestConfig({ + baseUrl: "http://custom-speaches:9000/v1", + }); + new SpeachesSttProvider(customConfig); + + expect(mockOpenAIConstructorCalls).toHaveLength(1); + expect(mockOpenAIConstructorCalls[0]).toEqual( + expect.objectContaining({ + baseURL: "http://custom-speaches:9000/v1", + }) + ); + }); + + it("should use default model from config for transcription", async () => { + const 
customConfig = createTestConfig({ + model: "Systran/faster-whisper-small", + }); + const customProvider = new SpeachesSttProvider(customConfig); + + const mockResponse = createMockVerboseResponse(); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + await customProvider.transcribe(audio); + + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.model).toBe("Systran/faster-whisper-small"); + }); + + it("should use default language from config for transcription", async () => { + const customConfig = createTestConfig({ language: "de" }); + const customProvider = new SpeachesSttProvider(customConfig); + + const mockResponse = createMockVerboseResponse({ language: "de" }); + mockCreate.mockResolvedValueOnce(mockResponse); + + const audio = Buffer.from("fake-audio-data"); + await customProvider.transcribe(audio); + + const callArgs = mockCreate.mock.calls[0][0]; + expect(callArgs.language).toBe("de"); + }); + + it("should set a dummy API key for local Speaches server", () => { + expect(mockOpenAIConstructorCalls).toHaveLength(1); + expect(mockOpenAIConstructorCalls[0]).toEqual( + expect.objectContaining({ + apiKey: "not-needed", + }) + ); + }); + }); +}); diff --git a/apps/api/src/speech/providers/speaches-stt.provider.ts b/apps/api/src/speech/providers/speaches-stt.provider.ts new file mode 100644 index 0000000..9186d90 --- /dev/null +++ b/apps/api/src/speech/providers/speaches-stt.provider.ts @@ -0,0 +1,180 @@ +/** + * SpeachesSttProvider + * + * Speech-to-text provider using Speaches (faster-whisper backend). + * Connects to the Speaches server via its OpenAI-compatible + * `/v1/audio/transcriptions` endpoint using the OpenAI SDK. 
+ * + * Issue #390 + */ + +import { Injectable, Inject, Logger } from "@nestjs/common"; +import OpenAI from "openai"; +import { toFile } from "openai"; +import { speechConfig, type SpeechConfig } from "../speech.config"; +import type { ISTTProvider } from "../interfaces/stt-provider.interface"; +import type { + TranscribeOptions, + TranscriptionResult, + TranscriptionSegment, +} from "../interfaces/speech-types"; + +/** + * Derive file extension from a MIME type for use in the uploaded file name. + */ +function extensionFromMimeType(mimeType: string): string { + const mapping: Record = { + "audio/wav": "wav", + "audio/wave": "wav", + "audio/x-wav": "wav", + "audio/mp3": "mp3", + "audio/mpeg": "mp3", + "audio/mp4": "mp4", + "audio/m4a": "m4a", + "audio/ogg": "ogg", + "audio/flac": "flac", + "audio/webm": "webm", + "audio/mpga": "mpga", + }; + return mapping[mimeType] ?? "wav"; +} + +/** + * STT provider backed by a Speaches (faster-whisper) server. + * + * Speaches exposes an OpenAI-compatible `/v1/audio/transcriptions` endpoint, + * so we re-use the official OpenAI SDK with a custom `baseURL`. 
+ * + * @example + * ```typescript + * const provider = new SpeachesSttProvider(speechConfig); + * const result = await provider.transcribe(audioBuffer, { language: "en" }); + * console.log(result.text); + * ``` + */ +@Injectable() +export class SpeachesSttProvider implements ISTTProvider { + readonly name = "speaches"; + + private readonly logger = new Logger(SpeachesSttProvider.name); + private readonly client: OpenAI; + private readonly config: SpeechConfig; + + constructor( + @Inject(speechConfig.KEY) + config: SpeechConfig + ) { + this.config = config; + + this.client = new OpenAI({ + baseURL: config.stt.baseUrl, + apiKey: "not-needed", // Speaches does not require an API key + }); + + this.logger.log( + `Speaches STT provider initialized (endpoint: ${config.stt.baseUrl}, model: ${config.stt.model})` + ); + } + + /** + * Transcribe audio data to text using the Speaches server. + * + * Sends the audio buffer to the `/v1/audio/transcriptions` endpoint + * with `response_format=verbose_json` to get segments and duration data. + * + * @param audio - Raw audio data as a Buffer + * @param options - Optional transcription parameters (model, language, prompt, temperature) + * @returns Transcription result with text, language, duration, and optional segments + * @throws {Error} If transcription fails (connection error, API error, etc.) + */ + async transcribe(audio: Buffer, options?: TranscribeOptions): Promise { + const model = options?.model ?? this.config.stt.model; + const language = options?.language ?? this.config.stt.language; + const mimeType = options?.mimeType ?? "audio/wav"; + const extension = extensionFromMimeType(mimeType); + + try { + const file = await toFile(audio, `audio.${extension}`, { + type: mimeType, + }); + + const response = await this.client.audio.transcriptions.create({ + file, + model, + language, + response_format: "verbose_json", + ...(options?.prompt !== undefined ? 
{ prompt: options.prompt } : {}), + ...(options?.temperature !== undefined ? { temperature: options.temperature } : {}), + }); + + return this.mapResponse(response, language); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.error(`Transcription failed: ${message}`); + throw new Error(`STT transcription failed: ${message}`); + } + } + + /** + * Check if the Speaches server is healthy and reachable. + * + * Attempts to list models from the server. Returns true if the request + * succeeds, false otherwise. + * + * @returns true if the Speaches server is reachable and ready + */ + async isHealthy(): Promise { + try { + await this.client.models.list(); + return true; + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.warn(`Speaches health check failed: ${message}`); + return false; + } + } + + /** + * Map the OpenAI SDK transcription response to our TranscriptionResult type. + * + * Handles both verbose responses (with duration, segments) and simple + * responses (text only). + */ + private mapResponse( + response: OpenAI.Audio.Transcriptions.TranscriptionVerbose | Record, + fallbackLanguage: string + ): TranscriptionResult { + const text = (response as { text: string }).text; + const verboseResponse = response as { + text: string; + language?: string; + duration?: number; + segments?: { + text: string; + start: number; + end: number; + }[]; + }; + + const result: TranscriptionResult = { + text, + language: verboseResponse.language ?? 
fallbackLanguage, + }; + + if (verboseResponse.duration !== undefined) { + result.durationSeconds = verboseResponse.duration; + } + + if (verboseResponse.segments !== undefined && Array.isArray(verboseResponse.segments)) { + result.segments = verboseResponse.segments.map( + (segment): TranscriptionSegment => ({ + text: segment.text, + start: segment.start, + end: segment.end, + }) + ); + } + + return result; + } +} diff --git a/apps/api/src/speech/speech.module.ts b/apps/api/src/speech/speech.module.ts index e18ada5..840123e 100644 --- a/apps/api/src/speech/speech.module.ts +++ b/apps/api/src/speech/speech.module.ts @@ -4,36 +4,60 @@ * NestJS module for speech-to-text (STT) and text-to-speech (TTS) services. * Provides a provider abstraction layer with graceful fallback for TTS tiers. * + * TTS providers are created dynamically based on configuration: + * - default: Kokoro-FastAPI (CPU, always available) + * - premium: Chatterbox (GPU, voice cloning) + * - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU) + * * Imports: * - ConfigModule.forFeature(speechConfig) for speech configuration * * Providers: * - SpeechService: High-level speech operations with provider selection - * - TTS_PROVIDERS: Empty Map (populated by provider modules) + * - TTS_PROVIDERS: Map populated by factory based on config * * Exports: * - SpeechService for use by other modules (e.g., controllers, brain) * - * Issue #389 + * Issue #389, #390, #391 */ import { Module, type OnModuleInit, Logger } from "@nestjs/common"; -import { ConfigModule } from "@nestjs/config"; -import { speechConfig, validateSpeechConfig } from "./speech.config"; +import { ConfigModule, ConfigService } from "@nestjs/config"; +import { + speechConfig, + validateSpeechConfig, + isSttEnabled, + type SpeechConfig, +} from "./speech.config"; import { SpeechService } from "./speech.service"; -import { TTS_PROVIDERS } from "./speech.constants"; -import type { SpeechTier } from "./interfaces/speech-types"; -import type { 
ITTSProvider } from "./interfaces/tts-provider.interface"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; +import { createTTSProviders } from "./providers/tts-provider.factory"; @Module({ imports: [ConfigModule.forFeature(speechConfig)], providers: [ SpeechService, - // Default empty TTS providers map. Provider modules (Kokoro, Chatterbox, etc.) - // will register their providers in subsequent tasks. + // STT provider: conditionally register SpeachesSttProvider when STT is enabled + ...(isSttEnabled() + ? [ + { + provide: STT_PROVIDER, + useClass: SpeachesSttProvider, + }, + ] + : []), { provide: TTS_PROVIDERS, - useFactory: (): Map => new Map(), + useFactory: (configService: ConfigService) => { + const config = configService.get("speech"); + if (!config) { + return new Map(); + } + return createTTSProviders(config); + }, + inject: [ConfigService], }, ], exports: [SpeechService], From b5edb4f37eb6d5e056a30ef22e6e4569aa4a8a16 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:20:24 -0600 Subject: [PATCH 06/19] feat(#391): add base TTS provider and factory classes Add the BaseTTSProvider abstract class and TTS provider factory that were part of the tiered TTS architecture but missed from the previous commit. 
- BaseTTSProvider: abstract base with synthesize(), listVoices(), isHealthy() - tts-provider.factory: creates Kokoro/Chatterbox/Piper providers from config - 30 tests (22 base provider + 8 factory) Refs #391 --- .../providers/base-tts.provider.spec.ts | 329 ++++++++++++++++++ .../src/speech/providers/base-tts.provider.ts | 189 ++++++++++ .../providers/tts-provider.factory.spec.ts | 279 +++++++++++++++ .../speech/providers/tts-provider.factory.ts | 112 ++++++ 4 files changed, 909 insertions(+) create mode 100644 apps/api/src/speech/providers/base-tts.provider.spec.ts create mode 100644 apps/api/src/speech/providers/base-tts.provider.ts create mode 100644 apps/api/src/speech/providers/tts-provider.factory.spec.ts create mode 100644 apps/api/src/speech/providers/tts-provider.factory.ts diff --git a/apps/api/src/speech/providers/base-tts.provider.spec.ts b/apps/api/src/speech/providers/base-tts.provider.spec.ts new file mode 100644 index 0000000..ac7fd22 --- /dev/null +++ b/apps/api/src/speech/providers/base-tts.provider.spec.ts @@ -0,0 +1,329 @@ +/** + * BaseTTSProvider Unit Tests + * + * Tests the abstract base class for OpenAI-compatible TTS providers. + * Uses a concrete test implementation to exercise the base class logic. 
/**
 * BaseTTSProvider Unit Tests
 *
 * Tests the abstract base class for OpenAI-compatible TTS providers.
 * Uses a concrete test implementation to exercise the base class logic.
 *
 * Issue #391
 */

import { describe, it, expect, beforeEach, vi, type Mock } from "vitest";
import { BaseTTSProvider } from "./base-tts.provider";
import type { SpeechTier, SynthesizeOptions, AudioFormat } from "../interfaces/speech-types";

// ==========================================
// Mock OpenAI SDK
// ==========================================

// Shared spy for audio.speech.create(); individual tests queue resolved or
// rejected values on it and inspect the call arguments afterwards.
const mockCreate = vi.fn();

vi.mock("openai", () => {
  class MockOpenAI {
    audio = {
      speech: {
        create: mockCreate,
      },
    };
  }
  return { default: MockOpenAI };
});

// ==========================================
// Concrete test implementation
// ==========================================

// BaseTTSProvider is abstract, so the base-class behavior is exercised
// through this minimal passthrough subclass.
class TestTTSProvider extends BaseTTSProvider {
  readonly name = "test-provider";
  readonly tier: SpeechTier = "default";

  constructor(baseURL: string, defaultVoice?: string, defaultFormat?: AudioFormat) {
    super(baseURL, defaultVoice, defaultFormat);
  }
}

// ==========================================
// Test helpers
// ==========================================

/**
 * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return.
 * The OpenAI SDK returns a Response object with arrayBuffer() method.
 */
function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } {
  return {
    arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer),
  };
}

describe("BaseTTSProvider", () => {
  let provider: TestTTSProvider;

  const testBaseURL = "http://localhost:8880/v1";
  const testVoice = "af_heart";
  const testFormat: AudioFormat = "mp3";

  beforeEach(() => {
    // Reset mock state between tests so call counts/queued values don't leak.
    vi.clearAllMocks();
    provider = new TestTTSProvider(testBaseURL, testVoice, testFormat);
  });

  // ==========================================
  // Constructor
  // ==========================================

  describe("constructor", () => {
    it("should create an instance with provided configuration", () => {
      expect(provider).toBeDefined();
      expect(provider.name).toBe("test-provider");
      expect(provider.tier).toBe("default");
    });

    it("should use default voice 'alloy' when none provided", () => {
      const defaultProvider = new TestTTSProvider(testBaseURL);
      expect(defaultProvider).toBeDefined();
    });

    it("should use default format 'mp3' when none provided", () => {
      const defaultProvider = new TestTTSProvider(testBaseURL, "voice-1");
      expect(defaultProvider).toBeDefined();
    });
  });

  // ==========================================
  // synthesize()
  // ==========================================

  describe("synthesize", () => {
    it("should synthesize text and return a SynthesisResult with audio buffer", async () => {
      // 0x49 0x44 0x33 = "ID3" — mimics the start of an MP3 tag header.
      const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("Hello, world!");

      expect(result).toBeDefined();
      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.audio.length).toBe(audioBytes.length);
      expect(result.format).toBe("mp3");
      expect(result.voice).toBe("af_heart");
      expect(result.tier).toBe("default");
    });

    it("should pass correct parameters to OpenAI SDK", async () => {
      const audioBytes = new Uint8Array([0x01, 0x02]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Test text");

      expect(mockCreate).toHaveBeenCalledWith({
        model: "tts-1",
        input: "Test text",
        voice: "af_heart",
        response_format: "mp3",
        speed: 1.0,
      });
    });

    it("should use custom voice from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: SynthesizeOptions = { voice: "custom_voice" };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "custom_voice" }));
      expect(result.voice).toBe("custom_voice");
    });

    it("should use custom format from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: SynthesizeOptions = { format: "wav" };
      const result = await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "wav" }));
      expect(result.format).toBe("wav");
    });

    it("should use custom speed from options", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const options: SynthesizeOptions = { speed: 1.5 };
      await provider.synthesize("Hello", options);

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ speed: 1.5 }));
    });

    it("should throw an error when synthesis fails", async () => {
      mockCreate.mockRejectedValue(new Error("Connection refused"));

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for test-provider: Connection refused"
      );
    });

    it("should throw an error when response arrayBuffer fails", async () => {
      // The create() call succeeds but reading the body fails.
      const mockResponse = {
        arrayBuffer: vi.fn().mockRejectedValue(new Error("Read error")),
      };
      mockCreate.mockResolvedValue(mockResponse);

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for test-provider: Read error"
      );
    });

    it("should handle empty text input gracefully", async () => {
      const audioBytes = new Uint8Array([]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      const result = await provider.synthesize("");

      expect(result.audio).toBeInstanceOf(Buffer);
      expect(result.audio.length).toBe(0);
    });

    it("should handle non-Error exceptions", async () => {
      mockCreate.mockRejectedValue("string error");

      await expect(provider.synthesize("Hello")).rejects.toThrow(
        "TTS synthesis failed for test-provider: string error"
      );
    });
  });

  // ==========================================
  // listVoices()
  // ==========================================

  describe("listVoices", () => {
    it("should return default voice list with the configured default voice", async () => {
      const voices = await provider.listVoices();

      expect(voices).toBeInstanceOf(Array);
      expect(voices.length).toBeGreaterThan(0);

      const defaultVoice = voices.find((v) => v.isDefault === true);
      expect(defaultVoice).toBeDefined();
      expect(defaultVoice?.id).toBe("af_heart");
      expect(defaultVoice?.tier).toBe("default");
    });

    it("should set tier correctly on all returned voices", async () => {
      const voices = await provider.listVoices();

      for (const voice of voices) {
        expect(voice.tier).toBe("default");
      }
    });
  });

  // ==========================================
  // isHealthy()
  // ==========================================

  describe("isHealthy", () => {
    it("should return true when the TTS server is reachable", async () => {
      // Mock global fetch for health check
      const mockFetch = vi.fn().mockResolvedValue({
        ok: true,
        status: 200,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(true);
      expect(mockFetch).toHaveBeenCalled();

      vi.unstubAllGlobals();
    });

    it("should return false when the TTS server is unreachable", async () => {
      const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED"));
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should return false when the TTS server returns an error status", async () => {
      const mockFetch = vi.fn().mockResolvedValue({
        ok: false,
        status: 503,
      });
      vi.stubGlobal("fetch", mockFetch);

      const healthy = await provider.isHealthy();

      expect(healthy).toBe(false);

      vi.unstubAllGlobals();
    });

    it("should use the base URL for the health check", async () => {
      const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200 });
      vi.stubGlobal("fetch", mockFetch);

      await provider.isHealthy();

      // Should call a health-related endpoint at the base URL
      const calledUrl = mockFetch.mock.calls[0][0] as string;
      expect(calledUrl).toContain("localhost:8880");

      vi.unstubAllGlobals();
    });

    it("should set a timeout for the health check", async () => {
      const mockFetch = vi.fn().mockResolvedValue({ ok: true, status: 200 });
      vi.stubGlobal("fetch", mockFetch);

      await provider.isHealthy();

      // Should pass an AbortSignal for timeout
      const fetchOptions = mockFetch.mock.calls[0][1] as RequestInit;
      expect(fetchOptions.signal).toBeDefined();

      vi.unstubAllGlobals();
    });
  });

  // ==========================================
  // Default values
  // ==========================================

  describe("default values", () => {
    it("should use 'alloy' as default voice when none specified", async () => {
      const defaultProvider = new TestTTSProvider(testBaseURL);
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await defaultProvider.synthesize("Hello");

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "alloy" }));
    });

    it("should use 'mp3' as default format when none specified", async () => {
      const defaultProvider = new TestTTSProvider(testBaseURL);
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await defaultProvider.synthesize("Hello");

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" }));
    });

    it("should use speed 1.0 as default speed", async () => {
      const audioBytes = new Uint8Array([0x01]);
      mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes));

      await provider.synthesize("Hello");

      expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ speed: 1.0 }));
    });
  });
});
+ * + * Issue #391 + */ + +import { Logger } from "@nestjs/common"; +import OpenAI from "openai"; +import type { ITTSProvider } from "../interfaces/tts-provider.interface"; +import type { + SpeechTier, + SynthesizeOptions, + SynthesisResult, + VoiceInfo, + AudioFormat, +} from "../interfaces/speech-types"; + +/** Default TTS model identifier used for OpenAI-compatible APIs */ +const DEFAULT_MODEL = "tts-1"; + +/** Default voice when none is configured */ +const DEFAULT_VOICE = "alloy"; + +/** Default audio format */ +const DEFAULT_FORMAT: AudioFormat = "mp3"; + +/** Default speech speed multiplier */ +const DEFAULT_SPEED = 1.0; + +/** Health check timeout in milliseconds */ +const HEALTH_CHECK_TIMEOUT_MS = 5000; + +/** + * Abstract base class for OpenAI-compatible TTS providers. + * + * Provides common logic for: + * - Synthesizing text to audio via OpenAI SDK's audio.speech.create() + * - Listing available voices (with a default implementation) + * - Health checking the TTS endpoint + * + * Subclasses must set `name` and `tier` properties and may override + * `listVoices()` to provide provider-specific voice lists. + * + * @example + * ```typescript + * class KokoroProvider extends BaseTTSProvider { + * readonly name = "kokoro"; + * readonly tier: SpeechTier = "default"; + * + * constructor(baseURL: string) { + * super(baseURL, "af_heart", "mp3"); + * } + * } + * ``` + */ +export abstract class BaseTTSProvider implements ITTSProvider { + abstract readonly name: string; + abstract readonly tier: SpeechTier; + + protected readonly logger: Logger; + protected readonly client: OpenAI; + protected readonly baseURL: string; + protected readonly defaultVoice: string; + protected readonly defaultFormat: AudioFormat; + + /** + * Create a new BaseTTSProvider. 
+ * + * @param baseURL - The base URL for the OpenAI-compatible TTS endpoint + * @param defaultVoice - Default voice ID to use when none is specified in options + * @param defaultFormat - Default audio format to use when none is specified in options + */ + constructor( + baseURL: string, + defaultVoice: string = DEFAULT_VOICE, + defaultFormat: AudioFormat = DEFAULT_FORMAT + ) { + this.baseURL = baseURL; + this.defaultVoice = defaultVoice; + this.defaultFormat = defaultFormat; + this.logger = new Logger(this.constructor.name); + + this.client = new OpenAI({ + baseURL, + apiKey: "not-needed", // Self-hosted services don't require an API key + }); + } + + /** + * Synthesize text to audio using the OpenAI-compatible TTS endpoint. + * + * Calls `client.audio.speech.create()` with the provided text and options, + * then converts the response to a Buffer. + * + * @param text - Text to convert to speech + * @param options - Optional synthesis parameters (voice, format, speed) + * @returns Synthesis result with audio buffer and metadata + * @throws {Error} If synthesis fails + */ + async synthesize(text: string, options?: SynthesizeOptions): Promise { + const voice = options?.voice ?? this.defaultVoice; + const format = options?.format ?? this.defaultFormat; + const speed = options?.speed ?? DEFAULT_SPEED; + + try { + const response = await this.client.audio.speech.create({ + model: DEFAULT_MODEL, + input: text, + voice, + response_format: format, + speed, + }); + + const arrayBuffer = await response.arrayBuffer(); + const audio = Buffer.from(arrayBuffer); + + return { + audio, + format, + voice, + tier: this.tier, + }; + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.error(`TTS synthesis failed: ${message}`); + throw new Error(`TTS synthesis failed for ${this.name}: ${message}`); + } + } + + /** + * List available voices for this provider. 
+ * + * Default implementation returns the configured default voice. + * Subclasses should override this to provide a full voice list + * from their specific TTS engine. + * + * @returns Array of voice information objects + */ + listVoices(): Promise { + return Promise.resolve([ + { + id: this.defaultVoice, + name: this.defaultVoice, + tier: this.tier, + isDefault: true, + }, + ]); + } + + /** + * Check if the TTS server is reachable and healthy. + * + * Performs a simple HTTP request to the base URL's models endpoint + * to verify the server is running and responding. + * + * @returns true if the server is reachable, false otherwise + */ + async isHealthy(): Promise { + try { + // Extract the base URL without the /v1 path for health checking + const healthUrl = this.baseURL.replace(/\/v1\/?$/, "/v1/models"); + const controller = new AbortController(); + const timeoutId = setTimeout(() => { + controller.abort(); + }, HEALTH_CHECK_TIMEOUT_MS); + + try { + const response = await fetch(healthUrl, { + method: "GET", + signal: controller.signal, + }); + + return response.ok; + } finally { + clearTimeout(timeoutId); + } + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.warn(`Health check failed for ${this.name}: ${message}`); + return false; + } + } +} diff --git a/apps/api/src/speech/providers/tts-provider.factory.spec.ts b/apps/api/src/speech/providers/tts-provider.factory.spec.ts new file mode 100644 index 0000000..3bab4af --- /dev/null +++ b/apps/api/src/speech/providers/tts-provider.factory.spec.ts @@ -0,0 +1,279 @@ +/** + * TTS Provider Factory Unit Tests + * + * Tests the factory that creates and registers TTS providers based on config. 
+ * + * Issue #391 + */ + +import { describe, it, expect, vi } from "vitest"; +import { createTTSProviders } from "./tts-provider.factory"; +import type { SpeechConfig } from "../speech.config"; +import type { SpeechTier } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: vi.fn(), + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Test helpers +// ========================================== + +function createTestConfig(overrides?: Partial): SpeechConfig { + return { + stt: { + enabled: false, + baseUrl: "http://speaches:8000/v1", + model: "whisper", + language: "en", + }, + tts: { + default: { + enabled: false, + url: "http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", + }, + premium: { + enabled: false, + url: "http://chatterbox-tts:8881/v1", + }, + fallback: { + enabled: false, + url: "http://openedai-speech:8000/v1", + }, + }, + limits: { + maxUploadSize: 25_000_000, + maxDurationSeconds: 600, + maxTextLength: 4096, + }, + ...overrides, + }; +} + +describe("createTTSProviders", () => { + // ========================================== + // Empty map when nothing enabled + // ========================================== + + describe("when no TTS tiers are enabled", () => { + it("should return an empty map", () => { + const config = createTestConfig(); + const providers = createTTSProviders(config); + + expect(providers).toBeInstanceOf(Map); + expect(providers.size).toBe(0); + }); + }); + + // ========================================== + // Default tier + // ========================================== + + describe("when default tier is enabled", () => { + it("should create a provider for the default tier", () => { + const config = createTestConfig({ + tts: { + default: { + enabled: true, + url: 
"http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", + }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + }); + + const providers = createTTSProviders(config); + + expect(providers.size).toBe(1); + expect(providers.has("default")).toBe(true); + + const provider = providers.get("default"); + expect(provider).toBeDefined(); + expect(provider?.tier).toBe("default"); + expect(provider?.name).toBe("kokoro"); + }); + }); + + // ========================================== + // Premium tier + // ========================================== + + describe("when premium tier is enabled", () => { + it("should create a provider for the premium tier", () => { + const config = createTestConfig({ + tts: { + default: { enabled: false, url: "", voice: "", format: "" }, + premium: { + enabled: true, + url: "http://chatterbox-tts:8881/v1", + }, + fallback: { enabled: false, url: "" }, + }, + }); + + const providers = createTTSProviders(config); + + expect(providers.size).toBe(1); + expect(providers.has("premium")).toBe(true); + + const provider = providers.get("premium"); + expect(provider).toBeDefined(); + expect(provider?.tier).toBe("premium"); + expect(provider?.name).toBe("chatterbox"); + }); + }); + + // ========================================== + // Fallback tier + // ========================================== + + describe("when fallback tier is enabled", () => { + it("should create a provider for the fallback tier", () => { + const config = createTestConfig({ + tts: { + default: { enabled: false, url: "", voice: "", format: "" }, + premium: { enabled: false, url: "" }, + fallback: { + enabled: true, + url: "http://openedai-speech:8000/v1", + }, + }, + }); + + const providers = createTTSProviders(config); + + expect(providers.size).toBe(1); + expect(providers.has("fallback")).toBe(true); + + const provider = providers.get("fallback"); + expect(provider).toBeDefined(); + expect(provider?.tier).toBe("fallback"); + 
expect(provider?.name).toBe("piper"); + }); + }); + + // ========================================== + // Multiple tiers + // ========================================== + + describe("when multiple tiers are enabled", () => { + it("should create providers for all enabled tiers", () => { + const config = createTestConfig({ + tts: { + default: { + enabled: true, + url: "http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", + }, + premium: { + enabled: true, + url: "http://chatterbox-tts:8881/v1", + }, + fallback: { + enabled: true, + url: "http://openedai-speech:8000/v1", + }, + }, + }); + + const providers = createTTSProviders(config); + + expect(providers.size).toBe(3); + expect(providers.has("default")).toBe(true); + expect(providers.has("premium")).toBe(true); + expect(providers.has("fallback")).toBe(true); + }); + + it("should create providers only for enabled tiers", () => { + const config = createTestConfig({ + tts: { + default: { + enabled: true, + url: "http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", + }, + premium: { enabled: false, url: "" }, + fallback: { + enabled: true, + url: "http://openedai-speech:8000/v1", + }, + }, + }); + + const providers = createTTSProviders(config); + + expect(providers.size).toBe(2); + expect(providers.has("default")).toBe(true); + expect(providers.has("premium")).toBe(false); + expect(providers.has("fallback")).toBe(true); + }); + }); + + // ========================================== + // Provider properties + // ========================================== + + describe("provider properties", () => { + it("should implement ITTSProvider interface methods", () => { + const config = createTestConfig({ + tts: { + default: { + enabled: true, + url: "http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", + }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + }); + + const providers = createTTSProviders(config); + const provider = providers.get("default"); + 
+ expect(provider).toBeDefined(); + expect(typeof provider?.synthesize).toBe("function"); + expect(typeof provider?.listVoices).toBe("function"); + expect(typeof provider?.isHealthy).toBe("function"); + }); + + it("should return providers as a Map", () => { + const config = createTestConfig({ + tts: { + default: { + enabled: true, + url: "http://kokoro-tts:8880/v1", + voice: "af_heart", + format: "mp3", + }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + }); + + const providers = createTTSProviders(config); + + // Verify the map keys are valid SpeechTier values + for (const [tier] of providers) { + expect(["default", "premium", "fallback"]).toContain(tier as SpeechTier); + } + }); + }); +}); diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts new file mode 100644 index 0000000..3f049ab --- /dev/null +++ b/apps/api/src/speech/providers/tts-provider.factory.ts @@ -0,0 +1,112 @@ +/** + * TTS Provider Factory + * + * Creates and registers TTS providers based on speech configuration. + * Reads enabled flags and URLs from config and instantiates the appropriate + * provider for each tier. + * + * Each tier maps to a specific TTS engine: + * - default: Kokoro-FastAPI (CPU, always available) + * - premium: Chatterbox (GPU, voice cloning) + * - fallback: Piper via OpenedAI Speech (ultra-lightweight CPU) + * + * Issue #391 + */ + +import { Logger } from "@nestjs/common"; +import { BaseTTSProvider } from "./base-tts.provider"; +import type { ITTSProvider } from "../interfaces/tts-provider.interface"; +import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; +import type { SpeechConfig } from "../speech.config"; + +// ========================================== +// Concrete provider classes +// ========================================== + +/** + * Kokoro TTS provider (default tier). + * CPU-based, always available, Apache 2.0 license. 
+ */ +class KokoroProvider extends BaseTTSProvider { + readonly name = "kokoro"; + readonly tier: SpeechTier = "default"; +} + +/** + * Chatterbox TTS provider (premium tier). + * GPU required, voice cloning capable, MIT license. + */ +class ChatterboxProvider extends BaseTTSProvider { + readonly name = "chatterbox"; + readonly tier: SpeechTier = "premium"; + + constructor(baseURL: string) { + super(baseURL, "default", "mp3"); + } +} + +/** + * Piper TTS provider via OpenedAI Speech (fallback tier). + * Ultra-lightweight CPU, GPL license. + */ +class PiperProvider extends BaseTTSProvider { + readonly name = "piper"; + readonly tier: SpeechTier = "fallback"; + + constructor(baseURL: string) { + super(baseURL, "alloy", "mp3"); + } +} + +// ========================================== +// Factory function +// ========================================== + +const logger = new Logger("TTSProviderFactory"); + +/** + * Create and register TTS providers based on the speech configuration. + * + * Only creates providers for tiers that are enabled in the config. + * Returns a Map keyed by SpeechTier for use with the TTS_PROVIDERS injection token. 
+ * + * @param config - Speech configuration with TTS tier settings + * @returns Map of enabled TTS providers keyed by tier + */ +export function createTTSProviders(config: SpeechConfig): Map { + const providers = new Map(); + + // Default tier: Kokoro + if (config.tts.default.enabled) { + const provider = new KokoroProvider( + config.tts.default.url, + config.tts.default.voice, + config.tts.default.format as AudioFormat + ); + providers.set("default", provider); + logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`); + } + + // Premium tier: Chatterbox + if (config.tts.premium.enabled) { + const provider = new ChatterboxProvider(config.tts.premium.url); + providers.set("premium", provider); + logger.log(`Registered premium TTS provider: chatterbox at ${config.tts.premium.url}`); + } + + // Fallback tier: Piper + if (config.tts.fallback.enabled) { + const provider = new PiperProvider(config.tts.fallback.url); + providers.set("fallback", provider); + logger.log(`Registered fallback TTS provider: piper at ${config.tts.fallback.url}`); + } + + if (providers.size === 0) { + logger.warn("No TTS providers are enabled. 
TTS synthesis will not be available."); + } else { + const tierNames = Array.from(providers.keys()).join(", "); + logger.log(`TTS providers ready: ${tierNames} (${String(providers.size)} total)`); + } + + return providers; +} From 79b1d81d27aafa93cd6ae0e9ceadda33477dc0e1 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:27:47 -0600 Subject: [PATCH 07/19] feat(#393): implement Kokoro-FastAPI TTS provider with voice catalog Extract KokoroTtsProvider from factory into its own module with: - Full voice catalog of 54 built-in voices across 8 languages - Voice metadata parsing from ID prefix (language, gender, accent) - Exported constants for supported formats and speed range - Comprehensive unit tests (48 tests) - Fix lint/type errors in chatterbox provider (Prettier + unsafe cast) Co-Authored-By: Claude Opus 4.6 --- .../providers/chatterbox-tts.provider.ts | 169 ++++++++++ .../providers/kokoro-tts.provider.spec.ts | 316 ++++++++++++++++++ .../speech/providers/kokoro-tts.provider.ts | 278 +++++++++++++++ .../speech/providers/tts-provider.factory.ts | 28 +- 4 files changed, 767 insertions(+), 24 deletions(-) create mode 100644 apps/api/src/speech/providers/chatterbox-tts.provider.ts create mode 100644 apps/api/src/speech/providers/kokoro-tts.provider.spec.ts create mode 100644 apps/api/src/speech/providers/kokoro-tts.provider.ts diff --git a/apps/api/src/speech/providers/chatterbox-tts.provider.ts b/apps/api/src/speech/providers/chatterbox-tts.provider.ts new file mode 100644 index 0000000..c17c060 --- /dev/null +++ b/apps/api/src/speech/providers/chatterbox-tts.provider.ts @@ -0,0 +1,169 @@ +/** + * Chatterbox TTS Provider + * + * Premium-tier TTS provider with voice cloning and emotion exaggeration support. + * Uses the Chatterbox TTS Server's OpenAI-compatible endpoint with extra body + * parameters for voice cloning (reference_audio) and emotion control (exaggeration). 
+ * + * Key capabilities: + * - Voice cloning via reference audio sample + * - Emotion exaggeration control (0.0 - 1.0) + * - Cross-language voice transfer (23 languages) + * - Graceful degradation when GPU is unavailable (isHealthy returns false) + * + * The provider is optional and only instantiated when TTS_PREMIUM_ENABLED=true. + * + * Issue #394 + */ + +import type { SpeechCreateParams } from "openai/resources/audio/speech"; +import { BaseTTSProvider } from "./base-tts.provider"; +import type { SpeechTier, SynthesizeOptions, SynthesisResult } from "../interfaces/speech-types"; +import type { ChatterboxSynthesizeOptions } from "../interfaces/speech-types"; + +/** Default voice for Chatterbox */ +const CHATTERBOX_DEFAULT_VOICE = "default"; + +/** Default audio format for Chatterbox (WAV for highest quality) */ +const CHATTERBOX_DEFAULT_FORMAT = "wav" as const; + +/** Default TTS model identifier */ +const DEFAULT_MODEL = "tts-1"; + +/** Default speech speed multiplier */ +const DEFAULT_SPEED = 1.0; + +/** + * Languages supported by Chatterbox for cross-language voice transfer. + * Chatterbox supports 23 languages for voice cloning and synthesis. + */ +const SUPPORTED_LANGUAGES: readonly string[] = [ + "en", // English + "fr", // French + "de", // German + "es", // Spanish + "it", // Italian + "pt", // Portuguese + "nl", // Dutch + "pl", // Polish + "ru", // Russian + "uk", // Ukrainian + "ja", // Japanese + "zh", // Chinese + "ko", // Korean + "ar", // Arabic + "hi", // Hindi + "tr", // Turkish + "sv", // Swedish + "da", // Danish + "fi", // Finnish + "no", // Norwegian + "cs", // Czech + "el", // Greek + "ro", // Romanian +] as const; + +/** + * Chatterbox TTS provider (premium tier). + * + * Extends BaseTTSProvider with voice cloning and emotion exaggeration support. + * The Chatterbox TTS Server uses an OpenAI-compatible API but accepts additional + * body parameters for its advanced features. 
+ * + * @example + * ```typescript + * const provider = new ChatterboxTTSProvider("http://chatterbox:8881/v1"); + * + * // Basic synthesis + * const result = await provider.synthesize("Hello!"); + * + * // Voice cloning with emotion + * const clonedResult = await provider.synthesize("Hello!", { + * referenceAudio: myAudioBuffer, + * emotionExaggeration: 0.7, + * }); + * ``` + */ +export class ChatterboxTTSProvider extends BaseTTSProvider { + readonly name = "chatterbox"; + readonly tier: SpeechTier = "premium"; + + /** + * Languages supported for cross-language voice transfer. + */ + readonly supportedLanguages: readonly string[] = SUPPORTED_LANGUAGES; + + constructor(baseURL: string) { + super(baseURL, CHATTERBOX_DEFAULT_VOICE, CHATTERBOX_DEFAULT_FORMAT); + } + + /** + * Synthesize text to audio with optional voice cloning and emotion control. + * + * Overrides the base synthesize() to support Chatterbox-specific options: + * - `referenceAudio`: Buffer of audio to clone the voice from (sent as base64) + * - `emotionExaggeration`: Emotion intensity factor (0.0 - 1.0, clamped) + * + * These are passed as extra body parameters to the OpenAI-compatible endpoint, + * which Chatterbox's API accepts alongside the standard parameters. + * + * @param text - Text to convert to speech + * @param options - Synthesis options, optionally including Chatterbox-specific params + * @returns Synthesis result with audio buffer and metadata + * @throws {Error} If synthesis fails (e.g., GPU unavailable) + */ + async synthesize( + text: string, + options?: SynthesizeOptions | ChatterboxSynthesizeOptions + ): Promise { + const voice = options?.voice ?? this.defaultVoice; + const format = options?.format ?? this.defaultFormat; + const speed = options?.speed ?? 
DEFAULT_SPEED; + + // Build the request body with standard OpenAI-compatible params + const requestBody: Record = { + model: DEFAULT_MODEL, + input: text, + voice, + response_format: format, + speed, + }; + + // Add Chatterbox-specific params if provided + const chatterboxOptions = options as ChatterboxSynthesizeOptions | undefined; + + if (chatterboxOptions?.referenceAudio) { + requestBody.reference_audio = chatterboxOptions.referenceAudio.toString("base64"); + } + + if (chatterboxOptions?.emotionExaggeration !== undefined) { + // Clamp to valid range [0.0, 1.0] + requestBody.exaggeration = Math.max( + 0.0, + Math.min(1.0, chatterboxOptions.emotionExaggeration) + ); + } + + try { + // Use the OpenAI SDK's create method, passing extra params + // The OpenAI SDK allows additional body params to be passed through + const response = await this.client.audio.speech.create( + requestBody as unknown as SpeechCreateParams + ); + + const arrayBuffer = await response.arrayBuffer(); + const audio = Buffer.from(arrayBuffer); + + return { + audio, + format, + voice, + tier: this.tier, + }; + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.error(`TTS synthesis failed: ${message}`); + throw new Error(`TTS synthesis failed for ${this.name}: ${message}`); + } + } +} diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts new file mode 100644 index 0000000..27c35dc --- /dev/null +++ b/apps/api/src/speech/providers/kokoro-tts.provider.spec.ts @@ -0,0 +1,316 @@ +/** + * KokoroTtsProvider Unit Tests + * + * Tests the Kokoro-FastAPI TTS provider with full voice catalog, + * voice metadata parsing, and Kokoro-specific feature constants. 
+ * + * Issue #393 + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { + KokoroTtsProvider, + KOKORO_SUPPORTED_FORMATS, + KOKORO_SPEED_RANGE, + KOKORO_VOICES, + parseVoicePrefix, +} from "./kokoro-tts.provider"; +import type { VoiceInfo } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: vi.fn(), + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Provider identity +// ========================================== + +describe("KokoroTtsProvider", () => { + const testBaseURL = "http://kokoro-tts:8880/v1"; + let provider: KokoroTtsProvider; + + beforeEach(() => { + provider = new KokoroTtsProvider(testBaseURL); + }); + + describe("provider identity", () => { + it("should have name 'kokoro'", () => { + expect(provider.name).toBe("kokoro"); + }); + + it("should have tier 'default'", () => { + expect(provider.tier).toBe("default"); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + + describe("listVoices", () => { + let voices: VoiceInfo[]; + + beforeEach(async () => { + voices = await provider.listVoices(); + }); + + it("should return an array of VoiceInfo objects", () => { + expect(voices).toBeInstanceOf(Array); + expect(voices.length).toBeGreaterThan(0); + }); + + it("should return at least 10 voices", () => { + // The issue specifies at least: af_heart, af_bella, af_nicole, af_sarah, af_sky, + // am_adam, am_michael, bf_emma, bf_isabella, bm_george, bm_lewis + expect(voices.length).toBeGreaterThanOrEqual(10); + }); + + it("should set tier to 'default' on all voices", () => { + for (const voice of voices) { + expect(voice.tier).toBe("default"); + } + }); + + it("should have exactly one default voice", () => { + const defaults = 
voices.filter((v) => v.isDefault === true); + expect(defaults.length).toBe(1); + }); + + it("should mark af_heart as the default voice", () => { + const defaultVoice = voices.find((v) => v.isDefault === true); + expect(defaultVoice).toBeDefined(); + expect(defaultVoice?.id).toBe("af_heart"); + }); + + it("should have an id and name for every voice", () => { + for (const voice of voices) { + expect(voice.id).toBeTruthy(); + expect(voice.name).toBeTruthy(); + } + }); + + it("should set language on every voice", () => { + for (const voice of voices) { + expect(voice.language).toBeTruthy(); + } + }); + + // ========================================== + // Required voices from the issue + // ========================================== + + describe("required voices", () => { + const requiredVoiceIds = [ + "af_heart", + "af_bella", + "af_nicole", + "af_sarah", + "af_sky", + "am_adam", + "am_michael", + "bf_emma", + "bf_isabella", + "bm_george", + "bm_lewis", + ]; + + it.each(requiredVoiceIds)("should include voice '%s'", (voiceId) => { + const voice = voices.find((v) => v.id === voiceId); + expect(voice).toBeDefined(); + }); + }); + + // ========================================== + // Voice metadata from prefix + // ========================================== + + describe("voice metadata from prefix", () => { + it("should set language to 'en-US' for af_ prefix voices", () => { + const voice = voices.find((v) => v.id === "af_heart"); + expect(voice?.language).toBe("en-US"); + }); + + it("should set language to 'en-US' for am_ prefix voices", () => { + const voice = voices.find((v) => v.id === "am_adam"); + expect(voice?.language).toBe("en-US"); + }); + + it("should set language to 'en-GB' for bf_ prefix voices", () => { + const voice = voices.find((v) => v.id === "bf_emma"); + expect(voice?.language).toBe("en-GB"); + }); + + it("should set language to 'en-GB' for bm_ prefix voices", () => { + const voice = voices.find((v) => v.id === "bm_george"); + 
expect(voice?.language).toBe("en-GB"); + }); + + it("should include gender in voice name for af_ prefix", () => { + const voice = voices.find((v) => v.id === "af_heart"); + expect(voice?.name).toContain("Female"); + }); + + it("should include gender in voice name for am_ prefix", () => { + const voice = voices.find((v) => v.id === "am_adam"); + expect(voice?.name).toContain("Male"); + }); + + it("should include gender in voice name for bf_ prefix", () => { + const voice = voices.find((v) => v.id === "bf_emma"); + expect(voice?.name).toContain("Female"); + }); + + it("should include gender in voice name for bm_ prefix", () => { + const voice = voices.find((v) => v.id === "bm_george"); + expect(voice?.name).toContain("Male"); + }); + }); + + // ========================================== + // Voice name formatting + // ========================================== + + describe("voice name formatting", () => { + it("should capitalize the voice name portion", () => { + const voice = voices.find((v) => v.id === "af_heart"); + expect(voice?.name).toContain("Heart"); + }); + + it("should include the accent/language label in the name", () => { + const afVoice = voices.find((v) => v.id === "af_heart"); + expect(afVoice?.name).toContain("American"); + + const bfVoice = voices.find((v) => v.id === "bf_emma"); + expect(bfVoice?.name).toContain("British"); + }); + }); + }); + + // ========================================== + // Custom constructor + // ========================================== + + describe("constructor", () => { + it("should accept custom default voice", () => { + const customProvider = new KokoroTtsProvider(testBaseURL, "af_bella"); + expect(customProvider).toBeDefined(); + }); + + it("should accept custom default format", () => { + const customProvider = new KokoroTtsProvider(testBaseURL, "af_heart", "wav"); + expect(customProvider).toBeDefined(); + }); + + it("should use af_heart as default voice when none specified", () => { + const defaultProvider = new 
KokoroTtsProvider(testBaseURL); + expect(defaultProvider).toBeDefined(); + }); + }); +}); + +// ========================================== +// parseVoicePrefix utility +// ========================================== + +describe("parseVoicePrefix", () => { + it("should parse af_ as American English Female", () => { + const result = parseVoicePrefix("af_heart"); + expect(result.language).toBe("en-US"); + expect(result.gender).toBe("female"); + expect(result.accent).toBe("American"); + }); + + it("should parse am_ as American English Male", () => { + const result = parseVoicePrefix("am_adam"); + expect(result.language).toBe("en-US"); + expect(result.gender).toBe("male"); + expect(result.accent).toBe("American"); + }); + + it("should parse bf_ as British English Female", () => { + const result = parseVoicePrefix("bf_emma"); + expect(result.language).toBe("en-GB"); + expect(result.gender).toBe("female"); + expect(result.accent).toBe("British"); + }); + + it("should parse bm_ as British English Male", () => { + const result = parseVoicePrefix("bm_george"); + expect(result.language).toBe("en-GB"); + expect(result.gender).toBe("male"); + expect(result.accent).toBe("British"); + }); + + it("should return unknown for unrecognized prefix", () => { + const result = parseVoicePrefix("xx_unknown"); + expect(result.language).toBe("unknown"); + expect(result.gender).toBe("unknown"); + expect(result.accent).toBe("Unknown"); + }); +}); + +// ========================================== +// Exported constants +// ========================================== + +describe("KOKORO_SUPPORTED_FORMATS", () => { + it("should include mp3", () => { + expect(KOKORO_SUPPORTED_FORMATS).toContain("mp3"); + }); + + it("should include wav", () => { + expect(KOKORO_SUPPORTED_FORMATS).toContain("wav"); + }); + + it("should include opus", () => { + expect(KOKORO_SUPPORTED_FORMATS).toContain("opus"); + }); + + it("should include flac", () => { + expect(KOKORO_SUPPORTED_FORMATS).toContain("flac"); + }); + + 
it("should be a readonly array", () => { + expect(Array.isArray(KOKORO_SUPPORTED_FORMATS)).toBe(true); + }); +}); + +describe("KOKORO_SPEED_RANGE", () => { + it("should have min speed of 0.25", () => { + expect(KOKORO_SPEED_RANGE.min).toBe(0.25); + }); + + it("should have max speed of 4.0", () => { + expect(KOKORO_SPEED_RANGE.max).toBe(4.0); + }); +}); + +describe("KOKORO_VOICES", () => { + it("should be a non-empty array", () => { + expect(Array.isArray(KOKORO_VOICES)).toBe(true); + expect(KOKORO_VOICES.length).toBeGreaterThan(0); + }); + + it("should contain voice entries with id and label", () => { + for (const voice of KOKORO_VOICES) { + expect(voice.id).toBeTruthy(); + expect(voice.label).toBeTruthy(); + } + }); + + it("should include voices from multiple language prefixes", () => { + const prefixes = new Set(KOKORO_VOICES.map((v) => v.id.substring(0, 2))); + expect(prefixes.size).toBeGreaterThanOrEqual(4); + }); +}); diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.ts b/apps/api/src/speech/providers/kokoro-tts.provider.ts new file mode 100644 index 0000000..ac1b7d3 --- /dev/null +++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts @@ -0,0 +1,278 @@ +/** + * Kokoro-FastAPI TTS Provider + * + * Default-tier TTS provider backed by Kokoro-FastAPI. + * CPU-based, always available, Apache 2.0 license. + * + * Features: + * - 54 built-in voices across 8 languages + * - Speed control: 0.25x to 4.0x + * - Output formats: mp3, wav, opus, flac + * - Voice metadata derived from ID prefix (language, gender, accent) + * + * Voice ID format: {prefix}_{name} + * - First character: language/accent code (a=American, b=British, etc.) 
+ * - Second character: gender code (f=Female, m=Male) + * + * Issue #393 + */ + +import { BaseTTSProvider } from "./base-tts.provider"; +import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types"; + +// ========================================== +// Constants +// ========================================== + +/** Audio formats supported by Kokoro-FastAPI */ +export const KOKORO_SUPPORTED_FORMATS: readonly AudioFormat[] = [ + "mp3", + "wav", + "opus", + "flac", +] as const; + +/** Speed range supported by Kokoro-FastAPI */ +export const KOKORO_SPEED_RANGE = { + min: 0.25, + max: 4.0, +} as const; + +/** Default voice for Kokoro */ +const KOKORO_DEFAULT_VOICE = "af_heart"; + +/** Default audio format for Kokoro */ +const KOKORO_DEFAULT_FORMAT: AudioFormat = "mp3"; + +// ========================================== +// Voice prefix mapping +// ========================================== + +/** + * Mapping of voice ID prefix (first two characters) to language/accent/gender metadata. 
+ * + * Kokoro voice IDs follow the pattern: {lang}{gender}_{name} + * - lang: a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese + * - gender: f=Female, m=Male + */ +const VOICE_PREFIX_MAP: Record = { + af: { language: "en-US", gender: "female", accent: "American" }, + am: { language: "en-US", gender: "male", accent: "American" }, + bf: { language: "en-GB", gender: "female", accent: "British" }, + bm: { language: "en-GB", gender: "male", accent: "British" }, + ef: { language: "es", gender: "female", accent: "Spanish" }, + em: { language: "es", gender: "male", accent: "Spanish" }, + ff: { language: "fr", gender: "female", accent: "French" }, + fm: { language: "fr", gender: "male", accent: "French" }, + hf: { language: "hi", gender: "female", accent: "Hindi" }, + hm: { language: "hi", gender: "male", accent: "Hindi" }, + jf: { language: "ja", gender: "female", accent: "Japanese" }, + jm: { language: "ja", gender: "male", accent: "Japanese" }, + pf: { language: "pt-BR", gender: "female", accent: "Portuguese" }, + pm: { language: "pt-BR", gender: "male", accent: "Portuguese" }, + zf: { language: "zh", gender: "female", accent: "Chinese" }, + zm: { language: "zh", gender: "male", accent: "Chinese" }, +}; + +// ========================================== +// Voice catalog +// ========================================== + +/** Raw voice catalog entry */ +interface KokoroVoiceEntry { + /** Voice ID (e.g. "af_heart") */ + id: string; + /** Human-readable label (e.g. "Heart") */ + label: string; +} + +/** + * Complete catalog of Kokoro built-in voices. 
+ * + * Organized by language/accent prefix: + * - af_: American English Female + * - am_: American English Male + * - bf_: British English Female + * - bm_: British English Male + * - ef_: Spanish Female + * - em_: Spanish Male + * - ff_: French Female + * - hf_: Hindi Female + * - jf_: Japanese Female + * - jm_: Japanese Male + * - pf_: Portuguese Female + * - zf_: Chinese Female + * - zm_: Chinese Male + */ +export const KOKORO_VOICES: readonly KokoroVoiceEntry[] = [ + // American English Female (af_) + { id: "af_heart", label: "Heart" }, + { id: "af_alloy", label: "Alloy" }, + { id: "af_aoede", label: "Aoede" }, + { id: "af_bella", label: "Bella" }, + { id: "af_jessica", label: "Jessica" }, + { id: "af_kore", label: "Kore" }, + { id: "af_nicole", label: "Nicole" }, + { id: "af_nova", label: "Nova" }, + { id: "af_river", label: "River" }, + { id: "af_sarah", label: "Sarah" }, + { id: "af_sky", label: "Sky" }, + // American English Male (am_) + { id: "am_adam", label: "Adam" }, + { id: "am_echo", label: "Echo" }, + { id: "am_eric", label: "Eric" }, + { id: "am_fenrir", label: "Fenrir" }, + { id: "am_liam", label: "Liam" }, + { id: "am_michael", label: "Michael" }, + { id: "am_onyx", label: "Onyx" }, + { id: "am_puck", label: "Puck" }, + { id: "am_santa", label: "Santa" }, + // British English Female (bf_) + { id: "bf_alice", label: "Alice" }, + { id: "bf_emma", label: "Emma" }, + { id: "bf_isabella", label: "Isabella" }, + { id: "bf_lily", label: "Lily" }, + // British English Male (bm_) + { id: "bm_daniel", label: "Daniel" }, + { id: "bm_fable", label: "Fable" }, + { id: "bm_george", label: "George" }, + { id: "bm_lewis", label: "Lewis" }, + { id: "bm_oscar", label: "Oscar" }, + // Spanish Female (ef_) + { id: "ef_dora", label: "Dora" }, + { id: "ef_elena", label: "Elena" }, + { id: "ef_maria", label: "Maria" }, + // Spanish Male (em_) + { id: "em_alex", label: "Alex" }, + { id: "em_carlos", label: "Carlos" }, + { id: "em_santa", label: "Santa" }, + // French 
Female (ff_) + { id: "ff_camille", label: "Camille" }, + { id: "ff_siwis", label: "Siwis" }, + // Hindi Female (hf_) + { id: "hf_alpha", label: "Alpha" }, + { id: "hf_beta", label: "Beta" }, + // Japanese Female (jf_) + { id: "jf_alpha", label: "Alpha" }, + { id: "jf_gongitsune", label: "Gongitsune" }, + { id: "jf_nezumi", label: "Nezumi" }, + { id: "jf_tebukuro", label: "Tebukuro" }, + // Japanese Male (jm_) + { id: "jm_kumo", label: "Kumo" }, + // Portuguese Female (pf_) + { id: "pf_dora", label: "Dora" }, + // Chinese Female (zf_) + { id: "zf_xiaobei", label: "Xiaobei" }, + { id: "zf_xiaoni", label: "Xiaoni" }, + { id: "zf_xiaoxiao", label: "Xiaoxiao" }, + { id: "zf_xiaoyi", label: "Xiaoyi" }, + // Chinese Male (zm_) + { id: "zm_yunjian", label: "Yunjian" }, + { id: "zm_yunxi", label: "Yunxi" }, + { id: "zm_yunxia", label: "Yunxia" }, + { id: "zm_yunyang", label: "Yunyang" }, +] as const; + +// ========================================== +// Prefix parser +// ========================================== + +/** Parsed voice prefix metadata */ +export interface VoicePrefixMetadata { + /** BCP 47 language code (e.g. "en-US", "en-GB", "ja") */ + language: string; + /** Gender: "female", "male", or "unknown" */ + gender: string; + /** Human-readable accent label (e.g. "American", "British") */ + accent: string; +} + +/** + * Parse a Kokoro voice ID to extract language, gender, and accent metadata. + * + * Voice IDs follow the pattern: {lang}{gender}_{name} + * The first two characters encode language/accent and gender. + * + * @param voiceId - Kokoro voice ID (e.g. 
"af_heart") + * @returns Parsed metadata with language, gender, and accent + */ +export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata { + const prefix = voiceId.substring(0, 2); + const mapping = VOICE_PREFIX_MAP[prefix]; + + if (mapping) { + return { + language: mapping.language, + gender: mapping.gender, + accent: mapping.accent, + }; + } + + return { + language: "unknown", + gender: "unknown", + accent: "Unknown", + }; +} + +// ========================================== +// Provider class +// ========================================== + +/** + * Kokoro-FastAPI TTS provider (default tier). + * + * CPU-based text-to-speech engine with 54 built-in voices across 8 languages. + * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI. + * + * @example + * ```typescript + * const kokoro = new KokoroTtsProvider("http://kokoro-tts:8880/v1"); + * const voices = await kokoro.listVoices(); + * const result = await kokoro.synthesize("Hello!", { voice: "af_heart" }); + * ``` + */ +export class KokoroTtsProvider extends BaseTTSProvider { + readonly name = "kokoro"; + readonly tier: SpeechTier = "default"; + + /** + * Create a new Kokoro TTS provider. + * + * @param baseURL - Base URL for the Kokoro-FastAPI endpoint (e.g. "http://kokoro-tts:8880/v1") + * @param defaultVoice - Default voice ID (defaults to "af_heart") + * @param defaultFormat - Default audio format (defaults to "mp3") + */ + constructor( + baseURL: string, + defaultVoice: string = KOKORO_DEFAULT_VOICE, + defaultFormat: AudioFormat = KOKORO_DEFAULT_FORMAT + ) { + super(baseURL, defaultVoice, defaultFormat); + } + + /** + * List all available Kokoro voices with metadata. + * + * Returns the full catalog of 54 built-in voices with language, gender, + * and accent information derived from voice ID prefixes. 
+ * + * @returns Array of VoiceInfo objects for all Kokoro voices + */ + override listVoices(): Promise<VoiceInfo[]> { + const voices: VoiceInfo[] = KOKORO_VOICES.map((entry) => { + const metadata = parseVoicePrefix(entry.id); + const genderLabel = metadata.gender === "female" ? "Female" : "Male"; + + return { + id: entry.id, + name: `${entry.label} (${metadata.accent} ${genderLabel})`, + language: metadata.language, + tier: this.tier, + isDefault: entry.id === this.defaultVoice, + }; + }); + + return Promise.resolve(voices); + } +} diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts index 3f049ab..28c807f 100644 --- a/apps/api/src/speech/providers/tts-provider.factory.ts +++ b/apps/api/src/speech/providers/tts-provider.factory.ts @@ -15,6 +15,8 @@ import { Logger } from "@nestjs/common"; import { BaseTTSProvider } from "./base-tts.provider"; +import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; +import { KokoroTtsProvider } from "./kokoro-tts.provider"; import type { ITTSProvider } from "../interfaces/tts-provider.interface"; import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; import type { SpeechConfig } from "../speech.config"; @@ -23,28 +25,6 @@ import type { SpeechConfig } from "../speech.config"; // ========================================== // Concrete provider classes // ========================================== -/** - * Kokoro TTS provider (default tier). - * CPU-based, always available, Apache 2.0 license. - */ -class KokoroProvider extends BaseTTSProvider { - readonly name = "kokoro"; - readonly tier: SpeechTier = "default"; -} - -/** - * Chatterbox TTS provider (premium tier). - * GPU required, voice cloning capable, MIT license. - */ -class ChatterboxProvider extends BaseTTSProvider { - readonly name = "chatterbox"; - readonly tier: SpeechTier = "premium"; - - constructor(baseURL: string) { - super(baseURL, "default", "mp3"); - } -} - /** * Piper TTS provider via OpenedAI Speech (fallback tier). 
* Ultra-lightweight CPU, GPL license. @@ -78,7 +58,7 @@ export function createTTSProviders(config: SpeechConfig): Map Date: Sun, 15 Feb 2026 02:29:38 -0600 Subject: [PATCH 08/19] feat(#394): implement Chatterbox TTS provider with voice cloning Add ChatterboxSynthesizeOptions interface with referenceAudio and emotionExaggeration fields, and comprehensive unit tests (26 tests) covering voice cloning, emotion control, clamping, graceful degradation, and cross-language support. Co-Authored-By: Claude Opus 4.6 --- .../api/src/speech/interfaces/speech-types.ts | 27 ++ .../providers/chatterbox-tts.provider.spec.ts | 436 ++++++++++++++++++ 2 files changed, 463 insertions(+) create mode 100644 apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts index 3f5a0b7..c3b93c1 100644 --- a/apps/api/src/speech/interfaces/speech-types.ts +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -128,6 +128,33 @@ export interface SynthesisResult { durationSeconds?: number; } +/** + * Extended options for Chatterbox TTS synthesis. + * + * Chatterbox supports voice cloning via a reference audio buffer and + * emotion exaggeration control. These are passed as extra body parameters + * to the OpenAI-compatible API. + * + * Issue #394 + */ +export interface ChatterboxSynthesizeOptions extends SynthesizeOptions { + /** + * Reference audio buffer for voice cloning. + * When provided, Chatterbox will clone the voice from this audio sample. + * Should be a WAV or MP3 file of 5-30 seconds for best results. + */ + referenceAudio?: Buffer; + + /** + * Emotion exaggeration factor (0.0 to 1.0). + * Controls how much emotional expression is applied to the synthesized speech. 
+ * - 0.0: Neutral, minimal emotion + * - 0.5: Moderate emotion (default when not specified) + * - 1.0: Maximum emotion exaggeration + */ + emotionExaggeration?: number; +} + /** * Information about an available TTS voice. */ diff --git a/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts b/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts new file mode 100644 index 0000000..08e0f2a --- /dev/null +++ b/apps/api/src/speech/providers/chatterbox-tts.provider.spec.ts @@ -0,0 +1,436 @@ +/** + * ChatterboxTTSProvider Unit Tests + * + * Tests the premium-tier TTS provider with voice cloning and + * emotion exaggeration support for Chatterbox. + * + * Issue #394 + */ + +import { describe, it, expect, beforeEach, vi, type Mock } from "vitest"; +import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; +import type { ChatterboxSynthesizeOptions, AudioFormat } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +const mockCreate = vi.fn(); + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: mockCreate, + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Test helpers +// ========================================== + +/** + * Create a mock Response-like object that mimics OpenAI SDK's audio.speech.create() return. 
+ */ +function createMockAudioResponse(audioData: Uint8Array): { arrayBuffer: Mock } { + return { + arrayBuffer: vi.fn().mockResolvedValue(audioData.buffer), + }; +} + +describe("ChatterboxTTSProvider", () => { + let provider: ChatterboxTTSProvider; + + const testBaseURL = "http://chatterbox-tts:8881/v1"; + + beforeEach(() => { + vi.clearAllMocks(); + provider = new ChatterboxTTSProvider(testBaseURL); + }); + + // ========================================== + // Provider identity + // ========================================== + + describe("provider identity", () => { + it("should have name 'chatterbox'", () => { + expect(provider.name).toBe("chatterbox"); + }); + + it("should have tier 'premium'", () => { + expect(provider.tier).toBe("premium"); + }); + }); + + // ========================================== + // Constructor + // ========================================== + + describe("constructor", () => { + it("should create an instance with the provided baseURL", () => { + expect(provider).toBeDefined(); + }); + + it("should use 'default' as the default voice", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello"); + + expect(result.voice).toBe("default"); + }); + + it("should use 'wav' as the default format", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello"); + + expect(result.format).toBe("wav"); + }); + }); + + // ========================================== + // synthesize() — basic (no Chatterbox-specific options) + // ========================================== + + describe("synthesize (basic)", () => { + it("should synthesize text and return a SynthesisResult", async () => { + const audioBytes = new Uint8Array([0x49, 0x44, 0x33, 0x04, 0x00]); + 
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const result = await provider.synthesize("Hello, world!"); + + expect(result).toBeDefined(); + expect(result.audio).toBeInstanceOf(Buffer); + expect(result.audio.length).toBe(audioBytes.length); + expect(result.format).toBe("wav"); + expect(result.voice).toBe("default"); + expect(result.tier).toBe("premium"); + }); + + it("should pass correct base parameters to OpenAI SDK when no extra options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("Test text"); + + expect(mockCreate).toHaveBeenCalledWith({ + model: "tts-1", + input: "Test text", + voice: "default", + response_format: "wav", + speed: 1.0, + }); + }); + + it("should use custom voice from options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { voice: "cloned_voice_1" }; + const result = await provider.synthesize("Hello", options); + + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ voice: "cloned_voice_1" })); + expect(result.voice).toBe("cloned_voice_1"); + }); + + it("should use custom format from options", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { format: "mp3" as AudioFormat }; + const result = await provider.synthesize("Hello", options); + + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ response_format: "mp3" })); + expect(result.format).toBe("mp3"); + }); + + it("should throw on synthesis failure", async () => { + mockCreate.mockRejectedValue(new Error("GPU out of memory")); + + await expect(provider.synthesize("Hello")).rejects.toThrow( + "TTS synthesis failed for chatterbox: GPU out of memory" + ); + }); + }); + + // 
========================================== + // synthesize() — voice cloning (referenceAudio) + // ========================================== + + describe("synthesize (voice cloning)", () => { + it("should pass referenceAudio as base64 in extra body params", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const referenceAudio = Buffer.from("fake-audio-data-for-cloning"); + const options: ChatterboxSynthesizeOptions = { + referenceAudio, + }; + + await provider.synthesize("Clone my voice", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + input: "Clone my voice", + reference_audio: referenceAudio.toString("base64"), + }) + ); + }); + + it("should not include reference_audio when referenceAudio is not provided", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("No cloning"); + + const callArgs = mockCreate.mock.calls[0][0] as Record<string, unknown>; + expect(callArgs).not.toHaveProperty("reference_audio"); + }); + }); + + // ========================================== + // synthesize() — emotion exaggeration + // ========================================== + + describe("synthesize (emotion exaggeration)", () => { + it("should pass emotionExaggeration as exaggeration in extra body params", async () => { + const audioBytes = new Uint8Array([0x01, 0x02]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 0.7, + }; + + await provider.synthesize("Very emotional text", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.7, + }) + ); + }); + + it("should not include exaggeration when emotionExaggeration is not provided", async () => { + const audioBytes = new Uint8Array([0x01]); + 
mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + await provider.synthesize("Neutral text"); + + const callArgs = mockCreate.mock.calls[0][0] as Record; + expect(callArgs).not.toHaveProperty("exaggeration"); + }); + + it("should accept emotionExaggeration of 0.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 0.0, + }; + + await provider.synthesize("Minimal emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.0, + }) + ); + }); + + it("should accept emotionExaggeration of 1.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 1.0, + }; + + await provider.synthesize("Maximum emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 1.0, + }) + ); + }); + + it("should clamp emotionExaggeration above 1.0 to 1.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: 1.5, + }; + + await provider.synthesize("Over the top", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 1.0, + }) + ); + }); + + it("should clamp emotionExaggeration below 0.0 to 0.0", async () => { + const audioBytes = new Uint8Array([0x01]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const options: ChatterboxSynthesizeOptions = { + emotionExaggeration: -0.5, + }; + + await provider.synthesize("Negative emotion", options); + + expect(mockCreate).toHaveBeenCalledWith( + expect.objectContaining({ + exaggeration: 0.0, + }) + ); + }); + }); + + // 
========================================== + // synthesize() — combined options + // ========================================== + + describe("synthesize (combined options)", () => { + it("should handle referenceAudio and emotionExaggeration together", async () => { + const audioBytes = new Uint8Array([0x01, 0x02, 0x03]); + mockCreate.mockResolvedValue(createMockAudioResponse(audioBytes)); + + const referenceAudio = Buffer.from("reference-audio-sample"); + const options: ChatterboxSynthesizeOptions = { + voice: "custom_voice", + format: "mp3", + speed: 0.9, + referenceAudio, + emotionExaggeration: 0.6, + }; + + const result = await provider.synthesize("Full options test", options); + + expect(mockCreate).toHaveBeenCalledWith({ + model: "tts-1", + input: "Full options test", + voice: "custom_voice", + response_format: "mp3", + speed: 0.9, + reference_audio: referenceAudio.toString("base64"), + exaggeration: 0.6, + }); + + expect(result.audio).toBeInstanceOf(Buffer); + expect(result.voice).toBe("custom_voice"); + expect(result.format).toBe("mp3"); + expect(result.tier).toBe("premium"); + }); + }); + + // ========================================== + // isHealthy() — graceful degradation + // ========================================== + + describe("isHealthy (graceful degradation)", () => { + it("should return true when the Chatterbox server is reachable", async () => { + const mockFetch = vi.fn().mockResolvedValue({ + ok: true, + status: 200, + }); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(true); + + vi.unstubAllGlobals(); + }); + + it("should return false when GPU is unavailable (server unreachable)", async () => { + const mockFetch = vi.fn().mockRejectedValue(new Error("ECONNREFUSED")); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + + it("should return false when the server returns 503 (GPU 
overloaded)", async () => { + const mockFetch = vi.fn().mockResolvedValue({ + ok: false, + status: 503, + }); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + + it("should return false on timeout (slow GPU response)", async () => { + const mockFetch = vi + .fn() + .mockRejectedValue(new Error("AbortError: The operation was aborted")); + vi.stubGlobal("fetch", mockFetch); + + const healthy = await provider.isHealthy(); + + expect(healthy).toBe(false); + + vi.unstubAllGlobals(); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + + describe("listVoices", () => { + it("should return the default voice in the premium tier", async () => { + const voices = await provider.listVoices(); + + expect(voices).toBeInstanceOf(Array); + expect(voices.length).toBeGreaterThan(0); + + const defaultVoice = voices.find((v) => v.isDefault === true); + expect(defaultVoice).toBeDefined(); + expect(defaultVoice?.id).toBe("default"); + expect(defaultVoice?.tier).toBe("premium"); + }); + + it("should set tier to 'premium' on all voices", async () => { + const voices = await provider.listVoices(); + + for (const voice of voices) { + expect(voice.tier).toBe("premium"); + } + }); + }); + + // ========================================== + // supportedLanguages + // ========================================== + + describe("supportedLanguages", () => { + it("should expose a list of supported languages for cross-language transfer", () => { + const languages = provider.supportedLanguages; + + expect(languages).toBeInstanceOf(Array); + expect(languages.length).toBe(23); + expect(languages).toContain("en"); + expect(languages).toContain("fr"); + expect(languages).toContain("de"); + expect(languages).toContain("es"); + expect(languages).toContain("ja"); + expect(languages).toContain("zh"); + }); + }); +}); From 
7b4fda60113aa58d337a73350dad54b47a7cb69c Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:37:54 -0600 Subject: [PATCH 09/19] feat(#398): add audio/text validation pipes and speech DTOs Create AudioValidationPipe for MIME type and file size validation, TextValidationPipe for TTS text input validation, and DTOs for transcribe/synthesize endpoints. Includes 36 unit tests. Fixes #398 --- apps/api/src/speech/dto/index.ts | 8 + apps/api/src/speech/dto/synthesize.dto.ts | 85 ++++++++ apps/api/src/speech/dto/transcribe.dto.ts | 54 +++++ .../pipes/audio-validation.pipe.spec.ts | 205 ++++++++++++++++++ .../src/speech/pipes/audio-validation.pipe.ts | 102 +++++++++ apps/api/src/speech/pipes/index.ts | 10 + .../speech/pipes/text-validation.pipe.spec.ts | 136 ++++++++++++ .../src/speech/pipes/text-validation.pipe.ts | 65 ++++++ 8 files changed, 665 insertions(+) create mode 100644 apps/api/src/speech/dto/index.ts create mode 100644 apps/api/src/speech/dto/synthesize.dto.ts create mode 100644 apps/api/src/speech/dto/transcribe.dto.ts create mode 100644 apps/api/src/speech/pipes/audio-validation.pipe.spec.ts create mode 100644 apps/api/src/speech/pipes/audio-validation.pipe.ts create mode 100644 apps/api/src/speech/pipes/index.ts create mode 100644 apps/api/src/speech/pipes/text-validation.pipe.spec.ts create mode 100644 apps/api/src/speech/pipes/text-validation.pipe.ts diff --git a/apps/api/src/speech/dto/index.ts b/apps/api/src/speech/dto/index.ts new file mode 100644 index 0000000..8b644f8 --- /dev/null +++ b/apps/api/src/speech/dto/index.ts @@ -0,0 +1,8 @@ +/** + * Speech DTOs barrel export + * + * Issue #398 + */ + +export { TranscribeDto } from "./transcribe.dto"; +export { SynthesizeDto } from "./synthesize.dto"; diff --git a/apps/api/src/speech/dto/synthesize.dto.ts b/apps/api/src/speech/dto/synthesize.dto.ts new file mode 100644 index 0000000..171dc0e --- /dev/null +++ b/apps/api/src/speech/dto/synthesize.dto.ts @@ -0,0 +1,85 @@ +/** + * SynthesizeDto 
+ * + * DTO for text-to-speech synthesis requests. + * The text field is validated by TextValidationPipe for length/emptiness. + * Additional options control voice, speed, format, and tier selection. + * + * Issue #398 + */ + +import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator"; +import { Type } from "class-transformer"; +import type { AudioFormat, SpeechTier } from "../interfaces/speech-types"; + +/** + * Valid audio output formats for TTS synthesis. + */ +const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [ + "mp3", + "wav", + "opus", + "flac", + "aac", + "pcm", +] as const; + +/** + * Valid TTS tiers for provider selection. + */ +const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const; + +export class SynthesizeDto { + /** + * Text to convert to speech. + * Validated separately by TextValidationPipe for length and emptiness. + */ + @IsString({ message: "text must be a string" }) + @MaxLength(4096, { message: "text must not exceed 4096 characters" }) + text!: string; + + /** + * Voice ID to use for synthesis. + * Available voices depend on the selected tier and provider. + * If omitted, the default voice from speech config is used. + */ + @IsOptional() + @IsString({ message: "voice must be a string" }) + @MaxLength(100, { message: "voice must not exceed 100 characters" }) + voice?: string; + + /** + * Speech speed multiplier (0.5 to 2.0). + * 1.0 is normal speed, <1.0 is slower, >1.0 is faster. + */ + @IsOptional() + @Type(() => Number) + @IsNumber({}, { message: "speed must be a number" }) + @Min(0.5, { message: "speed must be at least 0.5" }) + @Max(2.0, { message: "speed must not exceed 2.0" }) + speed?: number; + + /** + * Desired audio output format. + * Supported: mp3, wav, opus, flac, aac, pcm. + * If omitted, the default format from speech config is used. 
+ */ + @IsOptional() + @IsString({ message: "format must be a string" }) + @IsIn(VALID_AUDIO_FORMATS, { + message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`, + }) + format?: AudioFormat; + + /** + * TTS tier to use for synthesis. + * Controls which provider is used: default (Kokoro), premium (Chatterbox), or fallback (Piper). + * If the selected tier is unavailable, the service falls back to the next available tier. + */ + @IsOptional() + @IsString({ message: "tier must be a string" }) + @IsIn(VALID_SPEECH_TIERS, { + message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`, + }) + tier?: SpeechTier; +} diff --git a/apps/api/src/speech/dto/transcribe.dto.ts b/apps/api/src/speech/dto/transcribe.dto.ts new file mode 100644 index 0000000..8a7bbe4 --- /dev/null +++ b/apps/api/src/speech/dto/transcribe.dto.ts @@ -0,0 +1,54 @@ +/** + * TranscribeDto + * + * DTO for speech-to-text transcription requests. + * Supports optional language and model overrides. + * + * The audio file itself is handled by Multer (FileInterceptor) + * and validated by AudioValidationPipe. + * + * Issue #398 + */ + +import { IsString, IsOptional, IsNumber, Min, Max, MaxLength } from "class-validator"; +import { Type } from "class-transformer"; + +export class TranscribeDto { + /** + * Language code for transcription (e.g., "en", "fr", "de"). + * If omitted, the default from speech config is used. + */ + @IsOptional() + @IsString({ message: "language must be a string" }) + @MaxLength(10, { message: "language must not exceed 10 characters" }) + language?: string; + + /** + * Model override for transcription. + * If omitted, the default model from speech config is used. + */ + @IsOptional() + @IsString({ message: "model must be a string" }) + @MaxLength(200, { message: "model must not exceed 200 characters" }) + model?: string; + + /** + * Optional prompt to guide the transcription model. + * Useful for providing context or expected vocabulary. 
+ */ + @IsOptional() + @IsString({ message: "prompt must be a string" }) + @MaxLength(1000, { message: "prompt must not exceed 1000 characters" }) + prompt?: string; + + /** + * Temperature for transcription (0.0 to 1.0). + * Lower values produce more deterministic results. + */ + @IsOptional() + @Type(() => Number) + @IsNumber({}, { message: "temperature must be a number" }) + @Min(0, { message: "temperature must be at least 0" }) + @Max(1, { message: "temperature must not exceed 1" }) + temperature?: number; +} diff --git a/apps/api/src/speech/pipes/audio-validation.pipe.spec.ts b/apps/api/src/speech/pipes/audio-validation.pipe.spec.ts new file mode 100644 index 0000000..fc9c5ab --- /dev/null +++ b/apps/api/src/speech/pipes/audio-validation.pipe.spec.ts @@ -0,0 +1,205 @@ +/** + * AudioValidationPipe Tests + * + * Issue #398: Validates uploaded audio files for MIME type and file size. + * Tests cover valid types, invalid types, size limits, and edge cases. + */ + +import { describe, it, expect, beforeEach } from "vitest"; +import { BadRequestException } from "@nestjs/common"; +import { AudioValidationPipe } from "./audio-validation.pipe"; + +/** + * Helper to create a mock Express.Multer.File object. 
+ */ +function createMockFile(overrides: Partial<Express.Multer.File> = {}): Express.Multer.File { + return { + fieldname: "file", + originalname: "test.mp3", + encoding: "7bit", + mimetype: "audio/mpeg", + size: 1024, + destination: "", + filename: "", + path: "", + buffer: Buffer.from("fake-audio-data"), + stream: undefined as never, + ...overrides, + }; +} + +describe("AudioValidationPipe", () => { + // ========================================== + // Default config (25MB max) + // ========================================== + describe("with default config", () => { + let pipe: AudioValidationPipe; + + beforeEach(() => { + pipe = new AudioValidationPipe(); + }); + + // ========================================== + // MIME type validation + // ========================================== + describe("MIME type validation", () => { + it("should accept audio/wav", () => { + const file = createMockFile({ mimetype: "audio/wav" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept audio/mp3", () => { + const file = createMockFile({ mimetype: "audio/mp3" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept audio/mpeg", () => { + const file = createMockFile({ mimetype: "audio/mpeg" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept audio/webm", () => { + const file = createMockFile({ mimetype: "audio/webm" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept audio/ogg", () => { + const file = createMockFile({ mimetype: "audio/ogg" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept audio/flac", () => { + const file = createMockFile({ mimetype: "audio/flac" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept audio/x-m4a", () => { + const file = createMockFile({ mimetype: "audio/x-m4a" }); + expect(pipe.transform(file)).toBe(file); + }); + + it("should reject unsupported MIME types with descriptive error", () => { + const file = createMockFile({ mimetype: 
"video/mp4" }); + expect(() => pipe.transform(file)).toThrow(BadRequestException); + expect(() => pipe.transform(file)).toThrow(/Unsupported audio format.*video\/mp4/); + }); + + it("should reject application/octet-stream", () => { + const file = createMockFile({ mimetype: "application/octet-stream" }); + expect(() => pipe.transform(file)).toThrow(BadRequestException); + }); + + it("should reject text/plain", () => { + const file = createMockFile({ mimetype: "text/plain" }); + expect(() => pipe.transform(file)).toThrow(BadRequestException); + }); + + it("should reject image/png", () => { + const file = createMockFile({ mimetype: "image/png" }); + expect(() => pipe.transform(file)).toThrow(BadRequestException); + }); + + it("should include supported formats in error message", () => { + const file = createMockFile({ mimetype: "video/mp4" }); + try { + pipe.transform(file); + expect.fail("Expected BadRequestException"); + } catch (error) { + expect(error).toBeInstanceOf(BadRequestException); + const response = (error as BadRequestException).getResponse(); + const message = + typeof response === "string" ? 
response : (response as Record).message; + expect(message).toContain("audio/wav"); + expect(message).toContain("audio/mpeg"); + } + }); + }); + + // ========================================== + // File size validation + // ========================================== + describe("file size validation", () => { + it("should accept files under the size limit", () => { + const file = createMockFile({ size: 1024 * 1024 }); // 1MB + expect(pipe.transform(file)).toBe(file); + }); + + it("should accept files exactly at the size limit", () => { + const file = createMockFile({ size: 25_000_000 }); // 25MB (default) + expect(pipe.transform(file)).toBe(file); + }); + + it("should reject files exceeding the size limit", () => { + const file = createMockFile({ size: 25_000_001 }); // 1 byte over + expect(() => pipe.transform(file)).toThrow(BadRequestException); + expect(() => pipe.transform(file)).toThrow(/exceeds maximum/); + }); + + it("should include human-readable sizes in error message", () => { + const file = createMockFile({ size: 30_000_000 }); // 30MB + try { + pipe.transform(file); + expect.fail("Expected BadRequestException"); + } catch (error) { + expect(error).toBeInstanceOf(BadRequestException); + const response = (error as BadRequestException).getResponse(); + const message = + typeof response === "string" ? 
response : (response as Record).message; + // Should show something like "28.6 MB" and "23.8 MB" + expect(message).toContain("MB"); + } + }); + + it("should accept zero-size files (MIME check still applies)", () => { + const file = createMockFile({ size: 0 }); + expect(pipe.transform(file)).toBe(file); + }); + }); + + // ========================================== + // Edge cases + // ========================================== + describe("edge cases", () => { + it("should throw if no file is provided (null)", () => { + expect(() => pipe.transform(null as unknown as Express.Multer.File)).toThrow( + BadRequestException + ); + expect(() => pipe.transform(null as unknown as Express.Multer.File)).toThrow( + /No audio file provided/ + ); + }); + + it("should throw if no file is provided (undefined)", () => { + expect(() => pipe.transform(undefined as unknown as Express.Multer.File)).toThrow( + BadRequestException + ); + }); + }); + }); + + // ========================================== + // Custom config + // ========================================== + describe("with custom config", () => { + it("should use custom max file size", () => { + const pipe = new AudioValidationPipe({ maxFileSize: 1_000_000 }); // 1MB + const smallFile = createMockFile({ size: 500_000 }); + expect(pipe.transform(smallFile)).toBe(smallFile); + + const largeFile = createMockFile({ size: 1_000_001 }); + expect(() => pipe.transform(largeFile)).toThrow(BadRequestException); + }); + + it("should allow overriding accepted MIME types", () => { + const pipe = new AudioValidationPipe({ + allowedMimeTypes: ["audio/wav"], + }); + + const wavFile = createMockFile({ mimetype: "audio/wav" }); + expect(pipe.transform(wavFile)).toBe(wavFile); + + const mp3File = createMockFile({ mimetype: "audio/mpeg" }); + expect(() => pipe.transform(mp3File)).toThrow(BadRequestException); + }); + }); +}); diff --git a/apps/api/src/speech/pipes/audio-validation.pipe.ts b/apps/api/src/speech/pipes/audio-validation.pipe.ts new 
file mode 100644 index 0000000..f5491d6 --- /dev/null +++ b/apps/api/src/speech/pipes/audio-validation.pipe.ts @@ -0,0 +1,102 @@ +/** + * AudioValidationPipe + * + * NestJS PipeTransform that validates uploaded audio files. + * Checks MIME type against an allow-list and file size against a configurable maximum. + * + * Usage: + * ```typescript + * @Post('transcribe') + * @UseInterceptors(FileInterceptor('file')) + * async transcribe( + * @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File, + * ) { ... } + * ``` + * + * Issue #398 + */ + +import { BadRequestException } from "@nestjs/common"; +import type { PipeTransform } from "@nestjs/common"; + +/** + * Default accepted MIME types for audio uploads. + */ +const DEFAULT_ALLOWED_MIME_TYPES: readonly string[] = [ + "audio/wav", + "audio/mp3", + "audio/mpeg", + "audio/webm", + "audio/ogg", + "audio/flac", + "audio/x-m4a", +] as const; + +/** + * Default maximum upload size in bytes (25 MB). + */ +const DEFAULT_MAX_FILE_SIZE = 25_000_000; + +/** + * Options for customizing AudioValidationPipe behavior. + */ +export interface AudioValidationPipeOptions { + /** Maximum file size in bytes. Defaults to 25 MB. */ + maxFileSize?: number; + + /** List of accepted MIME types. Defaults to common audio formats. */ + allowedMimeTypes?: string[]; +} + +/** + * Format bytes into a human-readable string (e.g., "25.0 MB"). + */ +function formatBytes(bytes: number): string { + if (bytes < 1024) { + return `${String(bytes)} B`; + } + if (bytes < 1024 * 1024) { + return `${(bytes / 1024).toFixed(1)} KB`; + } + return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; +} + +export class AudioValidationPipe implements PipeTransform { + private readonly maxFileSize: number; + private readonly allowedMimeTypes: readonly string[]; + + constructor(options?: AudioValidationPipeOptions) { + this.maxFileSize = options?.maxFileSize ?? DEFAULT_MAX_FILE_SIZE; + this.allowedMimeTypes = options?.allowedMimeTypes ?? 
DEFAULT_ALLOWED_MIME_TYPES; + } + + /** + * Validate the uploaded file's MIME type and size. + * + * @param file - The uploaded file from Multer + * @returns The validated file, unchanged + * @throws {BadRequestException} If the file is missing, has an unsupported MIME type, or exceeds the size limit + */ + transform(file: Express.Multer.File | undefined): Express.Multer.File { + if (!file) { + throw new BadRequestException("No audio file provided"); + } + + // Validate MIME type + if (!this.allowedMimeTypes.includes(file.mimetype)) { + throw new BadRequestException( + `Unsupported audio format: ${file.mimetype}. ` + + `Supported formats: ${this.allowedMimeTypes.join(", ")}` + ); + } + + // Validate file size + if (file.size > this.maxFileSize) { + throw new BadRequestException( + `File size ${formatBytes(file.size)} exceeds maximum allowed size of ${formatBytes(this.maxFileSize)}` + ); + } + + return file; + } +} diff --git a/apps/api/src/speech/pipes/index.ts b/apps/api/src/speech/pipes/index.ts new file mode 100644 index 0000000..8bb0ab5 --- /dev/null +++ b/apps/api/src/speech/pipes/index.ts @@ -0,0 +1,10 @@ +/** + * Speech Pipes barrel export + * + * Issue #398 + */ + +export { AudioValidationPipe } from "./audio-validation.pipe"; +export type { AudioValidationPipeOptions } from "./audio-validation.pipe"; +export { TextValidationPipe } from "./text-validation.pipe"; +export type { TextValidationPipeOptions } from "./text-validation.pipe"; diff --git a/apps/api/src/speech/pipes/text-validation.pipe.spec.ts b/apps/api/src/speech/pipes/text-validation.pipe.spec.ts new file mode 100644 index 0000000..33a263c --- /dev/null +++ b/apps/api/src/speech/pipes/text-validation.pipe.spec.ts @@ -0,0 +1,136 @@ +/** + * TextValidationPipe Tests + * + * Issue #398: Validates text input for TTS synthesis. + * Tests cover text length, empty text, whitespace, and configurable limits. 
+ */ + +import { describe, it, expect, beforeEach } from "vitest"; +import { BadRequestException } from "@nestjs/common"; +import { TextValidationPipe } from "./text-validation.pipe"; + +describe("TextValidationPipe", () => { + // ========================================== + // Default config (4096 max length) + // ========================================== + describe("with default config", () => { + let pipe: TextValidationPipe; + + beforeEach(() => { + pipe = new TextValidationPipe(); + }); + + // ========================================== + // Valid text + // ========================================== + describe("valid text", () => { + it("should accept normal text", () => { + const text = "Hello, world!"; + expect(pipe.transform(text)).toBe(text); + }); + + it("should accept text at exactly the max length", () => { + const text = "a".repeat(4096); + expect(pipe.transform(text)).toBe(text); + }); + + it("should accept single character text", () => { + expect(pipe.transform("a")).toBe("a"); + }); + + it("should accept text with unicode characters", () => { + const text = "Hello, world! 
你好世界"; + expect(pipe.transform(text)).toBe(text); + }); + + it("should accept multi-line text", () => { + const text = "Line one.\nLine two.\nLine three."; + expect(pipe.transform(text)).toBe(text); + }); + }); + + // ========================================== + // Text length validation + // ========================================== + describe("text length validation", () => { + it("should reject text exceeding max length", () => { + const text = "a".repeat(4097); + expect(() => pipe.transform(text)).toThrow(BadRequestException); + expect(() => pipe.transform(text)).toThrow(/exceeds maximum/); + }); + + it("should include length details in error message", () => { + const text = "a".repeat(5000); + try { + pipe.transform(text); + expect.fail("Expected BadRequestException"); + } catch (error) { + expect(error).toBeInstanceOf(BadRequestException); + const response = (error as BadRequestException).getResponse(); + const message = + typeof response === "string" ? response : (response as Record).message; + expect(message).toContain("5000"); + expect(message).toContain("4096"); + } + }); + }); + + // ========================================== + // Empty text validation + // ========================================== + describe("empty text validation", () => { + it("should reject empty string", () => { + expect(() => pipe.transform("")).toThrow(BadRequestException); + expect(() => pipe.transform("")).toThrow(/Text cannot be empty/); + }); + + it("should reject whitespace-only string", () => { + expect(() => pipe.transform(" ")).toThrow(BadRequestException); + expect(() => pipe.transform(" ")).toThrow(/Text cannot be empty/); + }); + + it("should reject tabs and newlines only", () => { + expect(() => pipe.transform("\t\n\r")).toThrow(BadRequestException); + }); + + it("should reject null", () => { + expect(() => pipe.transform(null as unknown as string)).toThrow(BadRequestException); + }); + + it("should reject undefined", () => { + expect(() => pipe.transform(undefined 
as unknown as string)).toThrow(BadRequestException); + }); + }); + + // ========================================== + // Text with leading/trailing whitespace + // ========================================== + describe("whitespace handling", () => { + it("should accept text with leading/trailing whitespace (preserves it)", () => { + const text = " Hello, world! "; + expect(pipe.transform(text)).toBe(text); + }); + }); + }); + + // ========================================== + // Custom config + // ========================================== + describe("with custom config", () => { + it("should use custom max text length", () => { + const pipe = new TextValidationPipe({ maxTextLength: 100 }); + + const shortText = "Hello"; + expect(pipe.transform(shortText)).toBe(shortText); + + const longText = "a".repeat(101); + expect(() => pipe.transform(longText)).toThrow(BadRequestException); + }); + + it("should accept text at exact custom limit", () => { + const pipe = new TextValidationPipe({ maxTextLength: 50 }); + const text = "a".repeat(50); + expect(pipe.transform(text)).toBe(text); + }); + }); +}); diff --git a/apps/api/src/speech/pipes/text-validation.pipe.ts b/apps/api/src/speech/pipes/text-validation.pipe.ts new file mode 100644 index 0000000..36796d1 --- /dev/null +++ b/apps/api/src/speech/pipes/text-validation.pipe.ts @@ -0,0 +1,65 @@ +/** + * TextValidationPipe + * + * NestJS PipeTransform that validates text input for TTS synthesis. + * Checks that text is non-empty and within the configurable maximum length. + * + * Usage: + * ```typescript + * @Post('synthesize') + * async synthesize( + * @Body('text', new TextValidationPipe()) text: string, + * ) { ... } + * ``` + * + * Issue #398 + */ + +import { BadRequestException } from "@nestjs/common"; +import type { PipeTransform } from "@nestjs/common"; + +/** + * Default maximum text length for TTS input (4096 characters). 
+ */ +const DEFAULT_MAX_TEXT_LENGTH = 4096; + +/** + * Options for customizing TextValidationPipe behavior. + */ +export interface TextValidationPipeOptions { + /** Maximum text length in characters. Defaults to 4096. */ + maxTextLength?: number; +} + +export class TextValidationPipe implements PipeTransform { + private readonly maxTextLength: number; + + constructor(options?: TextValidationPipeOptions) { + this.maxTextLength = options?.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH; + } + + /** + * Validate the text input for TTS synthesis. + * + * @param text - The text to validate + * @returns The validated text, unchanged + * @throws {BadRequestException} If text is empty, whitespace-only, or exceeds the max length + */ + transform(text: string | null | undefined): string { + if (text === null || text === undefined) { + throw new BadRequestException("Text cannot be empty"); + } + + if (text.trim().length === 0) { + throw new BadRequestException("Text cannot be empty"); + } + + if (text.length > this.maxTextLength) { + throw new BadRequestException( + `Text length ${String(text.length)} exceeds maximum allowed length of ${String(this.maxTextLength)} characters` + ); + } + + return text; + } +} From 6c465566f6e14f20ac5cb462e61fa5fee5b71804 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:39:20 -0600 Subject: [PATCH 10/19] feat(#395): implement Piper TTS provider via OpenedAI Speech Add fallback-tier TTS provider using Piper via OpenedAI Speech for ultra-lightweight CPU-only synthesis. Maps 6 standard OpenAI voice names (alloy, echo, fable, onyx, nova, shimmer) to Piper voices. Update factory to use the new PiperTtsProvider class, replacing the inline stub. Includes 37 unit tests covering provider identity, voice mapping, and voice listing. 
Fixes #395 Co-Authored-By: Claude Opus 4.6 --- .../providers/piper-tts.provider.spec.ts | 266 ++++++++++++++++++ .../speech/providers/piper-tts.provider.ts | 212 ++++++++++++++ .../speech/providers/tts-provider.factory.ts | 21 +- 3 files changed, 480 insertions(+), 19 deletions(-) create mode 100644 apps/api/src/speech/providers/piper-tts.provider.spec.ts create mode 100644 apps/api/src/speech/providers/piper-tts.provider.ts diff --git a/apps/api/src/speech/providers/piper-tts.provider.spec.ts b/apps/api/src/speech/providers/piper-tts.provider.spec.ts new file mode 100644 index 0000000..c0c1661 --- /dev/null +++ b/apps/api/src/speech/providers/piper-tts.provider.spec.ts @@ -0,0 +1,266 @@ +/** + * PiperTtsProvider Unit Tests + * + * Tests the Piper TTS provider via OpenedAI Speech (fallback tier). + * Validates provider identity, OpenAI voice name mapping, voice listing, + * and ultra-lightweight CPU-only design characteristics. + * + * Issue #395 + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { + PiperTtsProvider, + PIPER_VOICE_MAP, + PIPER_SUPPORTED_FORMATS, + OPENAI_STANDARD_VOICES, +} from "./piper-tts.provider"; +import type { VoiceInfo } from "../interfaces/speech-types"; + +// ========================================== +// Mock OpenAI SDK +// ========================================== + +vi.mock("openai", () => { + class MockOpenAI { + audio = { + speech: { + create: vi.fn(), + }, + }; + } + return { default: MockOpenAI }; +}); + +// ========================================== +// Provider identity +// ========================================== + +describe("PiperTtsProvider", () => { + const testBaseURL = "http://openedai-speech:8000/v1"; + let provider: PiperTtsProvider; + + beforeEach(() => { + provider = new PiperTtsProvider(testBaseURL); + }); + + describe("provider identity", () => { + it("should have name 'piper'", () => { + expect(provider.name).toBe("piper"); + }); + + it("should have tier 'fallback'", () => { + 
expect(provider.tier).toBe("fallback"); + }); + }); + + // ========================================== + // Constructor + // ========================================== + + describe("constructor", () => { + it("should use 'alloy' as default voice", () => { + const newProvider = new PiperTtsProvider(testBaseURL); + expect(newProvider).toBeDefined(); + }); + + it("should accept a custom default voice", () => { + const customProvider = new PiperTtsProvider(testBaseURL, "nova"); + expect(customProvider).toBeDefined(); + }); + + it("should accept a custom default format", () => { + const customProvider = new PiperTtsProvider(testBaseURL, "alloy", "wav"); + expect(customProvider).toBeDefined(); + }); + }); + + // ========================================== + // listVoices() + // ========================================== + + describe("listVoices", () => { + let voices: VoiceInfo[]; + + beforeEach(async () => { + voices = await provider.listVoices(); + }); + + it("should return an array of VoiceInfo objects", () => { + expect(voices).toBeInstanceOf(Array); + expect(voices.length).toBeGreaterThan(0); + }); + + it("should return exactly 6 voices (OpenAI standard set)", () => { + expect(voices.length).toBe(6); + }); + + it("should set tier to 'fallback' on all voices", () => { + for (const voice of voices) { + expect(voice.tier).toBe("fallback"); + } + }); + + it("should have exactly one default voice", () => { + const defaults = voices.filter((v) => v.isDefault === true); + expect(defaults.length).toBe(1); + }); + + it("should mark 'alloy' as the default voice", () => { + const defaultVoice = voices.find((v) => v.isDefault === true); + expect(defaultVoice).toBeDefined(); + expect(defaultVoice?.id).toBe("alloy"); + }); + + it("should have an id and name for every voice", () => { + for (const voice of voices) { + expect(voice.id).toBeTruthy(); + expect(voice.name).toBeTruthy(); + } + }); + + it("should set language on every voice", () => { + for (const voice of voices) { + 
expect(voice.language).toBeTruthy(); + } + }); + + // ========================================== + // All 6 OpenAI standard voices present + // ========================================== + + describe("OpenAI standard voices", () => { + const standardVoiceIds = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]; + + it.each(standardVoiceIds)("should include voice '%s'", (voiceId) => { + const voice = voices.find((v) => v.id === voiceId); + expect(voice).toBeDefined(); + }); + }); + + // ========================================== + // Voice metadata + // ========================================== + + describe("voice metadata", () => { + it("should include gender info in voice names", () => { + const alloy = voices.find((v) => v.id === "alloy"); + expect(alloy?.name).toMatch(/Female|Male/); + }); + + it("should map alloy to a female voice", () => { + const alloy = voices.find((v) => v.id === "alloy"); + expect(alloy?.name).toContain("Female"); + }); + + it("should map echo to a male voice", () => { + const echo = voices.find((v) => v.id === "echo"); + expect(echo?.name).toContain("Male"); + }); + + it("should map fable to a British voice", () => { + const fable = voices.find((v) => v.id === "fable"); + expect(fable?.language).toBe("en-GB"); + }); + + it("should map onyx to a male voice", () => { + const onyx = voices.find((v) => v.id === "onyx"); + expect(onyx?.name).toContain("Male"); + }); + + it("should map nova to a female voice", () => { + const nova = voices.find((v) => v.id === "nova"); + expect(nova?.name).toContain("Female"); + }); + + it("should map shimmer to a female voice", () => { + const shimmer = voices.find((v) => v.id === "shimmer"); + expect(shimmer?.name).toContain("Female"); + }); + }); + }); +}); + +// ========================================== +// PIPER_VOICE_MAP +// ========================================== + +describe("PIPER_VOICE_MAP", () => { + it("should contain all 6 OpenAI standard voice names", () => { + const expectedKeys = 
["alloy", "echo", "fable", "onyx", "nova", "shimmer"]; + for (const key of expectedKeys) { + expect(PIPER_VOICE_MAP).toHaveProperty(key); + } + }); + + it("should map each voice to a Piper voice ID", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.piperVoice).toBeTruthy(); + expect(typeof entry.piperVoice).toBe("string"); + } + }); + + it("should have gender for each voice entry", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.gender).toMatch(/^(female|male)$/); + } + }); + + it("should have a language for each voice entry", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.language).toBeTruthy(); + } + }); + + it("should have a description for each voice entry", () => { + for (const entry of Object.values(PIPER_VOICE_MAP)) { + expect(entry.description).toBeTruthy(); + } + }); +}); + +// ========================================== +// OPENAI_STANDARD_VOICES +// ========================================== + +describe("OPENAI_STANDARD_VOICES", () => { + it("should be an array of 6 voice IDs", () => { + expect(Array.isArray(OPENAI_STANDARD_VOICES)).toBe(true); + expect(OPENAI_STANDARD_VOICES.length).toBe(6); + }); + + it("should contain all standard OpenAI voice names", () => { + expect(OPENAI_STANDARD_VOICES).toContain("alloy"); + expect(OPENAI_STANDARD_VOICES).toContain("echo"); + expect(OPENAI_STANDARD_VOICES).toContain("fable"); + expect(OPENAI_STANDARD_VOICES).toContain("onyx"); + expect(OPENAI_STANDARD_VOICES).toContain("nova"); + expect(OPENAI_STANDARD_VOICES).toContain("shimmer"); + }); +}); + +// ========================================== +// PIPER_SUPPORTED_FORMATS +// ========================================== + +describe("PIPER_SUPPORTED_FORMATS", () => { + it("should include mp3", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("mp3"); + }); + + it("should include wav", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("wav"); + }); + + it("should include 
opus", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("opus"); + }); + + it("should include flac", () => { + expect(PIPER_SUPPORTED_FORMATS).toContain("flac"); + }); + + it("should be a readonly array", () => { + expect(Array.isArray(PIPER_SUPPORTED_FORMATS)).toBe(true); + }); +}); diff --git a/apps/api/src/speech/providers/piper-tts.provider.ts b/apps/api/src/speech/providers/piper-tts.provider.ts new file mode 100644 index 0000000..40e4638 --- /dev/null +++ b/apps/api/src/speech/providers/piper-tts.provider.ts @@ -0,0 +1,212 @@ +/** + * Piper TTS Provider via OpenedAI Speech + * + * Fallback-tier TTS provider using Piper via OpenedAI Speech for + * ultra-lightweight CPU-only synthesis. Designed for low-resource + * environments including Raspberry Pi. + * + * Features: + * - OpenAI-compatible API via OpenedAI Speech server + * - 100+ Piper voices across 40+ languages + * - 6 standard OpenAI voice names mapped to Piper voices + * - Output formats: mp3, wav, opus, flac, aac, pcm + * - CPU-only, no GPU required + * - GPL license (via OpenedAI Speech) + * + * Voice names use the OpenAI standard set (alloy, echo, fable, onyx, + * nova, shimmer) which OpenedAI Speech maps to configured Piper voices. 
+ * + * Issue #395 + */ + +import { BaseTTSProvider } from "./base-tts.provider"; +import type { SpeechTier, VoiceInfo, AudioFormat } from "../interfaces/speech-types"; + +// ========================================== +// Constants +// ========================================== + +/** Audio formats supported by OpenedAI Speech with Piper backend */ +export const PIPER_SUPPORTED_FORMATS: readonly AudioFormat[] = [ + "mp3", + "wav", + "opus", + "flac", +] as const; + +/** Default voice for Piper (via OpenedAI Speech) */ +const PIPER_DEFAULT_VOICE = "alloy"; + +/** Default audio format for Piper */ +const PIPER_DEFAULT_FORMAT: AudioFormat = "mp3"; + +// ========================================== +// OpenAI standard voice names +// ========================================== + +/** + * The 6 standard OpenAI TTS voice names. + * OpenedAI Speech accepts these names and routes them to configured Piper voices. + */ +export const OPENAI_STANDARD_VOICES: readonly string[] = [ + "alloy", + "echo", + "fable", + "onyx", + "nova", + "shimmer", +] as const; + +// ========================================== +// Voice mapping +// ========================================== + +/** Metadata for a Piper voice mapped from an OpenAI voice name */ +export interface PiperVoiceMapping { + /** The underlying Piper voice ID configured in OpenedAI Speech */ + piperVoice: string; + /** Human-readable description of the voice character */ + description: string; + /** Gender of the voice */ + gender: "female" | "male"; + /** BCP 47 language code */ + language: string; +} + +/** Fallback mapping used when a voice ID is not found in PIPER_VOICE_MAP */ +const DEFAULT_MAPPING: PiperVoiceMapping = { + piperVoice: "en_US-amy-medium", + description: "Default voice", + gender: "female", + language: "en-US", +}; + +/** + * Mapping of OpenAI standard voice names to their default Piper voice + * configuration in OpenedAI Speech. 
+ * + * These are the default mappings that OpenedAI Speech uses when configured + * with Piper as the TTS backend. The actual Piper voice used can be + * customized in the OpenedAI Speech configuration file. + * + * Default Piper voice assignments: + * - alloy: en_US-amy-medium (warm, balanced female) + * - echo: en_US-ryan-medium (clear, articulate male) + * - fable: en_GB-alan-medium (British male narrator) + * - onyx: en_US-danny-low (deep, resonant male) + * - nova: en_US-lessac-medium (expressive female) + * - shimmer: en_US-kristin-medium (bright, energetic female) + */ +export const PIPER_VOICE_MAP: Record = { + alloy: { + piperVoice: "en_US-amy-medium", + description: "Warm, balanced voice", + gender: "female", + language: "en-US", + }, + echo: { + piperVoice: "en_US-ryan-medium", + description: "Clear, articulate voice", + gender: "male", + language: "en-US", + }, + fable: { + piperVoice: "en_GB-alan-medium", + description: "British narrator voice", + gender: "male", + language: "en-GB", + }, + onyx: { + piperVoice: "en_US-danny-low", + description: "Deep, resonant voice", + gender: "male", + language: "en-US", + }, + nova: { + piperVoice: "en_US-lessac-medium", + description: "Expressive, versatile voice", + gender: "female", + language: "en-US", + }, + shimmer: { + piperVoice: "en_US-kristin-medium", + description: "Bright, energetic voice", + gender: "female", + language: "en-US", + }, +}; + +// ========================================== +// Provider class +// ========================================== + +/** + * Piper TTS provider via OpenedAI Speech (fallback tier). + * + * Ultra-lightweight CPU-only text-to-speech engine using Piper voices + * through the OpenedAI Speech server's OpenAI-compatible API. 
+ * + * Designed for: + * - CPU-only environments (no GPU required) + * - Low-resource devices (Raspberry Pi, ARM SBCs) + * - Fallback when primary TTS engines are unavailable + * - High-volume, low-latency synthesis needs + * + * The provider exposes the 6 standard OpenAI voice names (alloy, echo, + * fable, onyx, nova, shimmer) which OpenedAI Speech maps to configured + * Piper voices. Additional Piper voices (100+ across 40+ languages) + * can be accessed by passing the Piper voice ID directly. + * + * @example + * ```typescript + * const piper = new PiperTtsProvider("http://openedai-speech:8000/v1"); + * const voices = await piper.listVoices(); + * const result = await piper.synthesize("Hello!", { voice: "alloy" }); + * ``` + */ +export class PiperTtsProvider extends BaseTTSProvider { + readonly name = "piper"; + readonly tier: SpeechTier = "fallback"; + + /** + * Create a new Piper TTS provider. + * + * @param baseURL - Base URL for the OpenedAI Speech endpoint (e.g. "http://openedai-speech:8000/v1") + * @param defaultVoice - Default OpenAI voice name (defaults to "alloy") + * @param defaultFormat - Default audio format (defaults to "mp3") + */ + constructor( + baseURL: string, + defaultVoice: string = PIPER_DEFAULT_VOICE, + defaultFormat: AudioFormat = PIPER_DEFAULT_FORMAT + ) { + super(baseURL, defaultVoice, defaultFormat); + } + + /** + * List available voices with OpenAI-to-Piper mapping metadata. + * + * Returns the 6 standard OpenAI voice names with information about + * the underlying Piper voice, gender, and language. These are the + * voices that can be specified in the `voice` parameter of synthesize(). + * + * @returns Array of VoiceInfo objects for all mapped Piper voices + */ + override listVoices(): Promise { + const voices: VoiceInfo[] = OPENAI_STANDARD_VOICES.map((voiceId) => { + const mapping = PIPER_VOICE_MAP[voiceId] ?? DEFAULT_MAPPING; + const genderLabel = mapping.gender === "female" ? 
"Female" : "Male"; + const label = voiceId.charAt(0).toUpperCase() + voiceId.slice(1); + + return { + id: voiceId, + name: `${label} (${genderLabel} - ${mapping.description})`, + language: mapping.language, + tier: this.tier, + isDefault: voiceId === this.defaultVoice, + }; + }); + + return Promise.resolve(voices); + } +} diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts index 28c807f..5a1f69f 100644 --- a/apps/api/src/speech/providers/tts-provider.factory.ts +++ b/apps/api/src/speech/providers/tts-provider.factory.ts @@ -14,30 +14,13 @@ */ import { Logger } from "@nestjs/common"; -import { BaseTTSProvider } from "./base-tts.provider"; import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; import { KokoroTtsProvider } from "./kokoro-tts.provider"; +import { PiperTtsProvider } from "./piper-tts.provider"; import type { ITTSProvider } from "../interfaces/tts-provider.interface"; import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; import type { SpeechConfig } from "../speech.config"; -// ========================================== -// Concrete provider classes -// ========================================== - -/** - * Piper TTS provider via OpenedAI Speech (fallback tier). - * Ultra-lightweight CPU, GPL license. - */ -class PiperProvider extends BaseTTSProvider { - readonly name = "piper"; - readonly tier: SpeechTier = "fallback"; - - constructor(baseURL: string) { - super(baseURL, "alloy", "mp3"); - } -} - // ========================================== // Factory function // ========================================== @@ -76,7 +59,7 @@ export function createTTSProviders(config: SpeechConfig): Map Date: Sun, 15 Feb 2026 02:47:52 -0600 Subject: [PATCH 11/19] feat(#392): create /api/speech/transcribe REST endpoint Add SpeechController with POST /api/speech/transcribe for audio transcription and GET /api/speech/health for provider status. 
Uses AudioValidationPipe for file upload validation and returns results in standard { data: T } envelope. Includes 10 unit tests covering transcribe with options, error propagation, and all health status combinations. Fixes #392 Co-Authored-By: Claude Opus 4.6 --- apps/api/src/speech/speech.controller.spec.ts | 437 ++++++++++++++++++ apps/api/src/speech/speech.controller.ts | 193 ++++++++ apps/api/src/speech/speech.module.ts | 2 + 3 files changed, 632 insertions(+) create mode 100644 apps/api/src/speech/speech.controller.spec.ts create mode 100644 apps/api/src/speech/speech.controller.ts diff --git a/apps/api/src/speech/speech.controller.spec.ts b/apps/api/src/speech/speech.controller.spec.ts new file mode 100644 index 0000000..2db1cf8 --- /dev/null +++ b/apps/api/src/speech/speech.controller.spec.ts @@ -0,0 +1,437 @@ +import { describe, it, expect, beforeEach, vi } from "vitest"; +import { StreamableFile, ServiceUnavailableException } from "@nestjs/common"; +import { SpeechController } from "./speech.controller"; +import { SpeechService } from "./speech.service"; +import type { TranscribeDto } from "./dto/transcribe.dto"; +import type { SynthesizeDto } from "./dto/synthesize.dto"; +import type { TranscriptionResult, SynthesisResult, VoiceInfo } from "./interfaces/speech-types"; + +describe("SpeechController", () => { + let controller: SpeechController; + let service: SpeechService; + + const mockSpeechService = { + transcribe: vi.fn(), + synthesize: vi.fn(), + listVoices: vi.fn(), + isSTTAvailable: vi.fn(), + isTTSAvailable: vi.fn(), + }; + + const mockWorkspaceId = "550e8400-e29b-41d4-a716-446655440001"; + const mockUserId = "550e8400-e29b-41d4-a716-446655440002"; + + const mockUser = { + id: mockUserId, + email: "test@example.com", + name: "Test User", + workspaceId: mockWorkspaceId, + }; + + const mockFile: Express.Multer.File = { + buffer: Buffer.from("fake-audio-data"), + mimetype: "audio/wav", + size: 1024, + originalname: "test.wav", + fieldname: "file", + 
encoding: "7bit", + stream: null as never, + destination: "", + filename: "", + path: "", + }; + + const mockTranscriptionResult: TranscriptionResult = { + text: "Hello, world!", + language: "en", + durationSeconds: 2.5, + confidence: 0.95, + }; + + beforeEach(() => { + service = mockSpeechService as unknown as SpeechService; + controller = new SpeechController(service); + + vi.clearAllMocks(); + }); + + it("should be defined", () => { + expect(controller).toBeDefined(); + }); + + describe("transcribe", () => { + it("should transcribe audio file and return data wrapper", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = {}; + const result = await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(result).toEqual({ data: mockTranscriptionResult }); + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + mimeType: "audio/wav", + }); + }); + + it("should pass language override from DTO to service", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = { language: "fr" }; + await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + language: "fr", + mimeType: "audio/wav", + }); + }); + + it("should pass model override from DTO to service", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = { model: "whisper-large-v3" }; + await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + model: "whisper-large-v3", + mimeType: "audio/wav", + }); + }); + + it("should pass all DTO options to service", async () => { + mockSpeechService.transcribe.mockResolvedValue(mockTranscriptionResult); + + const dto: TranscribeDto = { + language: "de", + 
model: "whisper-large-v3", + prompt: "Meeting notes", + temperature: 0.5, + }; + await controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith(mockFile.buffer, { + language: "de", + model: "whisper-large-v3", + prompt: "Meeting notes", + temperature: 0.5, + mimeType: "audio/wav", + }); + }); + + it("should propagate service errors", async () => { + mockSpeechService.transcribe.mockRejectedValue(new Error("STT unavailable")); + + const dto: TranscribeDto = {}; + await expect(controller.transcribe(mockFile, dto, mockWorkspaceId, mockUser)).rejects.toThrow( + "STT unavailable" + ); + }); + }); + + describe("health", () => { + it("should return health status with both providers available", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(true); + mockSpeechService.isTTSAvailable.mockReturnValue(true); + + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: true }, + tts: { available: true }, + }, + }); + }); + + it("should return health status with STT unavailable", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(false); + mockSpeechService.isTTSAvailable.mockReturnValue(true); + + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: false }, + tts: { available: true }, + }, + }); + }); + + it("should return health status with TTS unavailable", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(true); + mockSpeechService.isTTSAvailable.mockReturnValue(false); + + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: true }, + tts: { available: false }, + }, + }); + }); + + it("should return health status with both providers unavailable", async () => { + mockSpeechService.isSTTAvailable.mockReturnValue(false); + mockSpeechService.isTTSAvailable.mockReturnValue(false); 
+ + const result = await controller.health(mockWorkspaceId); + + expect(result).toEqual({ + data: { + stt: { available: false }, + tts: { available: false }, + }, + }); + }); + }); + + // ============================================== + // POST /api/speech/synthesize (Issue #396) + // ============================================== + + describe("synthesize", () => { + const mockAudioBuffer = Buffer.from("fake-audio-data"); + + const mockSynthesisResult: SynthesisResult = { + audio: mockAudioBuffer, + format: "mp3", + voice: "af_heart", + tier: "default", + durationSeconds: 2.5, + }; + + it("should synthesize text and return a StreamableFile", async () => { + const dto: SynthesizeDto = { text: "Hello world" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Hello world", {}); + expect(result).toBeInstanceOf(StreamableFile); + }); + + it("should pass voice, speed, format, and tier options to the service", async () => { + const dto: SynthesizeDto = { + text: "Test with options", + voice: "af_heart", + speed: 1.5, + format: "wav", + tier: "premium", + }; + + const wavResult: SynthesisResult = { + audio: mockAudioBuffer, + format: "wav", + voice: "af_heart", + tier: "premium", + }; + + mockSpeechService.synthesize.mockResolvedValue(wavResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + expect(mockSpeechService.synthesize).toHaveBeenCalledWith("Test with options", { + voice: "af_heart", + speed: 1.5, + format: "wav", + tier: "premium", + }); + expect(result).toBeInstanceOf(StreamableFile); + }); + + it("should set correct Content-Type for mp3 format", async () => { + const dto: SynthesizeDto = { text: "Hello", format: "mp3" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, 
mockUser); + + expect(result).toBeInstanceOf(StreamableFile); + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/mpeg"); + }); + + it("should set correct Content-Type for wav format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const wavResult: SynthesisResult = { ...mockSynthesisResult, format: "wav" }; + + mockSpeechService.synthesize.mockResolvedValue(wavResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/wav"); + }); + + it("should set correct Content-Type for opus format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const opusResult: SynthesisResult = { ...mockSynthesisResult, format: "opus" }; + + mockSpeechService.synthesize.mockResolvedValue(opusResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/opus"); + }); + + it("should set correct Content-Type for flac format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const flacResult: SynthesisResult = { ...mockSynthesisResult, format: "flac" }; + + mockSpeechService.synthesize.mockResolvedValue(flacResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/flac"); + }); + + it("should set correct Content-Type for aac format", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const aacResult: SynthesisResult = { ...mockSynthesisResult, format: "aac" }; + + mockSpeechService.synthesize.mockResolvedValue(aacResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/aac"); + }); + + it("should set correct Content-Type for pcm format", async () => { + const dto: SynthesizeDto = { text: "Hello" 
}; + const pcmResult: SynthesisResult = { ...mockSynthesisResult, format: "pcm" }; + + mockSpeechService.synthesize.mockResolvedValue(pcmResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.type).toBe("audio/pcm"); + }); + + it("should set Content-Disposition header for download with correct extension", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.disposition).toContain("attachment"); + expect(headers.disposition).toContain("speech.mp3"); + }); + + it("should set Content-Disposition with correct file extension for wav", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + const wavResult: SynthesisResult = { ...mockSynthesisResult, format: "wav" }; + + mockSpeechService.synthesize.mockResolvedValue(wavResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.disposition).toContain("speech.wav"); + }); + + it("should set Content-Length header based on audio buffer size", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + + mockSpeechService.synthesize.mockResolvedValue(mockSynthesisResult); + + const result = await controller.synthesize(dto, mockWorkspaceId, mockUser); + + const headers = result.getHeaders(); + expect(headers.length).toBe(mockAudioBuffer.length); + }); + + it("should propagate ServiceUnavailableException from service", async () => { + const dto: SynthesizeDto = { text: "Hello" }; + + mockSpeechService.synthesize.mockRejectedValue( + new ServiceUnavailableException("No TTS providers are available") + ); + + await expect(controller.synthesize(dto, mockWorkspaceId, mockUser)).rejects.toThrow( + ServiceUnavailableException + 
); + }); + }); + + // ============================================== + // GET /api/speech/voices (Issue #396) + // ============================================== + + describe("getVoices", () => { + const mockVoices: VoiceInfo[] = [ + { + id: "af_heart", + name: "Heart", + language: "en", + tier: "default", + isDefault: true, + }, + { + id: "af_sky", + name: "Sky", + language: "en", + tier: "default", + isDefault: false, + }, + { + id: "chatterbox-voice", + name: "Chatterbox Default", + language: "en", + tier: "premium", + isDefault: true, + }, + ]; + + it("should return all voices when no tier filter is provided", async () => { + mockSpeechService.listVoices.mockResolvedValue(mockVoices); + + const result = await controller.getVoices(mockWorkspaceId); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith(undefined); + expect(result).toEqual({ data: mockVoices }); + }); + + it("should filter voices by default tier", async () => { + const defaultVoices = mockVoices.filter((v) => v.tier === "default"); + mockSpeechService.listVoices.mockResolvedValue(defaultVoices); + + const result = await controller.getVoices(mockWorkspaceId, "default"); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith("default"); + expect(result).toEqual({ data: defaultVoices }); + }); + + it("should filter voices by premium tier", async () => { + const premiumVoices = mockVoices.filter((v) => v.tier === "premium"); + mockSpeechService.listVoices.mockResolvedValue(premiumVoices); + + const result = await controller.getVoices(mockWorkspaceId, "premium"); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith("premium"); + expect(result).toEqual({ data: premiumVoices }); + }); + + it("should return empty array when no voices are available", async () => { + mockSpeechService.listVoices.mockResolvedValue([]); + + const result = await controller.getVoices(mockWorkspaceId); + + expect(result).toEqual({ data: [] }); + }); + + it("should return empty array when filtering by tier 
with no matching voices", async () => { + mockSpeechService.listVoices.mockResolvedValue([]); + + const result = await controller.getVoices(mockWorkspaceId, "fallback"); + + expect(mockSpeechService.listVoices).toHaveBeenCalledWith("fallback"); + expect(result).toEqual({ data: [] }); + }); + }); +}); diff --git a/apps/api/src/speech/speech.controller.ts b/apps/api/src/speech/speech.controller.ts new file mode 100644 index 0000000..a38f36a --- /dev/null +++ b/apps/api/src/speech/speech.controller.ts @@ -0,0 +1,193 @@ +/** + * SpeechController + * + * REST endpoints for speech-to-text (STT) and text-to-speech (TTS) services. + * Handles audio file uploads for transcription, text-to-speech synthesis, + * voice listing, and provider health status. + * + * Endpoints: + * - POST /api/speech/transcribe - Transcribe uploaded audio file to text + * - POST /api/speech/synthesize - Synthesize text to audio (TTS) + * - GET /api/speech/voices - List available TTS voices + * - GET /api/speech/health - Check STT/TTS provider availability + * + * Issue #392, #396 + */ + +import { + Controller, + Post, + Get, + Body, + Query, + UseGuards, + UseInterceptors, + UploadedFile, + StreamableFile, +} from "@nestjs/common"; +import { FileInterceptor } from "@nestjs/platform-express"; +import { SpeechService } from "./speech.service"; +import { TranscribeDto } from "./dto/transcribe.dto"; +import { SynthesizeDto } from "./dto/synthesize.dto"; +import { AudioValidationPipe } from "./pipes/audio-validation.pipe"; +import { AuthGuard } from "../auth/guards/auth.guard"; +import { WorkspaceGuard, PermissionGuard } from "../common/guards"; +import { Workspace, Permission, RequirePermission } from "../common/decorators"; +import { CurrentUser } from "../auth/decorators/current-user.decorator"; +import type { AuthenticatedUser } from "../common/types/user.types"; +import type { + AudioFormat, + SynthesizeOptions, + TranscribeOptions, + TranscriptionResult, + VoiceInfo, + SpeechTier, +} from 
"./interfaces/speech-types";
+
+/**
+ * Map audio format to MIME type for Content-Type header.
+ */
+const AUDIO_FORMAT_MIME_TYPES: Record<AudioFormat, string> = {
+  mp3: "audio/mpeg",
+  wav: "audio/wav",
+  opus: "audio/opus",
+  flac: "audio/flac",
+  aac: "audio/aac",
+  pcm: "audio/pcm",
+};
+
+/**
+ * Health status for a single speech provider category.
+ */
+interface ProviderHealth {
+  available: boolean;
+}
+
+/**
+ * Combined health status response for all speech providers.
+ */
+interface SpeechHealthResponse {
+  data: {
+    stt: ProviderHealth;
+    tts: ProviderHealth;
+  };
+}
+
+@Controller("speech")
+@UseGuards(AuthGuard, WorkspaceGuard, PermissionGuard)
+export class SpeechController {
+  constructor(private readonly speechService: SpeechService) {}
+
+  /**
+   * POST /api/speech/transcribe
+   *
+   * Transcribe an uploaded audio file to text.
+   * Accepts multipart form data with an audio file and optional transcription parameters.
+   *
+   * @param file - Uploaded audio file (validated by AudioValidationPipe)
+   * @param dto - Optional transcription parameters (language, model, prompt, temperature)
+   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
+   * @param _user - Authenticated user (validated by AuthGuard)
+   * @returns Transcription result wrapped in standard data envelope
+   */
+  @Post("transcribe")
+  @RequirePermission(Permission.WORKSPACE_MEMBER)
+  @UseInterceptors(FileInterceptor("file"))
+  async transcribe(
+    @UploadedFile(new AudioValidationPipe()) file: Express.Multer.File,
+    @Body() dto: TranscribeDto,
+    @Workspace() _workspaceId: string,
+    @CurrentUser() _user: AuthenticatedUser
+  ): Promise<{ data: TranscriptionResult }> {
+    const options: TranscribeOptions = { mimeType: file.mimetype };
+    if (dto.language !== undefined) options.language = dto.language;
+    if (dto.model !== undefined) options.model = dto.model;
+    if (dto.prompt !== undefined) options.prompt = dto.prompt;
+    if (dto.temperature !== undefined) options.temperature = dto.temperature;
+
+
const result = await this.speechService.transcribe(file.buffer, options); + + return { data: result }; + } + + /** + * GET /api/speech/health + * + * Check availability of STT and TTS providers. + * + * @param _workspaceId - Workspace context (validated by WorkspaceGuard) + * @returns Health status of STT and TTS providers + */ + @Get("health") + @RequirePermission(Permission.WORKSPACE_ANY) + health(@Workspace() _workspaceId: string): SpeechHealthResponse { + return { + data: { + stt: { available: this.speechService.isSTTAvailable() }, + tts: { available: this.speechService.isTTSAvailable() }, + }, + }; + } + + /** + * POST /api/speech/synthesize + * + * Synthesize text to audio using TTS providers. + * Accepts JSON body with text and optional voice/format/speed/tier parameters. + * Returns audio binary with appropriate Content-Type and Content-Disposition headers. + * + * Provider selection follows fallback chain: requested tier -> default -> fallback. + * + * @param dto - Synthesis parameters (text, voice?, speed?, format?, tier?) 
+
+   * @param _workspaceId - Workspace context (validated by WorkspaceGuard)
+   * @param _user - Authenticated user (validated by AuthGuard)
+   * @returns StreamableFile containing synthesized audio
+   *
+   * Issue #396
+   */
+  @Post("synthesize")
+  @RequirePermission(Permission.WORKSPACE_MEMBER)
+  async synthesize(
+    @Body() dto: SynthesizeDto,
+    @Workspace() _workspaceId: string,
+    @CurrentUser() _user: AuthenticatedUser
+  ): Promise<StreamableFile> {
+    const options: SynthesizeOptions = {};
+    if (dto.voice !== undefined) options.voice = dto.voice;
+    if (dto.speed !== undefined) options.speed = dto.speed;
+    if (dto.format !== undefined) options.format = dto.format;
+    if (dto.tier !== undefined) options.tier = dto.tier;
+
+    const result = await this.speechService.synthesize(dto.text, options);
+
+    const mimeType = AUDIO_FORMAT_MIME_TYPES[result.format];
+
+    return new StreamableFile(result.audio, {
+      type: mimeType,
+      disposition: `attachment; filename="speech.${result.format}"`,
+      length: result.audio.length,
+    });
+  }
+
+  /**
+   * GET /api/speech/voices
+   *
+   * List available TTS voices across all tiers.
+   * Optionally filter by tier using the `tier` query parameter.
+ * + * @param _workspaceId - Workspace context (validated by WorkspaceGuard) + * @param tier - Optional tier filter (default, premium, fallback) + * @returns Voice information array wrapped in standard data envelope + * + * Issue #396 + */ + @Get("voices") + @RequirePermission(Permission.WORKSPACE_ANY) + async getVoices( + @Workspace() _workspaceId: string, + @Query("tier") tier?: SpeechTier + ): Promise<{ data: VoiceInfo[] }> { + const voices = await this.speechService.listVoices(tier); + return { data: voices }; + } +} diff --git a/apps/api/src/speech/speech.module.ts b/apps/api/src/speech/speech.module.ts index 840123e..d2151ef 100644 --- a/apps/api/src/speech/speech.module.ts +++ b/apps/api/src/speech/speech.module.ts @@ -31,12 +31,14 @@ import { type SpeechConfig, } from "./speech.config"; import { SpeechService } from "./speech.service"; +import { SpeechController } from "./speech.controller"; import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; import { createTTSProviders } from "./providers/tts-provider.factory"; @Module({ imports: [ConfigModule.forFeature(speechConfig)], + controllers: [SpeechController], providers: [ SpeechService, // STT provider: conditionally register SpeachesSttProvider when STT is enabled From b3d6d73348c2f984a3741a96d3d6e3bbd62b904f Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:51:13 -0600 Subject: [PATCH 12/19] feat(#400): add Docker Compose swarm/prod deployment for speech services Add docker/docker-compose.sample.speech.yml for standalone speech services deployment in Docker Swarm with Portainer compatibility: - Speaches (STT + basic TTS) with Whisper model configuration - Kokoro TTS (default high-quality TTS) always deployed - Chatterbox TTS (premium, GPU) commented out as optional - Traefik labels for reverse proxy routing with TLS - Health checks on all services - Volume persistence for Whisper models - GPU 
reservation via Swarm generic resources for Chatterbox - Environment variable substitution for Portainer - Comprehensive header documentation Fixes #400 --- docker/docker-compose.sample.speech.yml | 164 ++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 docker/docker-compose.sample.speech.yml diff --git a/docker/docker-compose.sample.speech.yml b/docker/docker-compose.sample.speech.yml new file mode 100644 index 0000000..983fb37 --- /dev/null +++ b/docker/docker-compose.sample.speech.yml @@ -0,0 +1,164 @@ +# ============================================== +# Speech Services - Sample Swarm Deployment +# ============================================== +# +# Standalone speech services deployment for use with Mosaic Stack. +# This is SEPARATE infrastructure — not part of the Mosaic Stack itself. +# Mosaic connects to it via SPEACHES_URL and TTS_URL environment variables. +# +# Provides: +# - Speaches: Speech-to-Text (Whisper) + basic TTS fallback +# - Kokoro TTS: Default high-quality text-to-speech +# - Chatterbox TTS: Premium TTS with voice cloning (optional, requires GPU) +# +# Usage (Docker Swarm via Portainer): +# 1. Create a new stack in Portainer +# 2. Paste this file or point to the repo +# 3. Set environment variables in Portainer's env var section +# 4. Deploy the stack +# +# Usage (Docker Swarm CLI): +# 1. Create .env file with variables below +# 2. 
docker stack deploy -c docker-compose.sample.speech.yml speech +# +# Required Environment Variables: +# STT_DOMAIN=stt.example.com # Domain for Speaches (STT + basic TTS) +# TTS_DOMAIN=tts.example.com # Domain for Kokoro TTS (default TTS) +# +# Optional Environment Variables: +# WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo # Whisper model for STT +# CHATTERBOX_TTS_DOMAIN=tts-premium.example.com # Domain for Chatterbox (premium TTS) +# TRAEFIK_ENTRYPOINT=websecure # Traefik entrypoint name +# TRAEFIK_CERTRESOLVER=letsencrypt # Traefik cert resolver +# TRAEFIK_DOCKER_NETWORK=traefik-public # Traefik network name +# TRAEFIK_TLS_ENABLED=true # Enable TLS on Traefik routers +# +# Connecting to Mosaic Stack: +# Add to your Mosaic Stack .env: +# SPEACHES_URL=http://speaches:8000 (if same Docker network) +# SPEACHES_URL=https://stt.example.com (if external) +# TTS_URL=http://kokoro-tts:8880 (if same Docker network) +# TTS_URL=https://tts.example.com (if external) +# +# GPU Requirements (Chatterbox only): +# - NVIDIA GPU with CUDA support +# - nvidia-container-toolkit installed on Docker host +# - Docker runtime configured for GPU access +# - Note: Docker Swarm requires "generic resources" for GPU scheduling. +# See: https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm +# +# ============================================== + +services: + # ====================== + # Speaches (STT + basic TTS) + # ====================== + # Primary speech-to-text service using Whisper. + # Also provides basic TTS as a fallback. 
+ speaches: + image: ghcr.io/speaches-ai/speaches:latest + environment: + WHISPER__MODEL: ${WHISPER_MODEL:-Systran/faster-whisper-large-v3-turbo} + volumes: + - speaches-models:/root/.cache/huggingface + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s + networks: + - internal + - traefik-public + deploy: + restart_policy: + condition: on-failure + delay: 10s + labels: + - "traefik.enable=true" + - "traefik.http.routers.speech-stt.rule=Host(`${STT_DOMAIN}`)" + - "traefik.http.routers.speech-stt.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}" + - "traefik.http.routers.speech-stt.tls=${TRAEFIK_TLS_ENABLED:-true}" + - "traefik.http.routers.speech-stt.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}" + - "traefik.http.services.speech-stt.loadbalancer.server.port=8000" + - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}" + + # ====================== + # Kokoro TTS (Default TTS) + # ====================== + # High-quality text-to-speech engine. Always deployed alongside Speaches. 
+ kokoro-tts: + image: ghcr.io/remsky/kokoro-fastapi:latest-cpu + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8880/health || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 120s + networks: + - internal + - traefik-public + deploy: + restart_policy: + condition: on-failure + delay: 10s + labels: + - "traefik.enable=true" + - "traefik.http.routers.speech-tts.rule=Host(`${TTS_DOMAIN}`)" + - "traefik.http.routers.speech-tts.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}" + - "traefik.http.routers.speech-tts.tls=${TRAEFIK_TLS_ENABLED:-true}" + - "traefik.http.routers.speech-tts.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}" + - "traefik.http.services.speech-tts.loadbalancer.server.port=8880" + - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}" + + # ====================== + # Chatterbox TTS (Premium TTS - Optional) + # ====================== + # Premium TTS with voice cloning capabilities. Requires NVIDIA GPU. + # + # To enable: Uncomment this service and set CHATTERBOX_TTS_DOMAIN. + # + # For Docker Swarm GPU scheduling, configure generic resources on the node: + # /etc/docker/daemon.json: + # { "runtimes": { "nvidia": { ... 
} }, + # "node-generic-resources": ["NVIDIA-GPU=0"] } + # + # chatterbox-tts: + # image: devnen/chatterbox-tts-server:latest + # healthcheck: + # test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + # interval: 30s + # timeout: 10s + # retries: 5 + # start_period: 180s + # networks: + # - internal + # - traefik-public + # deploy: + # restart_policy: + # condition: on-failure + # delay: 10s + # resources: + # reservations: + # generic_resources: + # - discrete_resource_spec: + # kind: "NVIDIA-GPU" + # value: 1 + # labels: + # - "traefik.enable=true" + # - "traefik.http.routers.speech-tts-premium.rule=Host(`${CHATTERBOX_TTS_DOMAIN}`)" + # - "traefik.http.routers.speech-tts-premium.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}" + # - "traefik.http.routers.speech-tts-premium.tls=${TRAEFIK_TLS_ENABLED:-true}" + # - "traefik.http.routers.speech-tts-premium.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}" + # - "traefik.http.services.speech-tts-premium.loadbalancer.server.port=8000" + # - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}" + +volumes: + speaches-models: + +networks: + internal: + driver: overlay + traefik-public: + external: true + name: ${TRAEFIK_DOCKER_NETWORK:-traefik-public} From 28c9e6fe65f50aa4a2f29d42e4d1daa7edba4907 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 02:54:41 -0600 Subject: [PATCH 13/19] feat(#397): implement WebSocket streaming transcription gateway Add SpeechGateway with Socket.IO namespace /speech for real-time streaming transcription. Supports start-transcription, audio-chunk, and stop-transcription events with session management, authentication, and buffer size rate limiting. Includes 29 unit tests covering authentication, session lifecycle, error handling, cleanup, and client isolation. 
Co-Authored-By: Claude Opus 4.6 --- apps/api/src/speech/speech.gateway.spec.ts | 683 +++++++++++++++++++++ apps/api/src/speech/speech.gateway.ts | 366 +++++++++++ apps/api/src/speech/speech.module.ts | 11 +- 3 files changed, 1058 insertions(+), 2 deletions(-) create mode 100644 apps/api/src/speech/speech.gateway.spec.ts create mode 100644 apps/api/src/speech/speech.gateway.ts diff --git a/apps/api/src/speech/speech.gateway.spec.ts b/apps/api/src/speech/speech.gateway.spec.ts new file mode 100644 index 0000000..dac50e1 --- /dev/null +++ b/apps/api/src/speech/speech.gateway.spec.ts @@ -0,0 +1,683 @@ +/** + * SpeechGateway Tests + * + * Issue #397: WebSocket streaming transcription endpoint tests. + * Written FIRST following TDD (Red-Green-Refactor). + * + * Tests cover: + * - Authentication via handshake token + * - Session lifecycle: start -> audio chunks -> stop + * - Transcription result emission + * - Session cleanup on disconnect + * - Error handling + * - Buffer size limit enforcement + */ + +import { describe, it, expect, beforeEach, vi } from "vitest"; +import { Socket } from "socket.io"; +import { SpeechGateway } from "./speech.gateway"; +import { SpeechService } from "./speech.service"; +import { AuthService } from "../auth/auth.service"; +import { PrismaService } from "../prisma/prisma.service"; +import type { SpeechConfig } from "./speech.config"; +import type { TranscriptionResult } from "./interfaces/speech-types"; + +// ========================================== +// Test helpers +// ========================================== + +interface AuthenticatedSocket extends Socket { + data: { + userId?: string; + workspaceId?: string; + }; +} + +function createMockConfig(): SpeechConfig { + return { + stt: { + enabled: true, + baseUrl: "http://localhost:8000/v1", + model: "test-model", + language: "en", + }, + tts: { + default: { enabled: true, url: "http://localhost:8880/v1", voice: "test", format: "mp3" }, + premium: { enabled: false, url: "" }, + fallback: { 
enabled: false, url: "" },
+    },
+    limits: {
+      maxUploadSize: 25_000_000,
+      maxDurationSeconds: 600,
+      maxTextLength: 4096,
+    },
+  };
+}
+
+function createMockSocket(overrides?: Partial<AuthenticatedSocket>): AuthenticatedSocket {
+  return {
+    id: "test-socket-id",
+    join: vi.fn(),
+    leave: vi.fn(),
+    emit: vi.fn(),
+    disconnect: vi.fn(),
+    data: {},
+    handshake: {
+      auth: { token: "valid-token" },
+      query: {},
+      headers: {},
+    },
+    ...overrides,
+  } as unknown as AuthenticatedSocket;
+}
+
+function createMockAuthService(): {
+  verifySession: ReturnType<typeof vi.fn>;
+} {
+  return {
+    verifySession: vi.fn().mockResolvedValue({
+      user: { id: "user-123" },
+      session: { id: "session-123" },
+    }),
+  };
+}
+
+function createMockPrismaService(): {
+  workspaceMember: { findFirst: ReturnType<typeof vi.fn> };
+} {
+  return {
+    workspaceMember: {
+      findFirst: vi.fn().mockResolvedValue({
+        userId: "user-123",
+        workspaceId: "workspace-456",
+        role: "MEMBER",
+      }),
+    },
+  };
+}
+
+function createMockSpeechService(): {
+  transcribe: ReturnType<typeof vi.fn>;
+  isSTTAvailable: ReturnType<typeof vi.fn>;
+} {
+  return {
+    transcribe: vi.fn().mockResolvedValue({
+      text: "Hello world",
+      language: "en",
+      durationSeconds: 2.5,
+    } satisfies TranscriptionResult),
+    isSTTAvailable: vi.fn().mockReturnValue(true),
+  };
+}
+
+// ==========================================
+// Tests
+// ==========================================
+
+describe("SpeechGateway", () => {
+  let gateway: SpeechGateway;
+  let mockAuthService: ReturnType<typeof createMockAuthService>;
+  let mockPrismaService: ReturnType<typeof createMockPrismaService>;
+  let mockSpeechService: ReturnType<typeof createMockSpeechService>;
+  let mockConfig: SpeechConfig;
+  let mockClient: AuthenticatedSocket;
+
+  beforeEach(() => {
+    mockAuthService = createMockAuthService();
+    mockPrismaService = createMockPrismaService();
+    mockSpeechService = createMockSpeechService();
+    mockConfig = createMockConfig();
+    mockClient = createMockSocket();
+
+    gateway = new SpeechGateway(
+      mockAuthService as unknown as AuthService,
+      mockPrismaService as unknown as PrismaService,
+      mockSpeechService as unknown as
SpeechService, + mockConfig + ); + + vi.clearAllMocks(); + }); + + // ========================================== + // Authentication + // ========================================== + describe("handleConnection", () => { + it("should authenticate client and populate socket data on valid token", async () => { + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-123", + workspaceId: "workspace-456", + role: "MEMBER", + }); + + await gateway.handleConnection(mockClient); + + expect(mockAuthService.verifySession).toHaveBeenCalledWith("valid-token"); + expect(mockClient.data.userId).toBe("user-123"); + expect(mockClient.data.workspaceId).toBe("workspace-456"); + }); + + it("should disconnect client without token", async () => { + const clientNoToken = createMockSocket({ + handshake: { auth: {}, query: {}, headers: {} }, + } as Partial); + + await gateway.handleConnection(clientNoToken); + + expect(clientNoToken.disconnect).toHaveBeenCalled(); + }); + + it("should disconnect client with invalid token", async () => { + mockAuthService.verifySession.mockResolvedValue(null); + + await gateway.handleConnection(mockClient); + + expect(mockClient.disconnect).toHaveBeenCalled(); + }); + + it("should disconnect client without workspace access", async () => { + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue(null); + + await gateway.handleConnection(mockClient); + + expect(mockClient.disconnect).toHaveBeenCalled(); + }); + + it("should disconnect client when auth throws", async () => { + mockAuthService.verifySession.mockRejectedValue(new Error("Auth failure")); + + await gateway.handleConnection(mockClient); + + expect(mockClient.disconnect).toHaveBeenCalled(); + }); + + it("should extract token from 
handshake.query as fallback", async () => { + const clientQueryToken = createMockSocket({ + handshake: { + auth: {}, + query: { token: "query-token" }, + headers: {}, + }, + } as Partial); + + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-123", + workspaceId: "workspace-456", + role: "MEMBER", + }); + + await gateway.handleConnection(clientQueryToken); + + expect(mockAuthService.verifySession).toHaveBeenCalledWith("query-token"); + }); + }); + + // ========================================== + // start-transcription + // ========================================== + describe("handleStartTranscription", () => { + beforeEach(async () => { + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-123", + workspaceId: "workspace-456", + role: "MEMBER", + }); + await gateway.handleConnection(mockClient); + vi.clearAllMocks(); + }); + + it("should create a transcription session", () => { + gateway.handleStartTranscription(mockClient, { language: "en" }); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-started", + expect.objectContaining({ sessionId: expect.any(String) }) + ); + }); + + it("should create a session with optional language parameter", () => { + gateway.handleStartTranscription(mockClient, { language: "fr" }); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-started", + expect.objectContaining({ sessionId: expect.any(String) }) + ); + }); + + it("should create a session with no options", () => { + gateway.handleStartTranscription(mockClient, {}); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-started", + expect.objectContaining({ sessionId: expect.any(String) }) + ); + }); + + it("should emit error if client is 
not authenticated", () => {
+      const unauthClient = createMockSocket();
+      // Not connected through handleConnection, so no userId set
+
+      gateway.handleStartTranscription(unauthClient, {});
+
+      expect(unauthClient.emit).toHaveBeenCalledWith(
+        "transcription-error",
+        expect.objectContaining({ message: expect.any(String) })
+      );
+    });
+
+    it("should replace existing session if one already exists", () => {
+      gateway.handleStartTranscription(mockClient, {});
+      gateway.handleStartTranscription(mockClient, { language: "de" });
+
+      // Should have emitted transcription-started twice (no error)
+      const startedCalls = (mockClient.emit as ReturnType<typeof vi.fn>).mock.calls.filter(
+        (call: unknown[]) => call[0] === "transcription-started"
+      );
+      expect(startedCalls).toHaveLength(2);
+    });
+  });
+
+  // ==========================================
+  // audio-chunk
+  // ==========================================
+  describe("handleAudioChunk", () => {
+    beforeEach(async () => {
+      mockAuthService.verifySession.mockResolvedValue({
+        user: { id: "user-123" },
+        session: { id: "session-123" },
+      });
+      mockPrismaService.workspaceMember.findFirst.mockResolvedValue({
+        userId: "user-123",
+        workspaceId: "workspace-456",
+        role: "MEMBER",
+      });
+      await gateway.handleConnection(mockClient);
+      vi.clearAllMocks();
+      gateway.handleStartTranscription(mockClient, {});
+      vi.clearAllMocks();
+    });
+
+    it("should accumulate audio data in the session", () => {
+      const chunk = Buffer.from("audio-data-1");
+      gateway.handleAudioChunk(mockClient, chunk);
+
+      // No error emitted
+      const errorCalls = (mockClient.emit as ReturnType<typeof vi.fn>).mock.calls.filter(
+        (call: unknown[]) => call[0] === "transcription-error"
+      );
+      expect(errorCalls).toHaveLength(0);
+    });
+
+    it("should accept Uint8Array data and convert to Buffer", () => {
+      const chunk = new Uint8Array([1, 2, 3, 4]);
+      gateway.handleAudioChunk(mockClient, chunk);
+
+      const errorCalls = (mockClient.emit as ReturnType<typeof vi.fn>).mock.calls.filter(
+        (call: unknown[]) =>
call[0] === "transcription-error" + ); + expect(errorCalls).toHaveLength(0); + }); + + it("should emit error if no active session exists", () => { + const noSessionClient = createMockSocket({ id: "no-session" }); + noSessionClient.data = { userId: "user-123", workspaceId: "workspace-456" }; + + const chunk = Buffer.from("audio-data"); + gateway.handleAudioChunk(noSessionClient, chunk); + + expect(noSessionClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.any(String) }) + ); + }); + + it("should emit error if client is not authenticated", () => { + const unauthClient = createMockSocket({ id: "unauth" }); + // Not authenticated + + const chunk = Buffer.from("audio-data"); + gateway.handleAudioChunk(unauthClient, chunk); + + expect(unauthClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.any(String) }) + ); + }); + + it("should emit error when buffer size exceeds max upload size", () => { + // Set a very small max upload size + const smallConfig = createMockConfig(); + smallConfig.limits.maxUploadSize = 10; + + const limitedGateway = new SpeechGateway( + mockAuthService as unknown as AuthService, + mockPrismaService as unknown as PrismaService, + mockSpeechService as unknown as SpeechService, + smallConfig + ); + + // We need to manually set up the authenticated client in the new gateway + const limitedClient = createMockSocket({ id: "limited-client" }); + limitedClient.data = { userId: "user-123", workspaceId: "workspace-456" }; + + // Start session directly (since handleConnection populates data) + limitedGateway.handleStartTranscription(limitedClient, {}); + vi.clearAllMocks(); + + // Send a chunk that exceeds the limit + const largeChunk = Buffer.alloc(20, "a"); + limitedGateway.handleAudioChunk(limitedClient, largeChunk); + + expect(limitedClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: 
expect.stringContaining("exceeds") }) + ); + }); + + it("should emit error when accumulated buffer size exceeds max upload size", () => { + const smallConfig = createMockConfig(); + smallConfig.limits.maxUploadSize = 15; + + const limitedGateway = new SpeechGateway( + mockAuthService as unknown as AuthService, + mockPrismaService as unknown as PrismaService, + mockSpeechService as unknown as SpeechService, + smallConfig + ); + + const limitedClient = createMockSocket({ id: "limited-client-2" }); + limitedClient.data = { userId: "user-123", workspaceId: "workspace-456" }; + + limitedGateway.handleStartTranscription(limitedClient, {}); + vi.clearAllMocks(); + + // Send two chunks that together exceed the limit + const chunk1 = Buffer.alloc(10, "a"); + const chunk2 = Buffer.alloc(10, "b"); + limitedGateway.handleAudioChunk(limitedClient, chunk1); + limitedGateway.handleAudioChunk(limitedClient, chunk2); + + expect(limitedClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.stringContaining("exceeds") }) + ); + }); + }); + + // ========================================== + // stop-transcription + // ========================================== + describe("handleStopTranscription", () => { + beforeEach(async () => { + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-123", + workspaceId: "workspace-456", + role: "MEMBER", + }); + await gateway.handleConnection(mockClient); + vi.clearAllMocks(); + }); + + it("should transcribe accumulated audio and emit final result", async () => { + gateway.handleStartTranscription(mockClient, { language: "en" }); + + const chunk1 = Buffer.from("audio-part-1"); + const chunk2 = Buffer.from("audio-part-2"); + gateway.handleAudioChunk(mockClient, chunk1); + gateway.handleAudioChunk(mockClient, chunk2); + + vi.clearAllMocks(); + + const 
expectedResult: TranscriptionResult = { + text: "Hello world", + language: "en", + durationSeconds: 2.5, + }; + mockSpeechService.transcribe.mockResolvedValue(expectedResult); + + await gateway.handleStopTranscription(mockClient); + + // Should have called transcribe with concatenated buffer + expect(mockSpeechService.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({}) + ); + + // Should emit transcription-final + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-final", + expect.objectContaining({ text: "Hello world" }) + ); + }); + + it("should pass language option to SpeechService.transcribe", async () => { + gateway.handleStartTranscription(mockClient, { language: "fr" }); + gateway.handleAudioChunk(mockClient, Buffer.from("audio")); + + vi.clearAllMocks(); + + await gateway.handleStopTranscription(mockClient); + + expect(mockSpeechService.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({ language: "fr" }) + ); + }); + + it("should clean up session after stop", async () => { + gateway.handleStartTranscription(mockClient, {}); + gateway.handleAudioChunk(mockClient, Buffer.from("audio")); + + await gateway.handleStopTranscription(mockClient); + + vi.clearAllMocks(); + + // Sending more audio after stop should error (no session) + gateway.handleAudioChunk(mockClient, Buffer.from("more-audio")); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.any(String) }) + ); + }); + + it("should emit transcription-error when transcription fails", async () => { + gateway.handleStartTranscription(mockClient, {}); + gateway.handleAudioChunk(mockClient, Buffer.from("audio")); + + vi.clearAllMocks(); + + mockSpeechService.transcribe.mockRejectedValue(new Error("STT service down")); + + await gateway.handleStopTranscription(mockClient); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-error", + 
expect.objectContaining({ message: expect.stringContaining("STT service down") }) + ); + }); + + it("should emit error if no active session exists", async () => { + await gateway.handleStopTranscription(mockClient); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.any(String) }) + ); + }); + + it("should emit error if client is not authenticated", async () => { + const unauthClient = createMockSocket({ id: "unauth-stop" }); + + await gateway.handleStopTranscription(unauthClient); + + expect(unauthClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.any(String) }) + ); + }); + + it("should emit error when stopping with no audio chunks received", async () => { + gateway.handleStartTranscription(mockClient, {}); + + vi.clearAllMocks(); + + await gateway.handleStopTranscription(mockClient); + + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.stringContaining("No audio") }) + ); + }); + }); + + // ========================================== + // handleDisconnect + // ========================================== + describe("handleDisconnect", () => { + beforeEach(async () => { + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-123", + workspaceId: "workspace-456", + role: "MEMBER", + }); + await gateway.handleConnection(mockClient); + vi.clearAllMocks(); + }); + + it("should clean up active session on disconnect", () => { + gateway.handleStartTranscription(mockClient, {}); + gateway.handleAudioChunk(mockClient, Buffer.from("audio")); + + gateway.handleDisconnect(mockClient); + + // Session should be gone. Verify by trying to add a chunk to a new + // socket with the same ID (should error since session was cleaned up). 
+ const newClient = createMockSocket({ id: mockClient.id }); + newClient.data = { userId: "user-123", workspaceId: "workspace-456" }; + + gateway.handleAudioChunk(newClient, Buffer.from("more")); + + expect(newClient.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.any(String) }) + ); + }); + + it("should not throw when disconnecting client without active session", () => { + expect(() => gateway.handleDisconnect(mockClient)).not.toThrow(); + }); + + it("should not throw when disconnecting unauthenticated client", () => { + const unauthClient = createMockSocket({ id: "unauth-disconnect" }); + expect(() => gateway.handleDisconnect(unauthClient)).not.toThrow(); + }); + }); + + // ========================================== + // Edge cases + // ========================================== + describe("edge cases", () => { + beforeEach(async () => { + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-123" }, + session: { id: "session-123" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-123", + workspaceId: "workspace-456", + role: "MEMBER", + }); + await gateway.handleConnection(mockClient); + vi.clearAllMocks(); + }); + + it("should handle multiple start-stop cycles for the same client", async () => { + // First cycle + gateway.handleStartTranscription(mockClient, {}); + gateway.handleAudioChunk(mockClient, Buffer.from("cycle-1")); + await gateway.handleStopTranscription(mockClient); + + vi.clearAllMocks(); + + // Second cycle + gateway.handleStartTranscription(mockClient, { language: "de" }); + gateway.handleAudioChunk(mockClient, Buffer.from("cycle-2")); + await gateway.handleStopTranscription(mockClient); + + expect(mockSpeechService.transcribe).toHaveBeenCalledTimes(1); + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-final", + expect.objectContaining({ text: "Hello world" }) + ); + }); + + it("should isolate sessions between different 
clients", async () => { + const client2 = createMockSocket({ id: "client-2" }); + client2.data = { userId: "user-456", workspaceId: "workspace-789" }; + + // Client 2 also needs to be "connected" + mockAuthService.verifySession.mockResolvedValue({ + user: { id: "user-456" }, + session: { id: "session-456" }, + }); + mockPrismaService.workspaceMember.findFirst.mockResolvedValue({ + userId: "user-456", + workspaceId: "workspace-789", + role: "MEMBER", + }); + await gateway.handleConnection(client2); + vi.clearAllMocks(); + + // Start sessions for both clients + gateway.handleStartTranscription(mockClient, {}); + gateway.handleStartTranscription(client2, {}); + + // Send audio to client 1 only + gateway.handleAudioChunk(mockClient, Buffer.from("audio-for-client-1")); + + // Stop client 2 (no audio) + await gateway.handleStopTranscription(client2); + + // Client 2 should get an error (no audio received) + expect(client2.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ message: expect.stringContaining("No audio") }) + ); + + vi.clearAllMocks(); + + // Stop client 1 (has audio) -- should succeed + await gateway.handleStopTranscription(mockClient); + expect(mockSpeechService.transcribe).toHaveBeenCalled(); + expect(mockClient.emit).toHaveBeenCalledWith( + "transcription-final", + expect.objectContaining({ text: "Hello world" }) + ); + }); + }); +}); diff --git a/apps/api/src/speech/speech.gateway.ts b/apps/api/src/speech/speech.gateway.ts new file mode 100644 index 0000000..907ec57 --- /dev/null +++ b/apps/api/src/speech/speech.gateway.ts @@ -0,0 +1,366 @@ +/** + * SpeechGateway + * + * WebSocket gateway for real-time streaming transcription. + * Uses a separate `/speech` namespace from the main WebSocket gateway. + * + * Protocol: + * 1. Client connects with auth token in handshake + * 2. Client emits `start-transcription` with optional { language } + * 3. Client streams audio via `audio-chunk` events (Buffer/Uint8Array) + * 4. 
Client emits `stop-transcription` to finalize + * 5. Server responds with `transcription-final` containing the result + * + * Session management: + * - One active transcription session per client + * - Audio chunks accumulated in memory (Buffer array) + * - On stop: chunks concatenated and sent to SpeechService.transcribe() + * - Sessions cleaned up on disconnect + * + * Rate limiting: + * - Total accumulated audio size is capped by config limits.maxUploadSize + * + * Issue #397 + */ + +import { + WebSocketGateway as WSGateway, + WebSocketServer, + SubscribeMessage, + OnGatewayConnection, + OnGatewayDisconnect, +} from "@nestjs/websockets"; +import { Logger, Inject } from "@nestjs/common"; +import { Server, Socket } from "socket.io"; +import { AuthService } from "../auth/auth.service"; +import { PrismaService } from "../prisma/prisma.service"; +import { SpeechService } from "./speech.service"; +import { speechConfig, type SpeechConfig } from "./speech.config"; + +// ========================================== +// Types +// ========================================== + +interface AuthenticatedSocket extends Socket { + data: { + userId?: string; + workspaceId?: string; + }; +} + +interface TranscriptionSession { + chunks: Buffer[]; + totalSize: number; + language: string | undefined; + startedAt: Date; +} + +interface StartTranscriptionPayload { + language?: string; +} + +// ========================================== +// Gateway +// ========================================== + +@WSGateway({ + namespace: "/speech", + cors: { + origin: process.env.WEB_URL ?? 
"http://localhost:3000", + credentials: true, + }, +}) +export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { + @WebSocketServer() + server!: Server; + + private readonly logger = new Logger(SpeechGateway.name); + private readonly sessions = new Map(); + private readonly CONNECTION_TIMEOUT_MS = 5000; + + constructor( + private readonly authService: AuthService, + private readonly prisma: PrismaService, + private readonly speechService: SpeechService, + @Inject(speechConfig.KEY) + private readonly config: SpeechConfig + ) {} + + // ========================================== + // Connection lifecycle + // ========================================== + + /** + * Authenticate client on connection using the same pattern as the main WebSocket gateway. + * Extracts token from handshake, verifies session, and checks workspace membership. + */ + async handleConnection(client: Socket): Promise { + const authenticatedClient = client as AuthenticatedSocket; + + const timeoutId = setTimeout(() => { + if (!authenticatedClient.data.userId) { + this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`); + authenticatedClient.disconnect(); + } + }, this.CONNECTION_TIMEOUT_MS); + + try { + const token = this.extractTokenFromHandshake(authenticatedClient); + + if (!token) { + this.logger.warn(`Client ${authenticatedClient.id} connected without token`); + authenticatedClient.disconnect(); + clearTimeout(timeoutId); + return; + } + + const sessionData = await this.authService.verifySession(token); + + if (!sessionData) { + this.logger.warn(`Client ${authenticatedClient.id} has invalid token`); + authenticatedClient.disconnect(); + clearTimeout(timeoutId); + return; + } + + const user = sessionData.user as { id: string }; + const userId = user.id; + + const workspaceMembership = await this.prisma.workspaceMember.findFirst({ + where: { userId }, + select: { workspaceId: true, userId: true, role: true }, + }); + + if 
(!workspaceMembership) { + this.logger.warn(`User ${userId} has no workspace access`); + authenticatedClient.disconnect(); + clearTimeout(timeoutId); + return; + } + + authenticatedClient.data.userId = userId; + authenticatedClient.data.workspaceId = workspaceMembership.workspaceId; + + clearTimeout(timeoutId); + this.logger.log( + `Speech client ${authenticatedClient.id} connected (user: ${userId}, workspace: ${workspaceMembership.workspaceId})` + ); + } catch (error) { + clearTimeout(timeoutId); + this.logger.error( + `Authentication failed for speech client ${authenticatedClient.id}:`, + error instanceof Error ? error.message : "Unknown error" + ); + authenticatedClient.disconnect(); + } + } + + /** + * Clean up transcription session on client disconnect. + */ + handleDisconnect(client: Socket): void { + const authenticatedClient = client as AuthenticatedSocket; + const sessionId = authenticatedClient.id; + + if (this.sessions.has(sessionId)) { + this.sessions.delete(sessionId); + this.logger.log(`Cleaned up transcription session for client ${sessionId}`); + } + + this.logger.debug(`Speech client ${sessionId} disconnected`); + } + + // ========================================== + // Transcription events + // ========================================== + + /** + * Start a new transcription session for the client. + * Replaces any existing session for this client. + * + * @param client - The connected socket client + * @param payload - Optional parameters: { language?: string } + */ + @SubscribeMessage("start-transcription") + handleStartTranscription(client: Socket, payload: StartTranscriptionPayload): void { + const authenticatedClient = client as AuthenticatedSocket; + + if (!authenticatedClient.data.userId) { + authenticatedClient.emit("transcription-error", { + message: "Not authenticated. 
Connect with a valid token.", + }); + return; + } + + const sessionId = authenticatedClient.id; + + // Clean up any existing session for this client + if (this.sessions.has(sessionId)) { + this.sessions.delete(sessionId); + this.logger.debug(`Replaced existing session for client ${sessionId}`); + } + + const language = payload.language; + + const session: TranscriptionSession = { + chunks: [], + totalSize: 0, + language, + startedAt: new Date(), + }; + + this.sessions.set(sessionId, session); + + authenticatedClient.emit("transcription-started", { + sessionId, + language, + }); + + this.logger.debug( + `Transcription session started for client ${sessionId} (language: ${language ?? "auto"})` + ); + } + + /** + * Receive an audio chunk and accumulate it in the active session. + * Enforces maximum buffer size from configuration. + * + * @param client - The connected socket client + * @param data - Audio data as Buffer or Uint8Array + */ + @SubscribeMessage("audio-chunk") + handleAudioChunk(client: Socket, data: Buffer | Uint8Array): void { + const authenticatedClient = client as AuthenticatedSocket; + + if (!authenticatedClient.data.userId) { + authenticatedClient.emit("transcription-error", { + message: "Not authenticated. Connect with a valid token.", + }); + return; + } + + const sessionId = authenticatedClient.id; + const session = this.sessions.get(sessionId); + + if (!session) { + authenticatedClient.emit("transcription-error", { + message: "No active transcription session. Send start-transcription first.", + }); + return; + } + + const chunk = Buffer.isBuffer(data) ? 
data : Buffer.from(data); + const newTotalSize = session.totalSize + chunk.length; + + if (newTotalSize > this.config.limits.maxUploadSize) { + authenticatedClient.emit("transcription-error", { + message: `Audio buffer size (${String(newTotalSize)} bytes) exceeds maximum allowed size (${String(this.config.limits.maxUploadSize)} bytes).`, + }); + // Clean up the session on overflow + this.sessions.delete(sessionId); + return; + } + + session.chunks.push(chunk); + session.totalSize = newTotalSize; + } + + /** + * Stop the transcription session, concatenate audio chunks, and transcribe. + * Emits `transcription-final` on success or `transcription-error` on failure. + * + * @param client - The connected socket client + */ + @SubscribeMessage("stop-transcription") + async handleStopTranscription(client: Socket): Promise { + const authenticatedClient = client as AuthenticatedSocket; + + if (!authenticatedClient.data.userId) { + authenticatedClient.emit("transcription-error", { + message: "Not authenticated. Connect with a valid token.", + }); + return; + } + + const sessionId = authenticatedClient.id; + const session = this.sessions.get(sessionId); + + if (!session) { + authenticatedClient.emit("transcription-error", { + message: "No active transcription session. Send start-transcription first.", + }); + return; + } + + // Always remove session before processing (prevents double-stop) + this.sessions.delete(sessionId); + + if (session.chunks.length === 0) { + authenticatedClient.emit("transcription-error", { + message: "No audio data received. Send audio-chunk events before stopping.", + }); + return; + } + + try { + const audioBuffer = Buffer.concat(session.chunks); + const options: { language?: string } = {}; + if (session.language) { + options.language = session.language; + } + + this.logger.debug( + `Transcribing ${String(audioBuffer.length)} bytes for client ${sessionId} (language: ${session.language ?? 
"auto"})` + ); + + const result = await this.speechService.transcribe(audioBuffer, options); + + authenticatedClient.emit("transcription-final", { + text: result.text, + language: result.language, + durationSeconds: result.durationSeconds, + confidence: result.confidence, + segments: result.segments, + }); + + this.logger.debug(`Transcription complete for client ${sessionId}: "${result.text}"`); + } catch (error: unknown) { + const message = error instanceof Error ? error.message : String(error); + this.logger.error(`Transcription failed for client ${sessionId}: ${message}`); + authenticatedClient.emit("transcription-error", { + message: `Transcription failed: ${message}`, + }); + } + } + + // ========================================== + // Private helpers + // ========================================== + + /** + * Extract authentication token from Socket.IO handshake. + * Checks auth.token, query.token, and Authorization header (in that order). + */ + private extractTokenFromHandshake(client: Socket): string | undefined { + const authToken = client.handshake.auth.token as unknown; + if (typeof authToken === "string" && authToken.length > 0) { + return authToken; + } + + const queryToken = client.handshake.query.token as unknown; + if (typeof queryToken === "string" && queryToken.length > 0) { + return queryToken; + } + + const authHeader = client.handshake.headers.authorization as unknown; + if (typeof authHeader === "string") { + const parts = authHeader.split(" "); + const [type, token] = parts; + if (type === "Bearer" && token) { + return token; + } + } + + return undefined; + } +} diff --git a/apps/api/src/speech/speech.module.ts b/apps/api/src/speech/speech.module.ts index d2151ef..42978f9 100644 --- a/apps/api/src/speech/speech.module.ts +++ b/apps/api/src/speech/speech.module.ts @@ -11,15 +11,18 @@ * * Imports: * - ConfigModule.forFeature(speechConfig) for speech configuration + * - AuthModule for WebSocket authentication + * - PrismaModule for workspace 
membership queries * * Providers: * - SpeechService: High-level speech operations with provider selection + * - SpeechGateway: WebSocket gateway for streaming transcription (Issue #397) * - TTS_PROVIDERS: Map populated by factory based on config * * Exports: * - SpeechService for use by other modules (e.g., controllers, brain) * - * Issue #389, #390, #391 + * Issue #389, #390, #391, #397 */ import { Module, type OnModuleInit, Logger } from "@nestjs/common"; @@ -32,15 +35,19 @@ import { } from "./speech.config"; import { SpeechService } from "./speech.service"; import { SpeechController } from "./speech.controller"; +import { SpeechGateway } from "./speech.gateway"; import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; import { SpeachesSttProvider } from "./providers/speaches-stt.provider"; import { createTTSProviders } from "./providers/tts-provider.factory"; +import { AuthModule } from "../auth/auth.module"; +import { PrismaModule } from "../prisma/prisma.module"; @Module({ - imports: [ConfigModule.forFeature(speechConfig)], + imports: [ConfigModule.forFeature(speechConfig), AuthModule, PrismaModule], controllers: [SpeechController], providers: [ SpeechService, + SpeechGateway, // STT provider: conditionally register SpeachesSttProvider when STT is enabled ...(isSttEnabled() ? [ From 74d6c1092ea8625a1ec8dbc8c3565a5281543a22 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:05:39 -0600 Subject: [PATCH 14/19] feat(#403): add audio playback component for TTS output Implements AudioPlayer inline component with play/pause, progress bar, speed control (0.5x-2x), download, and duration display. Adds TextToSpeechButton "Read aloud" component that synthesizes text via the speech API and integrates AudioPlayer for playback. Includes useTextToSpeech hook with API integration, audio caching, and playback state management. All 32 tests passing. 
Co-Authored-By: Claude Opus 4.6 --- .../components/speech/AudioPlayer.test.tsx | 178 ++++++++ .../web/src/components/speech/AudioPlayer.tsx | 250 +++++++++++ .../speech/AudioVisualizer.test.tsx | 70 +++ .../src/components/speech/AudioVisualizer.tsx | 87 ++++ .../speech/TextToSpeechButton.test.tsx | 218 ++++++++++ .../components/speech/TextToSpeechButton.tsx | 126 ++++++ .../src/components/speech/VoiceInput.test.tsx | 228 ++++++++++ apps/web/src/components/speech/VoiceInput.tsx | 146 +++++++ apps/web/src/components/speech/index.ts | 8 + apps/web/src/hooks/useTextToSpeech.test.ts | 285 ++++++++++++ apps/web/src/hooks/useTextToSpeech.ts | 239 ++++++++++ apps/web/src/hooks/useVoiceInput.test.ts | 362 ++++++++++++++++ apps/web/src/hooks/useVoiceInput.ts | 409 ++++++++++++++++++ apps/web/src/lib/api/speech.ts | 58 +++ 14 files changed, 2664 insertions(+) create mode 100644 apps/web/src/components/speech/AudioPlayer.test.tsx create mode 100644 apps/web/src/components/speech/AudioPlayer.tsx create mode 100644 apps/web/src/components/speech/AudioVisualizer.test.tsx create mode 100644 apps/web/src/components/speech/AudioVisualizer.tsx create mode 100644 apps/web/src/components/speech/TextToSpeechButton.test.tsx create mode 100644 apps/web/src/components/speech/TextToSpeechButton.tsx create mode 100644 apps/web/src/components/speech/VoiceInput.test.tsx create mode 100644 apps/web/src/components/speech/VoiceInput.tsx create mode 100644 apps/web/src/components/speech/index.ts create mode 100644 apps/web/src/hooks/useTextToSpeech.test.ts create mode 100644 apps/web/src/hooks/useTextToSpeech.ts create mode 100644 apps/web/src/hooks/useVoiceInput.test.ts create mode 100644 apps/web/src/hooks/useVoiceInput.ts create mode 100644 apps/web/src/lib/api/speech.ts diff --git a/apps/web/src/components/speech/AudioPlayer.test.tsx b/apps/web/src/components/speech/AudioPlayer.test.tsx new file mode 100644 index 0000000..f185b09 --- /dev/null +++ 
b/apps/web/src/components/speech/AudioPlayer.test.tsx @@ -0,0 +1,178 @@ +/** + * @file AudioPlayer.test.tsx + * @description Tests for the AudioPlayer component that provides inline TTS audio playback + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { AudioPlayer } from "./AudioPlayer"; + +// Mock HTMLAudioElement +class MockAudio { + src = ""; + currentTime = 0; + duration = 60; + paused = true; + playbackRate = 1; + volume = 1; + onended: (() => void) | null = null; + ontimeupdate: (() => void) | null = null; + onloadedmetadata: (() => void) | null = null; + onerror: ((e: unknown) => void) | null = null; + + play(): Promise { + this.paused = false; + return Promise.resolve(); + } + + pause(): void { + this.paused = true; + } + + addEventListener(event: string, handler: () => void): void { + if (event === "ended") this.onended = handler; + if (event === "timeupdate") this.ontimeupdate = handler; + if (event === "loadedmetadata") this.onloadedmetadata = handler; + if (event === "error") this.onerror = handler; + } + + removeEventListener(): void { + // no-op for tests + } +} + +vi.stubGlobal("Audio", MockAudio); + +describe("AudioPlayer", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + describe("rendering", () => { + it("should render play button", () => { + render(); + + const playButton = screen.getByRole("button", { name: "Play audio" }); + expect(playButton).toBeInTheDocument(); + }); + + it("should render download button", () => { + render(); + + const downloadButton = screen.getByRole("button", { name: /download/i }); + expect(downloadButton).toBeInTheDocument(); + }); + + it("should render time display showing 0:00", () => { + render(); + + expect(screen.getByText("0:00")).toBeInTheDocument(); + }); + + it("should render speed control", () => { + render(); + + const speedButton = screen.getByRole("button", 
{ name: "Playback speed" }); + expect(speedButton).toBeInTheDocument(); + }); + + it("should render progress bar", () => { + render(); + + const progressBar = screen.getByRole("progressbar"); + expect(progressBar).toBeInTheDocument(); + }); + + it("should not render when src is null", () => { + const { container } = render(); + + expect(container.firstChild).toBeNull(); + }); + }); + + describe("play/pause", () => { + it("should toggle to pause button when playing", async () => { + const user = userEvent.setup(); + render(); + + const playButton = screen.getByRole("button", { name: "Play audio" }); + await user.click(playButton); + + expect(screen.getByRole("button", { name: "Pause audio" })).toBeInTheDocument(); + }); + }); + + describe("speed control", () => { + it("should cycle through speed options on click", async () => { + const user = userEvent.setup(); + render(); + + const speedButton = screen.getByRole("button", { name: "Playback speed" }); + + // Default should be 1x + expect(speedButton).toHaveTextContent("1x"); + + // Click to go to 1.5x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("1.5x"); + + // Click to go to 2x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("2x"); + + // Click to go to 0.5x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("0.5x"); + + // Click to go back to 1x + await user.click(speedButton); + expect(speedButton).toHaveTextContent("1x"); + }); + }); + + describe("accessibility", () => { + it("should have proper aria labels on controls", () => { + render(); + + expect(screen.getByRole("button", { name: "Play audio" })).toBeInTheDocument(); + expect(screen.getByRole("button", { name: /download/i })).toBeInTheDocument(); + expect(screen.getByRole("button", { name: "Playback speed" })).toBeInTheDocument(); + expect(screen.getByRole("progressbar")).toHaveAttribute("aria-label"); + }); + + it("should have region role on the player container", () => { + 
render(); + + expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument(); + }); + }); + + describe("design", () => { + it("should not use aggressive red colors", () => { + const { container } = render(); + + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = el.className; + if (typeof className === "string") { + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + } + }); + }); + }); + + describe("callbacks", () => { + it("should call onPlayStateChange when play state changes", async () => { + const onPlayStateChange = vi.fn(); + const user = userEvent.setup(); + + render(); + + const playButton = screen.getByRole("button", { name: "Play audio" }); + await user.click(playButton); + + expect(onPlayStateChange).toHaveBeenCalledWith(true); + }); + }); +}); diff --git a/apps/web/src/components/speech/AudioPlayer.tsx b/apps/web/src/components/speech/AudioPlayer.tsx new file mode 100644 index 0000000..d4a9a50 --- /dev/null +++ b/apps/web/src/components/speech/AudioPlayer.tsx @@ -0,0 +1,250 @@ +/** + * AudioPlayer Component + * Inline audio player for TTS content with play/pause, progress, + * speed control, download, and duration display. + * + * Follows PDA-friendly design: no aggressive colors, calm interface. + */ + +import { useState, useRef, useEffect, useCallback } from "react"; +import type { ReactElement } from "react"; + +/** Playback speed options */ +const SPEED_OPTIONS = [1, 1.5, 2, 0.5] as const; + +export interface AudioPlayerProps { + /** URL of the audio to play (blob URL or HTTP URL). If null, nothing renders. 
*/ + src: string | null; + /** Whether to auto-play when src changes */ + autoPlay?: boolean; + /** Callback when play state changes */ + onPlayStateChange?: (isPlaying: boolean) => void; + /** Optional className for the container */ + className?: string; +} + +/** + * Format seconds into M:SS display + */ +function formatTime(seconds: number): string { + if (!isFinite(seconds) || seconds < 0) return "0:00"; + const mins = Math.floor(seconds / 60); + const secs = Math.floor(seconds % 60); + return `${String(mins)}:${String(secs).padStart(2, "0")}`; +} + +/** + * AudioPlayer displays an inline audio player with controls for + * play/pause, progress tracking, speed adjustment, and download. + */ +export function AudioPlayer({ + src, + autoPlay = false, + onPlayStateChange, + className = "", +}: AudioPlayerProps): ReactElement | null { + const [isPlaying, setIsPlaying] = useState(false); + const [currentTime, setCurrentTime] = useState(0); + const [duration, setDuration] = useState(0); + const [speedIndex, setSpeedIndex] = useState(0); + + const audioRef = useRef(null); + + /** + * Set up audio element when src changes + */ + useEffect((): (() => void) | undefined => { + if (!src) return undefined; + + const audio = new Audio(src); + audioRef.current = audio; + + const onLoadedMetadata = (): void => { + if (isFinite(audio.duration)) { + setDuration(audio.duration); + } + }; + + const onTimeUpdate = (): void => { + setCurrentTime(audio.currentTime); + }; + + const onEnded = (): void => { + setIsPlaying(false); + setCurrentTime(0); + onPlayStateChange?.(false); + }; + + audio.addEventListener("loadedmetadata", onLoadedMetadata); + audio.addEventListener("timeupdate", onTimeUpdate); + audio.addEventListener("ended", onEnded); + + if (autoPlay) { + void audio.play().then(() => { + setIsPlaying(true); + onPlayStateChange?.(true); + }); + } + + return (): void => { + audio.pause(); + audio.removeEventListener("loadedmetadata", onLoadedMetadata); + 
audio.removeEventListener("timeupdate", onTimeUpdate); + audio.removeEventListener("ended", onEnded); + audioRef.current = null; + }; + }, [src, autoPlay, onPlayStateChange]); + + /** + * Toggle play/pause + */ + const togglePlayPause = useCallback(async (): Promise => { + const audio = audioRef.current; + if (!audio) return; + + if (isPlaying) { + audio.pause(); + setIsPlaying(false); + onPlayStateChange?.(false); + } else { + await audio.play(); + setIsPlaying(true); + onPlayStateChange?.(true); + } + }, [isPlaying, onPlayStateChange]); + + /** + * Cycle through speed options + */ + const cycleSpeed = useCallback((): void => { + const nextIndex = (speedIndex + 1) % SPEED_OPTIONS.length; + setSpeedIndex(nextIndex); + + const audio = audioRef.current; + if (audio) { + audio.playbackRate = SPEED_OPTIONS[nextIndex] ?? 1; + } + }, [speedIndex]); + + /** + * Handle progress bar click for seeking + */ + const handleProgressClick = useCallback( + (event: React.MouseEvent): void => { + const audio = audioRef.current; + if (!audio || !duration) return; + + const rect = event.currentTarget.getBoundingClientRect(); + const clickX = event.clientX - rect.left; + const fraction = clickX / rect.width; + audio.currentTime = fraction * duration; + setCurrentTime(audio.currentTime); + }, + [duration] + ); + + /** + * Handle download + */ + const handleDownload = useCallback((): void => { + if (!src) return; + + const link = document.createElement("a"); + link.href = src; + link.download = "speech-audio.mp3"; + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + }, [src]); + + // Don't render if no source + if (!src) return null; + + const progress = duration > 0 ? (currentTime / duration) * 100 : 0; + const currentSpeed = SPEED_OPTIONS[speedIndex] ?? 1; + + return ( +
+ {/* Play/Pause Button */} + + + {/* Time Display */} + + {formatTime(currentTime)} + {duration > 0 && / {formatTime(duration)}} + + + {/* Progress Bar */} +
+
+
+ + {/* Speed Control */} + + + {/* Download Button */} + +
+ ); +} + +export default AudioPlayer; diff --git a/apps/web/src/components/speech/AudioVisualizer.test.tsx b/apps/web/src/components/speech/AudioVisualizer.test.tsx new file mode 100644 index 0000000..6132f7e --- /dev/null +++ b/apps/web/src/components/speech/AudioVisualizer.test.tsx @@ -0,0 +1,70 @@ +import { describe, it, expect } from "vitest"; +import { render, screen } from "@testing-library/react"; +import { AudioVisualizer } from "./AudioVisualizer"; + +describe("AudioVisualizer", (): void => { + it("should render the visualizer container", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + expect(container).toBeInTheDocument(); + }); + + it("should render visualization bars", (): void => { + render(); + + const bars = screen.getAllByTestId("visualizer-bar"); + expect(bars.length).toBeGreaterThan(0); + }); + + it("should show inactive state when not active", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + expect(container).toBeInTheDocument(); + // Bars should be at minimum height when inactive + const bars = screen.getAllByTestId("visualizer-bar"); + bars.forEach((bar) => { + const style = bar.getAttribute("style"); + expect(style).toContain("height"); + }); + }); + + it("should reflect audio level in bar heights when active", (): void => { + render(); + + const bars = screen.getAllByTestId("visualizer-bar"); + // At least one bar should have non-minimal height + const hasActiveBars = bars.some((bar) => { + const style = bar.getAttribute("style") ?? ""; + const heightMatch = /height:\s*(\d+)/.exec(style); + return heightMatch?.[1] ? 
parseInt(heightMatch[1], 10) > 4 : false; + }); + expect(hasActiveBars).toBe(true); + }); + + it("should use calm colors (no aggressive reds)", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = (el as HTMLElement).className; + expect(className).not.toMatch(/bg-red-|text-red-/); + }); + }); + + it("should accept custom className", (): void => { + render(); + + const container = screen.getByTestId("audio-visualizer"); + expect(container.className).toContain("custom-class"); + }); + + it("should render with configurable bar count", (): void => { + render(); + + const bars = screen.getAllByTestId("visualizer-bar"); + expect(bars).toHaveLength(8); + }); +}); diff --git a/apps/web/src/components/speech/AudioVisualizer.tsx b/apps/web/src/components/speech/AudioVisualizer.tsx new file mode 100644 index 0000000..e215fd0 --- /dev/null +++ b/apps/web/src/components/speech/AudioVisualizer.tsx @@ -0,0 +1,87 @@ +/** + * AudioVisualizer component + * + * Displays a simple audio level visualization using bars. + * Uses the Web Audio API's AnalyserNode data (passed as audioLevel) + * to show microphone input levels during recording. + * + * Design: Calm, non-aggressive colors following PDA-friendly guidelines. + */ + +import { useMemo } from "react"; + +export interface AudioVisualizerProps { + /** Current audio level (0-1) */ + audioLevel: number; + /** Whether the visualizer is actively listening */ + isActive: boolean; + /** Number of bars to display (default: 5) */ + barCount?: number; + /** Additional CSS classes */ + className?: string; +} + +/** + * Generate bar heights based on audio level. + * Creates a natural-looking wave pattern where center bars are taller. 
+ */ +function generateBarHeights(level: number, count: number): number[] { + const heights: number[] = []; + const center = (count - 1) / 2; + + for (let i = 0; i < count; i++) { + // Distance from center (0-1) + const distFromCenter = Math.abs(i - center) / center; + // Center bars are taller, edge bars shorter + const multiplier = 1 - distFromCenter * 0.5; + // Min height 4px, max height 24px when active + const minHeight = 4; + const maxHeight = 24; + const height = minHeight + level * (maxHeight - minHeight) * multiplier; + heights.push(Math.round(height)); + } + + return heights; +} + +/** + * Audio level visualizer with animated bars. + * Shows microphone input levels during voice recording. + */ +export function AudioVisualizer({ + audioLevel, + isActive, + barCount = 5, + className = "", +}: AudioVisualizerProps): React.JSX.Element { + const barHeights = useMemo(() => { + if (!isActive) { + return Array.from({ length: barCount }, () => 4); + } + return generateBarHeights(audioLevel, barCount); + }, [audioLevel, isActive, barCount]); + + return ( +
+ {barHeights.map((height, index) => ( +
+ ))} +
+ ); +} diff --git a/apps/web/src/components/speech/TextToSpeechButton.test.tsx b/apps/web/src/components/speech/TextToSpeechButton.test.tsx new file mode 100644 index 0000000..cd265c3 --- /dev/null +++ b/apps/web/src/components/speech/TextToSpeechButton.test.tsx @@ -0,0 +1,218 @@ +/** + * @file TextToSpeechButton.test.tsx + * @description Tests for the TextToSpeechButton "Read aloud" component + */ + +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { TextToSpeechButton } from "./TextToSpeechButton"; + +// Mock the useTextToSpeech hook +const mockSynthesize = vi.fn(); +const mockPlay = vi.fn(); +const mockPause = vi.fn(); +const mockStop = vi.fn(); + +vi.mock("@/hooks/useTextToSpeech", () => ({ + useTextToSpeech: vi.fn(() => ({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: false, + error: null, + isPlaying: false, + duration: 0, + currentTime: 0, + })), +})); + +// Import after mocking +import { useTextToSpeech } from "@/hooks/useTextToSpeech"; + +const mockUseTextToSpeech = useTextToSpeech as ReturnType; + +// Mock HTMLAudioElement for AudioPlayer used inside TextToSpeechButton +class MockAudio { + src = ""; + currentTime = 0; + duration = 60; + paused = true; + playbackRate = 1; + volume = 1; + onended: (() => void) | null = null; + ontimeupdate: (() => void) | null = null; + onloadedmetadata: (() => void) | null = null; + onerror: ((e: unknown) => void) | null = null; + + play(): Promise { + this.paused = false; + return Promise.resolve(); + } + + pause(): void { + this.paused = true; + } + + addEventListener(): void { + // no-op + } + + removeEventListener(): void { + // no-op + } +} + +vi.stubGlobal("Audio", MockAudio); + +describe("TextToSpeechButton", () => { + beforeEach(() => { + vi.clearAllMocks(); + mockUseTextToSpeech.mockReturnValue({ + synthesize: 
mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: false, + error: null, + isPlaying: false, + duration: 0, + currentTime: 0, + }); + }); + + describe("rendering", () => { + it("should render a read aloud button", () => { + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + expect(button).toBeInTheDocument(); + }); + + it("should not render AudioPlayer initially when no audio is synthesized", () => { + render(); + + expect(screen.queryByRole("region", { name: /audio player/i })).not.toBeInTheDocument(); + }); + }); + + describe("click behavior", () => { + it("should call synthesize with text on click", async () => { + const user = userEvent.setup(); + mockSynthesize.mockResolvedValueOnce(undefined); + + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + await user.click(button); + + expect(mockSynthesize).toHaveBeenCalledWith("Hello world", undefined); + }); + + it("should pass voice and tier options when provided", async () => { + const user = userEvent.setup(); + mockSynthesize.mockResolvedValueOnce(undefined); + + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + await user.click(button); + + expect(mockSynthesize).toHaveBeenCalledWith("Hello", { + voice: "alloy", + tier: "premium", + }); + }); + }); + + describe("loading state", () => { + it("should show loading indicator while synthesizing", () => { + mockUseTextToSpeech.mockReturnValue({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: true, + error: null, + isPlaying: false, + duration: 0, + currentTime: 0, + }); + + render(); + + const button = screen.getByRole("button", { name: /synthesizing/i }); + expect(button).toBeInTheDocument(); + expect(button).toBeDisabled(); + }); + }); + + describe("audio player integration", () => { + it("should show AudioPlayer when audio is available", () => { 
+ mockUseTextToSpeech.mockReturnValue({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: "blob:mock-url", + isLoading: false, + error: null, + isPlaying: false, + duration: 30, + currentTime: 0, + }); + + render(); + + expect(screen.getByRole("region", { name: /audio player/i })).toBeInTheDocument(); + }); + }); + + describe("error state", () => { + it("should display error message when synthesis fails", () => { + mockUseTextToSpeech.mockReturnValue({ + synthesize: mockSynthesize, + play: mockPlay, + pause: mockPause, + stop: mockStop, + audioUrl: null, + isLoading: false, + error: "Synthesis failed", + isPlaying: false, + duration: 0, + currentTime: 0, + }); + + render(); + + expect(screen.getByText(/synthesis failed/i)).toBeInTheDocument(); + }); + }); + + describe("accessibility", () => { + it("should have proper aria label on button", () => { + render(); + + const button = screen.getByRole("button", { name: /read aloud/i }); + expect(button).toBeInTheDocument(); + }); + }); + + describe("design", () => { + it("should not use aggressive colors", () => { + const { container } = render(); + + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = el.className; + if (typeof className === "string") { + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + } + }); + }); + }); +}); diff --git a/apps/web/src/components/speech/TextToSpeechButton.tsx b/apps/web/src/components/speech/TextToSpeechButton.tsx new file mode 100644 index 0000000..a8f97f7 --- /dev/null +++ b/apps/web/src/components/speech/TextToSpeechButton.tsx @@ -0,0 +1,126 @@ +/** + * TextToSpeechButton Component + * "Read aloud" button that synthesizes text and plays it via AudioPlayer. + * + * Accepts text as a prop, with optional voice and tier selection. + * Shows loading state during synthesis and integrates AudioPlayer for playback. 
+ * + * Follows PDA-friendly design: no aggressive colors, calm interface. + */ + +import { useCallback } from "react"; +import type { ReactElement } from "react"; +import { useTextToSpeech } from "@/hooks/useTextToSpeech"; +import type { SynthesizeOptions } from "@/hooks/useTextToSpeech"; +import { AudioPlayer } from "./AudioPlayer"; + +export interface TextToSpeechButtonProps { + /** The text to synthesize to speech */ + text: string; + /** Optional voice ID to use */ + voice?: string; + /** Optional tier (e.g. "standard", "premium") */ + tier?: string; + /** Optional className for the container */ + className?: string; +} + +/** + * TextToSpeechButton provides a "Read aloud" button that synthesizes + * the given text and displays an AudioPlayer for playback control. + */ +export function TextToSpeechButton({ + text, + voice, + tier, + className = "", +}: TextToSpeechButtonProps): ReactElement { + const { synthesize, audioUrl, isLoading, error } = useTextToSpeech(); + + /** + * Handle read aloud button click + */ + const handleClick = useCallback(async (): Promise => { + let options: SynthesizeOptions | undefined; + + if (voice !== undefined || tier !== undefined) { + options = {}; + if (voice !== undefined) options.voice = voice; + if (tier !== undefined) options.tier = tier; + } + + await synthesize(text, options); + }, [text, voice, tier, synthesize]); + + return ( +
+ {/* Read Aloud Button */} + + + {/* Error Display */} + {error && ( +

+ {error} +

+ )} + + {/* Audio Player (shown after synthesis) */} + {audioUrl && } +
+ ); +} + +export default TextToSpeechButton; diff --git a/apps/web/src/components/speech/VoiceInput.test.tsx b/apps/web/src/components/speech/VoiceInput.test.tsx new file mode 100644 index 0000000..74c1f44 --- /dev/null +++ b/apps/web/src/components/speech/VoiceInput.test.tsx @@ -0,0 +1,228 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { VoiceInput } from "./VoiceInput"; + +// Mock the useVoiceInput hook +const mockStartRecording = vi.fn(); +const mockStopRecording = vi.fn(); + +vi.mock("@/hooks/useVoiceInput", () => ({ + useVoiceInput: vi.fn(() => ({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0, + })), +})); + +// We need to import after mocking +import { useVoiceInput } from "@/hooks/useVoiceInput"; + +describe("VoiceInput", (): void => { + beforeEach((): void => { + vi.clearAllMocks(); + // Reset mock implementation to default + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0, + }); + }); + + it("should render a microphone button", (): void => { + render(); + + const button = screen.getByRole("button", { + name: /start voice input/i, + }); + expect(button).toBeInTheDocument(); + }); + + it("should have accessible aria label", (): void => { + render(); + + const button = screen.getByRole("button", { + name: /start voice input/i, + }); + expect(button).toHaveAttribute("aria-label", "Start voice input"); + }); + + it("should call startRecording when mic button is clicked", async (): Promise => { + const user = userEvent.setup(); + render(); + + const button = screen.getByRole("button", { + name: /start voice input/i, + }); + await 
user.click(button); + + expect(mockStartRecording).toHaveBeenCalledTimes(1); + }); + + it("should show recording state when isRecording is true", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0.5, + }); + + render(); + + const button = screen.getByRole("button", { + name: /stop voice input/i, + }); + expect(button).toBeInTheDocument(); + }); + + it("should call stopRecording when mic button is clicked while recording", async (): Promise => { + const user = userEvent.setup(); + + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0.5, + }); + + render(); + + const button = screen.getByRole("button", { + name: /stop voice input/i, + }); + await user.click(button); + + expect(mockStopRecording).toHaveBeenCalledTimes(1); + }); + + it("should display partial transcription text", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "hello worl", + error: null, + audioLevel: 0.3, + }); + + render(); + + expect(screen.getByText("hello worl")).toBeInTheDocument(); + }); + + it("should display final transcript text", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "hello world", + partialTranscript: "", + error: null, + audioLevel: 0, + }); + + render(); + + expect(screen.getByText("hello world")).toBeInTheDocument(); + }); + + it("should display error message", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + 
stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: "Microphone access not available", + audioLevel: 0, + }); + + render(); + + expect(screen.getByText("Microphone access not available")).toBeInTheDocument(); + }); + + it("should call onTranscript callback prop", (): void => { + const onTranscript = vi.fn(); + + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "final text", + partialTranscript: "", + error: null, + audioLevel: 0, + }); + + render(); + + // The onTranscript prop is passed to the hook - we verify the prop is accepted + expect(useVoiceInput).toHaveBeenCalledWith( + expect.objectContaining({ + onTranscript, + }) + ); + }); + + it("should use calm, non-aggressive design for recording indicator", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: true, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: null, + audioLevel: 0.5, + }); + + render(); + + // Check there are no aggressive red colors in the recording state + const button = screen.getByRole("button", { name: /stop voice input/i }); + const className = button.className; + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + }); + + it("should use calm design for error display", (): void => { + vi.mocked(useVoiceInput).mockReturnValue({ + isRecording: false, + startRecording: mockStartRecording, + stopRecording: mockStopRecording, + transcript: "", + partialTranscript: "", + error: "Something went wrong", + audioLevel: 0, + }); + + render(); + + const errorEl = screen.getByText("Something went wrong"); + const className = errorEl.className; + expect(className).not.toMatch(/text-red-600|bg-red-/); + }); + + it("should be disabled when disabled prop is true", (): void => { + render(); + + const button = screen.getByRole("button", { + name: /start voice 
input/i, + }); + expect(button).toBeDisabled(); + }); +}); diff --git a/apps/web/src/components/speech/VoiceInput.tsx b/apps/web/src/components/speech/VoiceInput.tsx new file mode 100644 index 0000000..fa74e53 --- /dev/null +++ b/apps/web/src/components/speech/VoiceInput.tsx @@ -0,0 +1,146 @@ +/** + * VoiceInput component + * + * Provides a microphone button with visual feedback for voice input. + * Click to start/stop recording with real-time transcription display. + * + * Design principles: + * - PDA-friendly: calm, non-aggressive colors + * - Gentle pulsing animation for recording state (blue/green) + * - Mobile-friendly touch interaction + * - Accessible with proper aria labels + */ + +import { useVoiceInput } from "@/hooks/useVoiceInput"; +import type { UseVoiceInputOptions } from "@/hooks/useVoiceInput"; +import { AudioVisualizer } from "./AudioVisualizer"; +import { Mic, MicOff } from "lucide-react"; + +export interface VoiceInputProps { + /** Callback fired when final transcription is received */ + onTranscript?: (text: string) => void; + /** Whether to use WebSocket streaming (default: true) */ + useWebSocket?: boolean; + /** Whether the input is disabled */ + disabled?: boolean; + /** Additional CSS classes for the container */ + className?: string; +} + +/** + * Voice input component with microphone capture and real-time transcription. + * Shows a mic button that toggles recording, with visual feedback + * and transcription text display. 
+ */ +export function VoiceInput({ + onTranscript, + useWebSocket: useWs, + disabled = false, + className = "", +}: VoiceInputProps): React.JSX.Element { + const hookOptions: UseVoiceInputOptions = {}; + if (onTranscript !== undefined) { + hookOptions.onTranscript = onTranscript; + } + if (useWs !== undefined) { + hookOptions.useWebSocket = useWs; + } + + const { + isRecording, + startRecording, + stopRecording, + transcript, + partialTranscript, + error, + audioLevel, + } = useVoiceInput(hookOptions); + + const handleClick = (): void => { + if (isRecording) { + stopRecording(); + } else { + void startRecording(); + } + }; + + const displayText = isRecording ? partialTranscript : transcript; + + return ( +
+ {/* Mic button with recording indicator */} +
+ {/* Pulsing ring animation when recording */} + {isRecording && ( + + + {/* Recording status indicator */} + {isRecording && ( +
+
+ )} + + {/* Transcription text display */} + {displayText && ( +

+ {displayText} +

+ )} + + {/* Error display - calm, non-aggressive */} + {error && ( +

+ {error} +

+ )} +
+ ); +} diff --git a/apps/web/src/components/speech/index.ts b/apps/web/src/components/speech/index.ts new file mode 100644 index 0000000..657e410 --- /dev/null +++ b/apps/web/src/components/speech/index.ts @@ -0,0 +1,8 @@ +export { VoiceInput } from "./VoiceInput"; +export type { VoiceInputProps } from "./VoiceInput"; +export { AudioVisualizer } from "./AudioVisualizer"; +export type { AudioVisualizerProps } from "./AudioVisualizer"; +export { AudioPlayer } from "./AudioPlayer"; +export type { AudioPlayerProps } from "./AudioPlayer"; +export { TextToSpeechButton } from "./TextToSpeechButton"; +export type { TextToSpeechButtonProps } from "./TextToSpeechButton"; diff --git a/apps/web/src/hooks/useTextToSpeech.test.ts b/apps/web/src/hooks/useTextToSpeech.test.ts new file mode 100644 index 0000000..a6e1a0f --- /dev/null +++ b/apps/web/src/hooks/useTextToSpeech.test.ts @@ -0,0 +1,285 @@ +/** + * @file useTextToSpeech.test.ts + * @description Tests for the useTextToSpeech hook that manages TTS API integration + */ + +import { renderHook, act } from "@testing-library/react"; +import { describe, it, expect, beforeEach, vi, afterEach } from "vitest"; +import { useTextToSpeech } from "./useTextToSpeech"; +import * as speechApi from "@/lib/api/speech"; + +// Mock the speech API module +vi.mock("@/lib/api/speech", () => ({ + synthesizeSpeech: vi.fn(), + getVoices: vi.fn(), +})); + +// Mock URL.createObjectURL and URL.revokeObjectURL +const mockCreateObjectURL = vi.fn().mockReturnValue("blob:mock-audio-url"); +const mockRevokeObjectURL = vi.fn(); + +beforeEach(() => { + global.URL.createObjectURL = mockCreateObjectURL; + global.URL.revokeObjectURL = mockRevokeObjectURL; +}); + +// Mock HTMLAudioElement +class MockAudio { + src = ""; + currentTime = 0; + duration = 120; + paused = true; + playbackRate = 1; + volume = 1; + onended: (() => void) | null = null; + ontimeupdate: (() => void) | null = null; + onloadedmetadata: (() => void) | null = null; + onerror: ((e: unknown) => 
void) | null = null; + + play(): Promise { + this.paused = false; + return Promise.resolve(); + } + + pause(): void { + this.paused = true; + } + + addEventListener(event: string, handler: () => void): void { + if (event === "ended") this.onended = handler; + if (event === "timeupdate") this.ontimeupdate = handler; + if (event === "loadedmetadata") this.onloadedmetadata = handler; + if (event === "error") this.onerror = handler; + } + + removeEventListener(): void { + // no-op for tests + } +} + +vi.stubGlobal("Audio", MockAudio); + +const mockSynthesizeSpeech = speechApi.synthesizeSpeech as ReturnType; + +describe("useTextToSpeech", () => { + beforeEach(() => { + vi.clearAllMocks(); + mockCreateObjectURL.mockReturnValue("blob:mock-audio-url"); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + describe("initial state", () => { + it("should return correct initial interface", () => { + const { result } = renderHook(() => useTextToSpeech()); + + expect(result.current.synthesize).toBeTypeOf("function"); + expect(result.current.play).toBeTypeOf("function"); + expect(result.current.pause).toBeTypeOf("function"); + expect(result.current.stop).toBeTypeOf("function"); + expect(result.current.audioUrl).toBeNull(); + expect(result.current.isLoading).toBe(false); + expect(result.current.error).toBeNull(); + expect(result.current.isPlaying).toBe(false); + expect(result.current.duration).toBe(0); + expect(result.current.currentTime).toBe(0); + }); + }); + + describe("synthesize", () => { + it("should call API and return audio blob URL", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello world"); + }); + + expect(mockSynthesizeSpeech).toHaveBeenCalledWith({ + text: "Hello world", + }); + expect(result.current.audioUrl).toBe("blob:mock-audio-url"); + 
expect(result.current.isLoading).toBe(false); + expect(result.current.error).toBeNull(); + }); + + it("should pass voice and tier options to API", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello", { + voice: "alloy", + tier: "premium", + speed: 1.5, + }); + }); + + expect(mockSynthesizeSpeech).toHaveBeenCalledWith({ + text: "Hello", + voice: "alloy", + tier: "premium", + speed: 1.5, + }); + }); + + it("should set loading state while synthesizing", async () => { + let resolvePromise: ((value: Blob) => void) | undefined; + const pendingPromise = new Promise((resolve) => { + resolvePromise = resolve; + }); + mockSynthesizeSpeech.mockReturnValueOnce(pendingPromise); + + const { result } = renderHook(() => useTextToSpeech()); + + act(() => { + void result.current.synthesize("Hello"); + }); + + expect(result.current.isLoading).toBe(true); + + await act(async () => { + resolvePromise?.(new Blob(["audio"], { type: "audio/mpeg" })); + await pendingPromise; + }); + + expect(result.current.isLoading).toBe(false); + }); + + it("should handle API errors gracefully", async () => { + mockSynthesizeSpeech.mockRejectedValueOnce(new Error("Synthesis failed")); + + const { result } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello"); + }); + + expect(result.current.error).toBe("Synthesis failed"); + expect(result.current.isLoading).toBe(false); + expect(result.current.audioUrl).toBeNull(); + }); + + it("should cache audio for repeated synthesis of same text", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValue(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + // First call + await act(async () => { + await 
result.current.synthesize("Hello world"); + }); + + // Second call with same text + await act(async () => { + await result.current.synthesize("Hello world"); + }); + + // API should only be called once due to caching + expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(1); + }); + + it("should not cache when options differ", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValue(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello", { voice: "alloy" }); + }); + + await act(async () => { + await result.current.synthesize("Hello", { voice: "nova" }); + }); + + expect(mockSynthesizeSpeech).toHaveBeenCalledTimes(2); + }); + }); + + describe("playback controls", () => { + it("should play audio after synthesis", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello"); + }); + + await act(async () => { + await result.current.play(); + }); + + expect(result.current.isPlaying).toBe(true); + }); + + it("should pause audio playback", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello"); + }); + + await act(async () => { + await result.current.play(); + }); + + act(() => { + result.current.pause(); + }); + + expect(result.current.isPlaying).toBe(false); + }); + + it("should stop and reset playback", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob); + + const { result } = renderHook(() => useTextToSpeech()); + + 
await act(async () => { + await result.current.synthesize("Hello"); + }); + + await act(async () => { + await result.current.play(); + }); + + act(() => { + result.current.stop(); + }); + + expect(result.current.isPlaying).toBe(false); + expect(result.current.currentTime).toBe(0); + }); + }); + + describe("cleanup", () => { + it("should revoke object URLs on unmount", async () => { + const mockBlob = new Blob(["audio-data"], { type: "audio/mpeg" }); + mockSynthesizeSpeech.mockResolvedValueOnce(mockBlob); + + const { result, unmount } = renderHook(() => useTextToSpeech()); + + await act(async () => { + await result.current.synthesize("Hello"); + }); + + unmount(); + + expect(mockRevokeObjectURL).toHaveBeenCalled(); + }); + }); +}); diff --git a/apps/web/src/hooks/useTextToSpeech.ts b/apps/web/src/hooks/useTextToSpeech.ts new file mode 100644 index 0000000..cc04cc4 --- /dev/null +++ b/apps/web/src/hooks/useTextToSpeech.ts @@ -0,0 +1,239 @@ +/** + * useTextToSpeech hook + * Manages TTS API integration with synthesis, caching, and playback state + */ + +import { useState, useCallback, useRef, useEffect } from "react"; +import { synthesizeSpeech } from "@/lib/api/speech"; + +export interface SynthesizeOptions { + voice?: string; + speed?: number; + format?: string; + tier?: string; +} + +export interface UseTextToSpeechReturn { + /** Synthesize text to speech audio */ + synthesize: (text: string, options?: SynthesizeOptions) => Promise; + /** The URL of the synthesized audio blob */ + audioUrl: string | null; + /** Whether synthesis is in progress */ + isLoading: boolean; + /** Error message if synthesis failed */ + error: string | null; + /** Start or resume audio playback */ + play: () => Promise; + /** Pause audio playback */ + pause: () => void; + /** Stop audio and reset to beginning */ + stop: () => void; + /** Whether audio is currently playing */ + isPlaying: boolean; + /** Total duration of the audio in seconds */ + duration: number; + /** Current playback 
position in seconds */ + currentTime: number; +} + +/** Cache key generator for text + options combination */ +function getCacheKey(text: string, options?: SynthesizeOptions): string { + return JSON.stringify({ text, ...options }); +} + +/** + * Hook for text-to-speech API integration with caching and playback controls + */ +export function useTextToSpeech(): UseTextToSpeechReturn { + const [audioUrl, setAudioUrl] = useState(null); + const [isLoading, setIsLoading] = useState(false); + const [error, setError] = useState(null); + const [isPlaying, setIsPlaying] = useState(false); + const [duration, setDuration] = useState(0); + const [currentTime, setCurrentTime] = useState(0); + + // Audio element ref for playback control + const audioRef = useRef(null); + + // Cache: maps cache key -> blob URL + const cacheRef = useRef>(new Map()); + + // Track all blob URLs for cleanup + const blobUrlsRef = useRef>(new Set()); + + /** + * Clean up audio element event listeners and state + */ + const cleanupAudio = useCallback(() => { + const audio = audioRef.current; + if (audio) { + audio.pause(); + audio.removeEventListener("ended", handleEnded); + audio.removeEventListener("timeupdate", handleTimeUpdate); + audio.removeEventListener("loadedmetadata", handleLoadedMetadata); + audioRef.current = null; + } + setIsPlaying(false); + }, []); + + /** + * Handle audio ended event + */ + function handleEnded(): void { + setIsPlaying(false); + setCurrentTime(0); + } + + /** + * Handle audio time update event + */ + function handleTimeUpdate(): void { + const audio = audioRef.current; + if (audio) { + setCurrentTime(audio.currentTime); + } + } + + /** + * Handle audio metadata loaded event + */ + function handleLoadedMetadata(): void { + const audio = audioRef.current; + if (audio && isFinite(audio.duration)) { + setDuration(audio.duration); + } + } + + /** + * Set up a new Audio element for a given URL + */ + const setupAudio = useCallback( + (url: string) => { + cleanupAudio(); + + 
const audio = new Audio(url);
+      audio.addEventListener("ended", handleEnded);
+      audio.addEventListener("timeupdate", handleTimeUpdate);
+      audio.addEventListener("loadedmetadata", handleLoadedMetadata);
+      audioRef.current = audio;
+    },
+    [cleanupAudio]
+  );
+
+  /**
+   * Synthesize text to speech
+   */
+  const synthesize = useCallback(
+    async (text: string, options?: SynthesizeOptions): Promise<void> => {
+      setError(null);
+
+      // Check cache first
+      const cacheKey = getCacheKey(text, options);
+      const cachedUrl = cacheRef.current.get(cacheKey);
+
+      if (cachedUrl) {
+        setAudioUrl(cachedUrl);
+        setupAudio(cachedUrl);
+        return;
+      }
+
+      setIsLoading(true);
+
+      try {
+        const blob = await synthesizeSpeech({
+          text,
+          ...(options?.voice !== undefined && { voice: options.voice }),
+          ...(options?.speed !== undefined && { speed: options.speed }),
+          ...(options?.format !== undefined && { format: options.format }),
+          ...(options?.tier !== undefined && { tier: options.tier }),
+        });
+
+        const url = URL.createObjectURL(blob);
+
+        // Store in cache and track for cleanup
+        cacheRef.current.set(cacheKey, url);
+        blobUrlsRef.current.add(url);
+
+        setAudioUrl(url);
+        setupAudio(url);
+      } catch (err) {
+        const errorMsg = err instanceof Error ? 
err.message : "Speech synthesis failed";
+        setError(errorMsg);
+        setAudioUrl(null);
+      } finally {
+        setIsLoading(false);
+      }
+    },
+    [setupAudio]
+  );
+
+  /**
+   * Start or resume audio playback
+   */
+  const play = useCallback(async (): Promise<void> => {
+    const audio = audioRef.current;
+    if (audio) {
+      await audio.play();
+      setIsPlaying(true);
+    }
+  }, []);
+
+  /**
+   * Pause audio playback
+   */
+  const pause = useCallback((): void => {
+    const audio = audioRef.current;
+    if (audio) {
+      audio.pause();
+      setIsPlaying(false);
+    }
+  }, []);
+
+  /**
+   * Stop audio and reset to beginning
+   */
+  const stop = useCallback((): void => {
+    const audio = audioRef.current;
+    if (audio) {
+      audio.pause();
+      audio.currentTime = 0;
+      setIsPlaying(false);
+      setCurrentTime(0);
+    }
+  }, []);
+
+  // Cleanup on unmount: revoke all blob URLs and clean up audio
+  useEffect((): (() => void) => {
+    return (): void => {
+      // Clean up audio element
+      const audio = audioRef.current;
+      if (audio) {
+        audio.pause();
+        audio.removeEventListener("ended", handleEnded);
+        audio.removeEventListener("timeupdate", handleTimeUpdate);
+        audio.removeEventListener("loadedmetadata", handleLoadedMetadata);
+        audioRef.current = null;
+      }
+
+      // Revoke all blob URLs
+      for (const url of blobUrlsRef.current) {
+        URL.revokeObjectURL(url);
+      }
+      blobUrlsRef.current.clear();
+      cacheRef.current.clear();
+    };
+  }, []);
+
+  return {
+    synthesize,
+    audioUrl,
+    isLoading,
+    error,
+    play,
+    pause,
+    stop,
+    isPlaying,
+    duration,
+    currentTime,
+  };
+}
diff --git a/apps/web/src/hooks/useVoiceInput.test.ts b/apps/web/src/hooks/useVoiceInput.test.ts
new file mode 100644
index 0000000..4f80a34
--- /dev/null
+++ b/apps/web/src/hooks/useVoiceInput.test.ts
@@ -0,0 +1,362 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import { renderHook, act, waitFor } from "@testing-library/react";
+import { useVoiceInput } from "./useVoiceInput";
+import type { Socket } from "socket.io-client";
+import { io } 
from "socket.io-client"; + +// Mock socket.io-client +vi.mock("socket.io-client"); + +// Mock MediaRecorder +const mockMediaRecorder = { + start: vi.fn(), + stop: vi.fn(), + pause: vi.fn(), + resume: vi.fn(), + state: "inactive" as RecordingState, + ondataavailable: null as ((event: BlobEvent) => void) | null, + onstop: null as (() => void) | null, + onerror: null as ((event: Event) => void) | null, + addEventListener: vi.fn((event: string, handler: EventListenerOrEventListenerObject) => { + if (event === "dataavailable") { + mockMediaRecorder.ondataavailable = handler as (event: BlobEvent) => void; + } else if (event === "stop") { + mockMediaRecorder.onstop = handler as () => void; + } else if (event === "error") { + mockMediaRecorder.onerror = handler as (event: Event) => void; + } + }), + removeEventListener: vi.fn(), + stream: { + getTracks: vi.fn(() => [{ stop: vi.fn() }]), + }, +}; + +// Mock MediaStream with getByteFrequencyData for audio level +const mockAnalyserNode = { + fftSize: 256, + frequencyBinCount: 128, + getByteFrequencyData: vi.fn((array: Uint8Array) => { + // Simulate some audio data + for (let i = 0; i < array.length; i++) { + array[i] = 128; + } + }), + connect: vi.fn(), + disconnect: vi.fn(), +}; + +const mockMediaStreamSource = { + connect: vi.fn(), + disconnect: vi.fn(), +}; + +const mockAudioContext = { + createAnalyser: vi.fn(() => mockAnalyserNode), + createMediaStreamSource: vi.fn(() => mockMediaStreamSource), + close: vi.fn(), + state: "running", +}; + +// Mock getUserMedia +const mockGetUserMedia = vi.fn(); + +// Set up global mocks +Object.defineProperty(global.navigator, "mediaDevices", { + value: { + getUserMedia: mockGetUserMedia, + }, + writable: true, + configurable: true, +}); + +// Mock AudioContext +vi.stubGlobal( + "AudioContext", + vi.fn(() => mockAudioContext) +); + +// Mock MediaRecorder constructor +vi.stubGlobal( + "MediaRecorder", + vi.fn(() => mockMediaRecorder) +); + +// Add isTypeSupported static method +( + 
global.MediaRecorder as unknown as { isTypeSupported: (type: string) => boolean }
+).isTypeSupported = vi.fn(() => true);
+
+describe("useVoiceInput", (): void => {
+  let mockSocket: Partial<Socket>;
+  let socketEventHandlers: Record<string, (...args: unknown[]) => void>;
+
+  beforeEach((): void => {
+    socketEventHandlers = {};
+
+    mockSocket = {
+      on: vi.fn((event: string, handler: (...args: unknown[]) => void) => {
+        socketEventHandlers[event] = handler;
+        return mockSocket;
+      }) as unknown as Socket["on"],
+      off: vi.fn(() => mockSocket) as unknown as Socket["off"],
+      emit: vi.fn() as unknown as Socket["emit"],
+      connect: vi.fn(),
+      disconnect: vi.fn(),
+      connected: true,
+    };
+
+    (io as unknown as ReturnType<typeof vi.fn>).mockReturnValue(mockSocket);
+
+    // Reset MediaRecorder mock state
+    mockMediaRecorder.state = "inactive";
+    mockMediaRecorder.ondataavailable = null;
+    mockMediaRecorder.onstop = null;
+    mockMediaRecorder.onerror = null;
+
+    // Default: getUserMedia succeeds
+    const mockStream = {
+      getTracks: vi.fn(() => [{ stop: vi.fn() }]),
+    } as unknown as MediaStream;
+    mockGetUserMedia.mockResolvedValue(mockStream);
+  });
+
+  afterEach((): void => {
+    vi.clearAllMocks();
+  });
+
+  it("should return the correct interface", (): void => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    expect(result.current).toHaveProperty("isRecording");
+    expect(result.current).toHaveProperty("startRecording");
+    expect(result.current).toHaveProperty("stopRecording");
+    expect(result.current).toHaveProperty("transcript");
+    expect(result.current).toHaveProperty("partialTranscript");
+    expect(result.current).toHaveProperty("error");
+    expect(result.current).toHaveProperty("audioLevel");
+  });
+
+  it("should start with default state", (): void => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    expect(result.current.isRecording).toBe(false);
+    expect(result.current.transcript).toBe("");
+    expect(result.current.partialTranscript).toBe("");
+    expect(result.current.error).toBeNull();
+    
expect(result.current.audioLevel).toBe(0);
+  });
+
+  it("should start recording when startRecording is called", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    expect(result.current.isRecording).toBe(true);
+    expect(mockGetUserMedia).toHaveBeenCalledWith({
+      audio: {
+        echoCancellation: true,
+        noiseSuppression: true,
+        sampleRate: 16000,
+      },
+    });
+  });
+
+  it("should stop recording when stopRecording is called", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    expect(result.current.isRecording).toBe(true);
+
+    act(() => {
+      result.current.stopRecording();
+    });
+
+    expect(result.current.isRecording).toBe(false);
+  });
+
+  it("should set error when microphone access is denied", async (): Promise<void> => {
+    mockGetUserMedia.mockRejectedValueOnce(
+      new DOMException("Permission denied", "NotAllowedError")
+    );
+
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    expect(result.current.isRecording).toBe(false);
+    expect(result.current.error).toBeTruthy();
+    expect(result.current.error).toContain("microphone");
+  });
+
+  it("should connect to speech WebSocket namespace", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    expect(io).toHaveBeenCalledWith(
+      expect.any(String),
+      expect.objectContaining({
+        path: "/socket.io",
+      })
+    );
+  });
+
+  it("should emit start-transcription when recording begins", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    expect(mockSocket.emit).toHaveBeenCalledWith(
+      "start-transcription",
+      expect.objectContaining({
+        // 
eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
+        format: expect.any(String),
+      })
+    );
+  });
+
+  it("should emit stop-transcription when recording stops", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    act(() => {
+      result.current.stopRecording();
+    });
+
+    expect(mockSocket.emit).toHaveBeenCalledWith("stop-transcription");
+  });
+
+  it("should handle partial transcription events", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    act(() => {
+      socketEventHandlers["transcription-partial"]?.({
+        text: "hello world",
+      });
+    });
+
+    await waitFor(() => {
+      expect(result.current.partialTranscript).toBe("hello world");
+    });
+  });
+
+  it("should handle final transcription events", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    act(() => {
+      socketEventHandlers["transcription-final"]?.({
+        text: "hello world final",
+      });
+    });
+
+    await waitFor(() => {
+      expect(result.current.transcript).toBe("hello world final");
+    });
+  });
+
+  it("should handle transcription error events", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    act(() => {
+      socketEventHandlers["transcription-error"]?.({
+        message: "Transcription failed",
+      });
+    });
+
+    await waitFor(() => {
+      expect(result.current.error).toBe("Transcription failed");
+    });
+  });
+
+  it("should call onTranscript callback when final transcription received", async (): Promise<void> => {
+    const onTranscript = vi.fn();
+    const { result } = renderHook(() => useVoiceInput({ onTranscript }));
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    act(() => {
+      
socketEventHandlers["transcription-final"]?.({
+        text: "final text",
+      });
+    });
+
+    await waitFor(() => {
+      expect(onTranscript).toHaveBeenCalledWith("final text");
+    });
+  });
+
+  it("should clean up on unmount", async (): Promise<void> => {
+    const { result, unmount } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    unmount();
+
+    expect(mockSocket.disconnect).toHaveBeenCalled();
+  });
+
+  it("should not start recording if already recording", async (): Promise<void> => {
+    const { result } = renderHook(() => useVoiceInput());
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    // Reset the call count
+    mockGetUserMedia.mockClear();
+
+    await act(async () => {
+      await result.current.startRecording();
+    });
+
+    // Should not have called getUserMedia again
+    expect(mockGetUserMedia).not.toHaveBeenCalled();
+  });
+
+  describe("REST fallback", (): void => {
+    it("should fall back to REST when WebSocket is unavailable", async (): Promise<void> => {
+      // Simulate socket not connecting
+      (mockSocket as { connected: boolean }).connected = false;
+
+      const { result } = renderHook(() => useVoiceInput({ useWebSocket: false }));
+
+      // Should still be able to start recording (REST mode)
+      await act(async () => {
+        await result.current.startRecording();
+      });
+
+      expect(result.current.isRecording).toBe(true);
+    });
+  });
+});
diff --git a/apps/web/src/hooks/useVoiceInput.ts b/apps/web/src/hooks/useVoiceInput.ts
new file mode 100644
index 0000000..24e792d
--- /dev/null
+++ b/apps/web/src/hooks/useVoiceInput.ts
@@ -0,0 +1,409 @@
+/**
+ * useVoiceInput hook
+ *
+ * Custom hook for microphone capture and real-time transcription.
+ * Supports WebSocket streaming for real-time partial transcriptions
+ * with REST upload fallback when WebSocket is unavailable. 
+ */
+
+import { useState, useCallback, useRef, useEffect } from "react";
+import type { Socket } from "socket.io-client";
+import { io } from "socket.io-client";
+import { API_BASE_URL } from "@/lib/config";
+import { apiPostFormData } from "@/lib/api/client";
+
+/** Options for the useVoiceInput hook */
+export interface UseVoiceInputOptions {
+  /** Callback fired when final transcription is received */
+  onTranscript?: (text: string) => void;
+  /** Whether to use WebSocket streaming (default: true) */
+  useWebSocket?: boolean;
+  /** Audio sample rate in Hz (default: 16000) */
+  sampleRate?: number;
+}
+
+/** Return type for the useVoiceInput hook */
+export interface UseVoiceInputReturn {
+  /** Whether the microphone is currently recording */
+  isRecording: boolean;
+  /** Start microphone capture and transcription */
+  startRecording: () => Promise<void>;
+  /** Stop microphone capture and transcription */
+  stopRecording: () => void;
+  /** The final transcription text */
+  transcript: string;
+  /** Partial transcription text (updates in real-time) */
+  partialTranscript: string;
+  /** Error message if something went wrong */
+  error: string | null;
+  /** Current audio input level (0-1) */
+  audioLevel: number;
+}
+
+interface TranscriptionPartialPayload {
+  text: string;
+}
+
+interface TranscriptionFinalPayload {
+  text: string;
+}
+
+interface TranscriptionErrorPayload {
+  message: string;
+}
+
+interface TranscribeResponse {
+  data: {
+    text: string;
+  };
+}
+
+/**
+ * Determine the best MIME type for audio recording
+ */
+function getAudioMimeType(): string {
+  if (typeof MediaRecorder === "undefined") {
+    return "audio/webm";
+  }
+  const types = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/mp4"];
+  for (const type of types) {
+    if (MediaRecorder.isTypeSupported(type)) {
+      return type;
+    }
+  }
+  return "audio/webm";
+}
+
+/**
+ * Hook for microphone capture and real-time speech-to-text transcription. 
+ *
+ * Uses WebSocket streaming by default for real-time partial transcriptions.
+ * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket
+ * is disabled or unavailable.
+ */
+export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn {
+  const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options;
+
+  const [isRecording, setIsRecording] = useState(false);
+  const [transcript, setTranscript] = useState("");
+  const [partialTranscript, setPartialTranscript] = useState("");
+  const [error, setError] = useState<string | null>(null);
+  const [audioLevel, setAudioLevel] = useState(0);
+
+  // Refs to hold mutable state without re-renders
+  const socketRef = useRef<Socket | null>(null);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const analyserRef = useRef<AnalyserNode | null>(null);
+  const animationFrameRef = useRef<number | null>(null);
+  const onTranscriptRef = useRef(onTranscript);
+  const recordedChunksRef = useRef<Blob[]>([]);
+  const isRecordingRef = useRef(false);
+
+  // Keep callback ref up to date
+  useEffect(() => {
+    onTranscriptRef.current = onTranscript;
+  }, [onTranscript]);
+
+  /**
+   * Set up audio analysis for visualizing input level
+   */
+  const setupAudioAnalysis = useCallback((stream: MediaStream): void => {
+    try {
+      const audioContext = new AudioContext();
+      const analyser = audioContext.createAnalyser();
+      const source = audioContext.createMediaStreamSource(stream);
+
+      analyser.fftSize = 256;
+      source.connect(analyser);
+
+      audioContextRef.current = audioContext;
+      analyserRef.current = analyser;
+
+      // Start level monitoring
+      const dataArray = new Uint8Array(analyser.frequencyBinCount);
+
+      const updateLevel = (): void => {
+        if (!isRecordingRef.current) {
+          return;
+        }
+
+        analyser.getByteFrequencyData(dataArray);
+
+        // Calculate average level
+        let sum = 0;
+        for (const value of dataArray) {
+          sum += value;
+        }
+        const average = sum / dataArray.length / 255;
+        
setAudioLevel(average);
+
+        animationFrameRef.current = requestAnimationFrame(updateLevel);
+      };
+
+      animationFrameRef.current = requestAnimationFrame(updateLevel);
+    } catch {
+      // Audio analysis is non-critical; continue without it
+      console.warn("Audio analysis not available");
+    }
+  }, []);
+
+  /**
+   * Clean up audio analysis resources
+   */
+  const cleanupAudioAnalysis = useCallback((): void => {
+    if (animationFrameRef.current !== null) {
+      cancelAnimationFrame(animationFrameRef.current);
+      animationFrameRef.current = null;
+    }
+    if (audioContextRef.current) {
+      void audioContextRef.current.close();
+      audioContextRef.current = null;
+    }
+    analyserRef.current = null;
+    setAudioLevel(0);
+  }, []);
+
+  /**
+   * Connect to the speech WebSocket namespace
+   */
+  const connectSocket = useCallback((): Socket => {
+    const socket = io(API_BASE_URL, {
+      path: "/socket.io",
+      transports: ["websocket", "polling"],
+    });
+
+    socket.on("transcription-partial", (data: TranscriptionPartialPayload) => {
+      setPartialTranscript(data.text);
+    });
+
+    socket.on("transcription-final", (data: TranscriptionFinalPayload) => {
+      setTranscript(data.text);
+      setPartialTranscript("");
+      onTranscriptRef.current?.(data.text);
+    });
+
+    socket.on("transcription-error", (data: TranscriptionErrorPayload) => {
+      setError(data.message);
+    });
+
+    socketRef.current = socket;
+    return socket;
+  }, []);
+
+  /**
+   * Disconnect the WebSocket
+   */
+  const disconnectSocket = useCallback((): void => {
+    if (socketRef.current) {
+      socketRef.current.off("transcription-partial");
+      socketRef.current.off("transcription-final");
+      socketRef.current.off("transcription-error");
+      socketRef.current.disconnect();
+      socketRef.current = null;
+    }
+  }, []);
+
+  /**
+   * Send recorded audio via REST API as fallback
+   */
+  const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise<void> => {
+    try {
+      const formData = new FormData();
+      formData.append("audio", audioBlob, "recording.webm");
+
+      const response = 
await apiPostFormData<TranscribeResponse>(
+        "/api/speech/transcribe",
+        formData
+      );
+
+      if (response.data.text) {
+        setTranscript(response.data.text);
+        setPartialTranscript("");
+        onTranscriptRef.current?.(response.data.text);
+      }
+    } catch (err) {
+      const message = err instanceof Error ? err.message : "Transcription request failed";
+      setError(message);
+    }
+  }, []);
+
+  /**
+   * Stop all media tracks on the stream
+   */
+  const stopMediaTracks = useCallback((): void => {
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach((track) => {
+        track.stop();
+      });
+      streamRef.current = null;
+    }
+  }, []);
+
+  /**
+   * Start microphone capture and transcription
+   */
+  const startRecording = useCallback(async (): Promise<void> => {
+    // Prevent double-start
+    if (isRecordingRef.current) {
+      return;
+    }
+
+    setError(null);
+    setPartialTranscript("");
+    recordedChunksRef.current = [];
+
+    try {
+      // Request microphone access
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          echoCancellation: true,
+          noiseSuppression: true,
+          sampleRate,
+        },
+      });
+
+      streamRef.current = stream;
+
+      // Set up audio level visualization
+      setupAudioAnalysis(stream);
+
+      // Determine MIME type
+      const mimeType = getAudioMimeType();
+
+      // Create MediaRecorder
+      const mediaRecorder = new MediaRecorder(stream, { mimeType });
+      mediaRecorderRef.current = mediaRecorder;
+
+      // Connect WebSocket if enabled
+      let socket: Socket | null = null;
+      if (useWs) {
+        socket = connectSocket();
+
+        // Emit start-transcription event
+        socket.emit("start-transcription", {
+          format: mimeType,
+          sampleRate,
+        });
+      }
+
+      // Handle audio data chunks
+      mediaRecorder.addEventListener("dataavailable", (event: BlobEvent) => {
+        if (event.data.size > 0) {
+          if (socket?.connected) {
+            // Stream chunks via WebSocket
+            socket.emit("audio-chunk", event.data);
+          } else {
+            // Collect chunks for REST upload
+            recordedChunksRef.current.push(event.data);
+          }
+        }
+      });
+
+      // Handle recording stop
+      
mediaRecorder.addEventListener("stop", () => { + // If using REST fallback, send collected audio + if (!useWs || !socket?.connected) { + if (recordedChunksRef.current.length > 0) { + const audioBlob = new Blob(recordedChunksRef.current, { + type: mimeType, + }); + void sendAudioViaRest(audioBlob); + } + } + }); + + // Handle errors + mediaRecorder.addEventListener("error", () => { + setError("Recording encountered an issue. Please try again."); + setIsRecording(false); + isRecordingRef.current = false; + }); + + // Start recording with timeslice for streaming chunks (250ms intervals) + mediaRecorder.start(250); + setIsRecording(true); + isRecordingRef.current = true; + } catch (err) { + // Handle specific error types + if (err instanceof DOMException) { + if (err.name === "NotAllowedError") { + setError( + "Microphone access was not granted. Please allow microphone access to use voice input." + ); + } else if (err.name === "NotFoundError") { + setError("No microphone found. Please connect a microphone and try again."); + } else { + setError("Unable to access the microphone. Please check your device settings."); + } + } else { + setError("Unable to start voice input. 
Please try again."); + } + + // Clean up on failure + stopMediaTracks(); + cleanupAudioAnalysis(); + } + }, [ + useWs, + sampleRate, + setupAudioAnalysis, + connectSocket, + sendAudioViaRest, + stopMediaTracks, + cleanupAudioAnalysis, + ]); + + /** + * Stop microphone capture and transcription + */ + const stopRecording = useCallback((): void => { + setIsRecording(false); + isRecordingRef.current = false; + + // Stop MediaRecorder + if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { + mediaRecorderRef.current.stop(); + mediaRecorderRef.current = null; + } + + // Stop media tracks + stopMediaTracks(); + + // Clean up audio analysis + cleanupAudioAnalysis(); + + // Emit stop event and disconnect WebSocket + if (socketRef.current) { + socketRef.current.emit("stop-transcription"); + // Give the server a moment to process the final chunk before disconnecting + setTimeout(() => { + disconnectSocket(); + }, 500); + } + }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]); + + // Cleanup on unmount + useEffect(() => { + return (): void => { + isRecordingRef.current = false; + if (mediaRecorderRef.current && mediaRecorderRef.current.state !== "inactive") { + mediaRecorderRef.current.stop(); + } + stopMediaTracks(); + cleanupAudioAnalysis(); + disconnectSocket(); + }; + }, [stopMediaTracks, cleanupAudioAnalysis, disconnectSocket]); + + return { + isRecording, + startRecording, + stopRecording, + transcript, + partialTranscript, + error, + audioLevel, + }; +} diff --git a/apps/web/src/lib/api/speech.ts b/apps/web/src/lib/api/speech.ts new file mode 100644 index 0000000..cf5aeef --- /dev/null +++ b/apps/web/src/lib/api/speech.ts @@ -0,0 +1,58 @@ +/** + * Speech API client + * Handles text-to-speech synthesis and voice listing via /api/speech + */ + +import { apiGet } from "./client"; +import { API_BASE_URL } from "../config"; + +export interface VoiceInfo { + id: string; + name: string; + language: string; + gender?: string; + 
preview_url?: string;
+}
+
+export interface SynthesizeOptions {
+  text: string;
+  voice?: string;
+  speed?: number;
+  format?: string;
+  tier?: string;
+}
+
+export interface VoicesResponse {
+  data: VoiceInfo[];
+}
+
+/**
+ * Fetch available TTS voices
+ */
+export async function getVoices(): Promise<VoicesResponse> {
+  return apiGet<VoicesResponse>("/api/speech/voices");
+}
+
+/**
+ * Synthesize text to speech audio
+ * Returns the audio as a Blob since the API returns binary audio data
+ */
+export async function synthesizeSpeech(options: SynthesizeOptions): Promise<Blob> {
+  const url = `${API_BASE_URL}/api/speech/synthesize`;
+
+  const response = await fetch(url, {
+    method: "POST",
+    headers: {
+      "Content-Type": "application/json",
+    },
+    credentials: "include",
+    body: JSON.stringify(options),
+  });
+
+  if (!response.ok) {
+    const errorText = await response.text().catch(() => "Unknown error");
+    throw new Error(`Speech synthesis failed: ${errorText}`);
+  }

+  return response.blob();
+}

From bc86947d01c8e83cb422710e1e53cbe76c04a117 Mon Sep 17 00:00:00 2001
From: Jason Woltje
Date: Sun, 15 Feb 2026 03:16:27 -0600
Subject: [PATCH 15/19] feat(#404): add speech settings page with provider config

Implements the SpeechSettings component with four sections:
- STT settings (enable/disable, language preference)
- TTS settings (enable/disable, voice selector, tier preference, auto-play, speed control)
- Voice preview with test button
- Provider status with health indicators

Also adds Slider UI component and getHealthStatus API client function.

30 unit tests covering all sections, toggles, voice loading, and PDA-friendly design.

Fixes #404 Co-Authored-By: Claude Opus 4.6 --- .../components/speech/SpeechSettings.test.tsx | 439 ++++++++++++++++++ .../src/components/speech/SpeechSettings.tsx | 404 ++++++++++++++++ apps/web/src/components/ui/slider.tsx | 55 +++ apps/web/src/lib/api/speech.ts | 28 +- 4 files changed, 924 insertions(+), 2 deletions(-) create mode 100644 apps/web/src/components/speech/SpeechSettings.test.tsx create mode 100644 apps/web/src/components/speech/SpeechSettings.tsx create mode 100644 apps/web/src/components/ui/slider.tsx diff --git a/apps/web/src/components/speech/SpeechSettings.test.tsx b/apps/web/src/components/speech/SpeechSettings.test.tsx new file mode 100644 index 0000000..735ba24 --- /dev/null +++ b/apps/web/src/components/speech/SpeechSettings.test.tsx @@ -0,0 +1,439 @@ +/** + * @file SpeechSettings.test.tsx + * @description Tests for the SpeechSettings component + * + * Validates all settings sections: STT, TTS, Voice Preview, Provider Status. + * Follows TDD: tests written before implementation. 
+ *
+ * Issue #404
+ */
+
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { render, screen, waitFor, within } from "@testing-library/react";
+import userEvent from "@testing-library/user-event";
+import { SpeechSettings } from "./SpeechSettings";
+
+// Mock the speech API
+const mockGetVoices = vi.fn();
+const mockGetHealthStatus = vi.fn();
+const mockSynthesizeSpeech = vi.fn();
+
+vi.mock("@/lib/api/speech", () => ({
+  getVoices: (...args: unknown[]): unknown => mockGetVoices(...args) as unknown,
+  getHealthStatus: (...args: unknown[]): unknown => mockGetHealthStatus(...args) as unknown,
+  synthesizeSpeech: (...args: unknown[]): unknown => mockSynthesizeSpeech(...args) as unknown,
+}));
+
+// Mock the useTextToSpeech hook for voice preview
+const mockSynthesize = vi.fn();
+
+vi.mock("@/hooks/useTextToSpeech", () => ({
+  useTextToSpeech: vi.fn(() => ({
+    synthesize: mockSynthesize,
+    audioUrl: null,
+    isLoading: false,
+    error: null,
+    play: vi.fn(),
+    pause: vi.fn(),
+    stop: vi.fn(),
+    isPlaying: false,
+    duration: 0,
+    currentTime: 0,
+  })),
+}));
+
+// Mock HTMLAudioElement for AudioPlayer used inside preview
+class MockAudio {
+  src = "";
+  currentTime = 0;
+  duration = 60;
+  paused = true;
+  playbackRate = 1;
+  volume = 1;
+  onended: (() => void) | null = null;
+  ontimeupdate: (() => void) | null = null;
+  onloadedmetadata: (() => void) | null = null;
+  onerror: ((e: unknown) => void) | null = null;
+
+  play(): Promise<void> {
+    this.paused = false;
+    return Promise.resolve();
+  }
+
+  pause(): void {
+    this.paused = true;
+  }
+
+  addEventListener(): void {
+    // no-op
+  }
+
+  removeEventListener(): void {
+    // no-op
+  }
+}
+
+vi.stubGlobal("Audio", MockAudio);
+
+// Default mock responses
+const mockVoicesResponse = {
+  data: [
+    { id: "voice-1", name: "Alloy", language: "en", tier: "default", isDefault: true },
+    { id: "voice-2", name: "Nova", language: "en", tier: "default", isDefault: false },
+    { id: "voice-3", name: "Premium Voice", 
language: "en", tier: "premium", isDefault: true }, + ], +}; + +const mockHealthResponse = { + data: { + stt: { available: true }, + tts: { available: true }, + }, +}; + +describe("SpeechSettings", () => { + beforeEach(() => { + vi.clearAllMocks(); + mockGetVoices.mockResolvedValue(mockVoicesResponse); + mockGetHealthStatus.mockResolvedValue(mockHealthResponse); + mockSynthesizeSpeech.mockResolvedValue(new Blob()); + }); + + describe("rendering", () => { + it("should render the speech settings heading", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Speech Settings")).toBeInTheDocument(); + }); + }); + + it("should render the STT settings section", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Speech-to-Text")).toBeInTheDocument(); + }); + }); + + it("should render the TTS settings section", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Text-to-Speech")).toBeInTheDocument(); + }); + }); + + it("should render the provider status section", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Provider Status")).toBeInTheDocument(); + }); + }); + + it("should render all four section cards", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Speech-to-Text")).toBeInTheDocument(); + expect(screen.getByText("Text-to-Speech")).toBeInTheDocument(); + expect(screen.getByText("Voice Preview")).toBeInTheDocument(); + expect(screen.getByText("Provider Status")).toBeInTheDocument(); + }); + }); + }); + + describe("STT settings", () => { + it("should render an enable/disable toggle for STT", async () => { + render(); + + await waitFor(() => { + const sttToggle = screen.getByRole("switch", { name: /enable speech-to-text/i }); + expect(sttToggle).toBeInTheDocument(); + }); + }); + + it("should render a language preference dropdown", async () => { + render(); + + await waitFor(() => { + 
expect(screen.getByText("Language")).toBeInTheDocument(); + }); + }); + + it("should toggle STT enabled state when clicked", async () => { + const user = userEvent.setup(); + render(); + + await waitFor(() => { + expect(screen.getByRole("switch", { name: /enable speech-to-text/i })).toBeInTheDocument(); + }); + + const sttToggle = screen.getByRole("switch", { name: /enable speech-to-text/i }); + // Default should be checked (enabled) + expect(sttToggle).toBeChecked(); + + await user.click(sttToggle); + expect(sttToggle).not.toBeChecked(); + }); + }); + + describe("TTS settings", () => { + it("should render an enable/disable toggle for TTS", async () => { + render(); + + await waitFor(() => { + const ttsToggle = screen.getByRole("switch", { name: /enable text-to-speech/i }); + expect(ttsToggle).toBeInTheDocument(); + }); + }); + + it("should render a voice selector", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Default Voice")).toBeInTheDocument(); + }); + }); + + it("should render a tier preference selector", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Provider Tier")).toBeInTheDocument(); + }); + }); + + it("should render an auto-play toggle", async () => { + render(); + + await waitFor(() => { + const autoPlayToggle = screen.getByRole("switch", { name: /auto-play/i }); + expect(autoPlayToggle).toBeInTheDocument(); + }); + }); + + it("should render a speed control slider", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Speed")).toBeInTheDocument(); + const slider = screen.getByRole("slider"); + expect(slider).toBeInTheDocument(); + }); + }); + + it("should display the current speed value", async () => { + render(); + + await waitFor(() => { + // The speed display label shows "1.0x" next to the Speed label + const speedLabels = screen.getAllByText("1.0x"); + expect(speedLabels.length).toBeGreaterThanOrEqual(1); + }); + }); + + it("should toggle TTS enabled state 
when clicked", async () => { + const user = userEvent.setup(); + render(); + + await waitFor(() => { + expect(screen.getByRole("switch", { name: /enable text-to-speech/i })).toBeInTheDocument(); + }); + + const ttsToggle = screen.getByRole("switch", { name: /enable text-to-speech/i }); + expect(ttsToggle).toBeChecked(); + + await user.click(ttsToggle); + expect(ttsToggle).not.toBeChecked(); + }); + + it("should toggle auto-play state when clicked", async () => { + const user = userEvent.setup(); + render(); + + await waitFor(() => { + expect(screen.getByRole("switch", { name: /auto-play/i })).toBeInTheDocument(); + }); + + const autoPlayToggle = screen.getByRole("switch", { name: /auto-play/i }); + // Default should be unchecked + expect(autoPlayToggle).not.toBeChecked(); + + await user.click(autoPlayToggle); + expect(autoPlayToggle).toBeChecked(); + }); + }); + + describe("voice selector", () => { + it("should fetch voices on mount", async () => { + render(); + + await waitFor(() => { + expect(mockGetVoices).toHaveBeenCalled(); + }); + }); + + it("should display voice options after fetching", async () => { + const user = userEvent.setup(); + render(); + + await waitFor(() => { + expect(mockGetVoices).toHaveBeenCalled(); + }); + + // Open the voice selector by clicking the trigger button (id="tts-voice") + const voiceButton = document.getElementById("tts-voice"); + expect(voiceButton).toBeTruthy(); + if (!voiceButton) throw new Error("Voice button not found"); + await user.click(voiceButton); + + await waitFor(() => { + expect(screen.getByText("Alloy")).toBeInTheDocument(); + expect(screen.getByText("Nova")).toBeInTheDocument(); + }); + }); + + it("should handle API error gracefully when fetching voices", async () => { + mockGetVoices.mockRejectedValueOnce(new Error("Network error")); + + render(); + + await waitFor(() => { + expect(screen.getByText(/unable to load voices/i)).toBeInTheDocument(); + }); + }); + }); + + describe("voice preview", () => { + it("should 
render a voice preview section", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Voice Preview")).toBeInTheDocument(); + }); + }); + + it("should render a test button for voice preview", async () => { + render(); + + await waitFor(() => { + const testButton = screen.getByRole("button", { name: /test voice/i }); + expect(testButton).toBeInTheDocument(); + }); + }); + }); + + describe("provider status", () => { + it("should fetch health status on mount", async () => { + render(); + + await waitFor(() => { + expect(mockGetHealthStatus).toHaveBeenCalled(); + }); + }); + + it("should display STT provider status", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Speech-to-Text Provider")).toBeInTheDocument(); + }); + }); + + it("should display TTS provider status", async () => { + render(); + + await waitFor(() => { + expect(screen.getByText("Text-to-Speech Provider")).toBeInTheDocument(); + }); + }); + + it("should show active indicator when provider is available", async () => { + render(); + + await waitFor(() => { + const statusSection = screen.getByTestId("provider-status"); + const activeIndicators = within(statusSection).getAllByTestId("status-active"); + expect(activeIndicators.length).toBe(2); + }); + }); + + it("should show inactive indicator when provider is unavailable", async () => { + mockGetHealthStatus.mockResolvedValueOnce({ + data: { + stt: { available: false }, + tts: { available: true }, + }, + }); + + render(); + + await waitFor(() => { + const statusSection = screen.getByTestId("provider-status"); + const inactiveIndicators = within(statusSection).getAllByTestId("status-inactive"); + expect(inactiveIndicators.length).toBe(1); + }); + }); + + it("should handle health check error gracefully", async () => { + mockGetHealthStatus.mockRejectedValueOnce(new Error("Service unavailable")); + + render(); + + await waitFor(() => { + expect(screen.getByText(/unable to check provider 
status/i)).toBeInTheDocument(); + }); + }); + }); + + describe("PDA-friendly design", () => { + it("should not use aggressive red colors", async () => { + const { container } = render(); + + await waitFor(() => { + expect(screen.getByText("Speech Settings")).toBeInTheDocument(); + }); + + const allElements = container.querySelectorAll("*"); + allElements.forEach((el) => { + const className = el.className; + if (typeof className === "string") { + expect(className).not.toMatch(/bg-red-|text-red-|border-red-/); + } + }); + }); + + it("should not use demanding language", async () => { + const { container } = render(); + + await waitFor(() => { + expect(screen.getByText("Speech Settings")).toBeInTheDocument(); + }); + + const text = container.textContent; + const demandingWords = [ + "OVERDUE", + "URGENT", + "MUST DO", + "CRITICAL", + "REQUIRED", + "YOU NEED TO", + ]; + for (const word of demandingWords) { + expect(text.toUpperCase()).not.toContain(word); + } + }); + + it("should use descriptive section headers", async () => { + render(); + + await waitFor(() => { + // Check for descriptive subtext under section headers + expect(screen.getByText("Configure voice input preferences")).toBeInTheDocument(); + expect(screen.getByText("Configure voice output preferences")).toBeInTheDocument(); + }); + }); + }); +}); diff --git a/apps/web/src/components/speech/SpeechSettings.tsx b/apps/web/src/components/speech/SpeechSettings.tsx new file mode 100644 index 0000000..7cad797 --- /dev/null +++ b/apps/web/src/components/speech/SpeechSettings.tsx @@ -0,0 +1,404 @@ +/** + * SpeechSettings Component + * + * Settings page for configuring speech preferences per workspace. + * Includes STT settings, TTS settings, voice preview, and provider status. + * + * Follows PDA-friendly design: calm colors, no aggressive language. 
+ * + * Issue #404 + */ + +"use client"; + +import { useState, useEffect, useCallback } from "react"; +import type { ReactElement } from "react"; +import { Card, CardHeader, CardContent, CardTitle, CardDescription } from "@/components/ui/card"; +import { Switch } from "@/components/ui/switch"; +import { Label } from "@/components/ui/label"; +import { Button } from "@/components/ui/button"; +import { Slider } from "@/components/ui/slider"; +import { + Select, + SelectTrigger, + SelectValue, + SelectContent, + SelectItem, +} from "@/components/ui/select"; +import { getVoices, getHealthStatus } from "@/lib/api/speech"; +import type { VoiceInfo, HealthResponse } from "@/lib/api/speech"; +import { useTextToSpeech } from "@/hooks/useTextToSpeech"; + +/** Supported languages for STT */ +const STT_LANGUAGES = [ + { value: "en", label: "English" }, + { value: "es", label: "Spanish" }, + { value: "fr", label: "French" }, + { value: "de", label: "German" }, + { value: "it", label: "Italian" }, + { value: "pt", label: "Portuguese" }, + { value: "ja", label: "Japanese" }, + { value: "zh", label: "Chinese" }, + { value: "ko", label: "Korean" }, + { value: "auto", label: "Auto-detect" }, +]; + +/** TTS tier options */ +const TIER_OPTIONS = [ + { value: "default", label: "Default" }, + { value: "premium", label: "Premium" }, + { value: "fallback", label: "Fallback" }, +]; + +/** Sample text for voice preview */ +const PREVIEW_TEXT = "Hello, this is a preview of the selected voice. How does it sound?"; + +/** + * SpeechSettings provides a comprehensive settings interface for + * configuring speech-to-text and text-to-speech preferences. 
+ */ +export function SpeechSettings(): ReactElement { + // STT state + const [sttEnabled, setSttEnabled] = useState(true); + const [sttLanguage, setSttLanguage] = useState("en"); + + // TTS state + const [ttsEnabled, setTtsEnabled] = useState(true); + const [selectedVoice, setSelectedVoice] = useState(""); + const [selectedTier, setSelectedTier] = useState("default"); + const [autoPlay, setAutoPlay] = useState(false); + const [speed, setSpeed] = useState(1.0); + + // Data state + const [voices, setVoices] = useState([]); + const [voicesError, setVoicesError] = useState(null); + const [healthData, setHealthData] = useState(null); + const [healthError, setHealthError] = useState(null); + + // Preview hook + const { + synthesize, + audioUrl, + isLoading: isPreviewLoading, + error: previewError, + } = useTextToSpeech(); + + /** + * Fetch available voices from the API + */ + const fetchVoices = useCallback(async (): Promise => { + try { + setVoicesError(null); + const response = await getVoices(); + setVoices(response.data); + + // Select the first default voice if none selected + if (response.data.length > 0 && !selectedVoice) { + const defaultVoice = response.data.find((v) => v.isDefault); + const firstVoice = response.data[0]; + setSelectedVoice(defaultVoice?.id ?? firstVoice?.id ?? ""); + } + } catch { + setVoicesError("Unable to load voices. Please try again later."); + } + }, [selectedVoice]); + + /** + * Fetch health status from the API + */ + const fetchHealth = useCallback(async (): Promise => { + try { + setHealthError(null); + const response = await getHealthStatus(); + setHealthData(response.data); + } catch { + setHealthError("Unable to check provider status. 
Please try again later."); + } + }, []); + + // Fetch voices and health on mount + useEffect(() => { + void fetchVoices(); + void fetchHealth(); + }, [fetchVoices, fetchHealth]); + + /** + * Handle voice preview test + */ + const handleTestVoice = useCallback(async (): Promise => { + const options: Record = { + speed, + tier: selectedTier, + }; + if (selectedVoice) { + options.voice = selectedVoice; + } + await synthesize(PREVIEW_TEXT, options); + }, [synthesize, selectedVoice, speed, selectedTier]); + + return ( +
+
+

Speech Settings

+

+ Configure voice input and output preferences for your workspace +

+
+ + {/* STT Settings */} + + + Speech-to-Text + Configure voice input preferences + + +
+ {/* Enable STT Toggle */} +
+
+ +

+ Allow voice input for text fields and commands +

+
+ +
+ + {/* Language Preference */} +
+ + +
+
+
+
+ + {/* TTS Settings */} + + + Text-to-Speech + Configure voice output preferences + + +
+ {/* Enable TTS Toggle */} +
+
+ +

+ Allow reading content aloud with synthesized voice +

+
+ +
+ + {/* Default Voice Selector */} +
+ + {voicesError ? ( +

{voicesError}

+ ) : ( + + )} +
+ + {/* Provider Tier Preference */} +
+ +

+ Choose the preferred quality tier for voice synthesis +

+ +
+ + {/* Auto-play Toggle */} +
+
+ +

+ Automatically play TTS responses when received +

+
+ +
+ + {/* Speed Control */} +
+
+ + {speed.toFixed(1)}x +
+ { + const newSpeed = values[0]; + if (newSpeed !== undefined) { + setSpeed(newSpeed); + } + }} + /> +
+ 0.5x + 1.0x + 2.0x +
+
+
+
+
+ + {/* Voice Preview */} + + + Voice Preview + Preview the selected voice with sample text + + +
+

“{PREVIEW_TEXT}”

+ + {previewError &&

{previewError}

} + {audioUrl && ( + + )} +
+
+
+ + {/* Provider Status */} + + + Provider Status + Current availability of speech service providers + + +
+ {healthError ? ( +

{healthError}

+ ) : healthData ? ( + <> + {/* STT Provider */} +
+ Speech-to-Text Provider +
+ {healthData.stt.available ? ( + <> + + Active + + ) : ( + <> + + Inactive + + )} +
+
+ + {/* TTS Provider */} +
+ Text-to-Speech Provider +
+ {healthData.tts.available ? ( + <> + + Active + + ) : ( + <> + + Inactive + + )} +
+
+ + ) : ( +

Checking provider status...

+ )} +
+
+
+
+ ); +} + +export default SpeechSettings; diff --git a/apps/web/src/components/ui/slider.tsx b/apps/web/src/components/ui/slider.tsx new file mode 100644 index 0000000..6241e6a --- /dev/null +++ b/apps/web/src/components/ui/slider.tsx @@ -0,0 +1,55 @@ +import * as React from "react"; + +export interface SliderProps { + id?: string; + min?: number; + max?: number; + step?: number; + value?: number[]; + defaultValue?: number[]; + onValueChange?: (value: number[]) => void; + disabled?: boolean; + className?: string; +} + +export const Slider = React.forwardRef( + ( + { + id, + min = 0, + max = 100, + step = 1, + value, + defaultValue, + onValueChange, + disabled, + className = "", + }, + ref + ) => { + const currentValue = value?.[0] ?? defaultValue?.[0] ?? min; + + return ( + { + onValueChange?.([parseFloat(e.target.value)]); + }} + disabled={disabled} + aria-valuemin={min} + aria-valuemax={max} + aria-valuenow={currentValue} + className={`w-full h-2 rounded-lg appearance-none cursor-pointer bg-gray-200 accent-blue-500 ${className}`} + /> + ); + } +); + +Slider.displayName = "Slider"; diff --git a/apps/web/src/lib/api/speech.ts b/apps/web/src/lib/api/speech.ts index cf5aeef..fc402de 100644 --- a/apps/web/src/lib/api/speech.ts +++ b/apps/web/src/lib/api/speech.ts @@ -6,12 +6,16 @@ import { apiGet } from "./client"; import { API_BASE_URL } from "../config"; +export type SpeechTier = "default" | "premium" | "fallback"; + export interface VoiceInfo { id: string; name: string; language: string; gender?: string; preview_url?: string; + tier?: SpeechTier; + isDefault?: boolean; } export interface SynthesizeOptions { @@ -26,11 +30,31 @@ export interface VoicesResponse { data: VoiceInfo[]; } +export interface ProviderHealth { + available: boolean; +} + +export interface HealthResponse { + data: { + stt: ProviderHealth; + tts: ProviderHealth; + }; +} + /** * Fetch available TTS voices + * Optionally filter by tier (default, premium, fallback) */ -export async function 
getVoices(): Promise { - return apiGet("/api/speech/voices"); +export async function getVoices(tier?: SpeechTier): Promise { + const endpoint = tier ? `/api/speech/voices?tier=${tier}` : "/api/speech/voices"; + return apiGet(endpoint); +} + +/** + * Fetch health status of speech providers (STT and TTS) + */ +export async function getHealthStatus(): Promise { + return apiGet("/api/speech/health"); } /** From 24065aa1999ae9468a0de9016a183be23d5a195f Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:23:22 -0600 Subject: [PATCH 16/19] docs(#406): add speech services documentation Comprehensive documentation for the speech services module: - docs/SPEECH.md: Architecture, API reference, WebSocket protocol, environment variables, provider configuration, Docker setup, GPU VRAM budget, and frontend integration examples - apps/api/src/speech/AGENTS.md: Module structure, provider pattern, how to add new providers, gotchas, and test patterns - README.md: Speech capabilities section with quick start Fixes #406 Co-Authored-By: Claude Opus 4.6 --- README.md | 50 +- apps/api/src/speech/AGENTS.md | 247 +++++++++ docs/SPEECH.md | 929 ++++++++++++++++++++++++++++++++++ 3 files changed, 1213 insertions(+), 13 deletions(-) create mode 100644 apps/api/src/speech/AGENTS.md create mode 100644 docs/SPEECH.md diff --git a/README.md b/README.md index 65b2ab2..a93c803 100644 --- a/README.md +++ b/README.md @@ -19,19 +19,20 @@ Mosaic Stack is a modern, PDA-friendly platform designed to help users manage th ## Technology Stack -| Layer | Technology | -| -------------- | -------------------------------------------- | -| **Frontend** | Next.js 16 + React + TailwindCSS + Shadcn/ui | -| **Backend** | NestJS + Prisma ORM | -| **Database** | PostgreSQL 17 + pgvector | -| **Cache** | Valkey (Redis-compatible) | -| **Auth** | Authentik (OIDC) via BetterAuth | -| **AI** | Ollama (local or remote) | -| **Messaging** | MoltBot (stock + plugins) | -| **Real-time** | WebSockets 
(Socket.io) | -| **Monorepo** | pnpm workspaces + TurboRepo | -| **Testing** | Vitest + Playwright | -| **Deployment** | Docker + docker-compose | +| Layer | Technology | +| -------------- | ---------------------------------------------- | +| **Frontend** | Next.js 16 + React + TailwindCSS + Shadcn/ui | +| **Backend** | NestJS + Prisma ORM | +| **Database** | PostgreSQL 17 + pgvector | +| **Cache** | Valkey (Redis-compatible) | +| **Auth** | Authentik (OIDC) via BetterAuth | +| **AI** | Ollama (local or remote) | +| **Messaging** | MoltBot (stock + plugins) | +| **Real-time** | WebSockets (Socket.io) | +| **Speech** | Speaches (STT) + Kokoro/Chatterbox/Piper (TTS) | +| **Monorepo** | pnpm workspaces + TurboRepo | +| **Testing** | Vitest + Playwright | +| **Deployment** | Docker + docker-compose | ## Quick Start @@ -356,6 +357,29 @@ Mosaic Stack includes a sophisticated agent orchestration system for autonomous See [Agent Orchestration Design](docs/design/agent-orchestration.md) for architecture details. +## Speech Services + +Mosaic Stack includes integrated speech-to-text (STT) and text-to-speech (TTS) capabilities through a modular provider architecture. Each component is optional and independently configurable. 
+ +- **Speech-to-Text** - Transcribe audio files and real-time audio streams using Whisper (via Speaches) +- **Text-to-Speech** - Synthesize speech with 54+ voices across 8 languages (via Kokoro, CPU-based) +- **Premium Voice Cloning** - Clone voices from audio samples with emotion control (via Chatterbox, GPU) +- **Fallback TTS** - Ultra-lightweight CPU fallback for low-resource environments (via Piper/OpenedAI Speech) +- **WebSocket Streaming** - Real-time streaming transcription via Socket.IO `/speech` namespace +- **Automatic Fallback** - TTS tier system with graceful degradation (premium -> default -> fallback) + +**Quick Start:** + +```bash +# Start speech services alongside core stack +make speech-up + +# Or with Docker Compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d +``` + +See [Speech Services Documentation](docs/SPEECH.md) for architecture details, API reference, provider configuration, and deployment options. + ## Current Implementation Status ### ✅ Completed (v0.0.1-0.0.6) diff --git a/apps/api/src/speech/AGENTS.md b/apps/api/src/speech/AGENTS.md new file mode 100644 index 0000000..04b6d97 --- /dev/null +++ b/apps/api/src/speech/AGENTS.md @@ -0,0 +1,247 @@ +# speech — Agent Context + +> Part of the `apps/api/src` layer. Speech-to-text (STT) and text-to-speech (TTS) services. 
+ +## Module Structure + +``` +speech/ +├── speech.module.ts # NestJS module (conditional provider registration) +├── speech.config.ts # Environment validation + typed config (registerAs) +├── speech.config.spec.ts # 51 config validation tests +├── speech.constants.ts # NestJS injection tokens (STT_PROVIDER, TTS_PROVIDERS) +├── speech.controller.ts # REST endpoints (transcribe, synthesize, voices, health) +├── speech.controller.spec.ts # Controller tests +├── speech.service.ts # High-level service with fallback orchestration +├── speech.service.spec.ts # Service tests +├── speech.gateway.ts # WebSocket gateway (/speech namespace) +├── speech.gateway.spec.ts # Gateway tests +├── dto/ +│ ├── transcribe.dto.ts # Transcription request DTO (class-validator) +│ ├── synthesize.dto.ts # Synthesis request DTO (class-validator) +│ └── index.ts # Barrel export +├── interfaces/ +│ ├── speech-types.ts # Shared types (SpeechTier, AudioFormat, options, results) +│ ├── stt-provider.interface.ts # ISTTProvider contract +│ ├── tts-provider.interface.ts # ITTSProvider contract +│ └── index.ts # Barrel export +├── pipes/ +│ ├── audio-validation.pipe.ts # Validates uploaded audio (MIME type, size) +│ ├── audio-validation.pipe.spec.ts +│ ├── text-validation.pipe.ts # Validates TTS text input (non-empty, max length) +│ ├── text-validation.pipe.spec.ts +│ └── index.ts # Barrel export +└── providers/ + ├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic) + ├── base-tts.provider.spec.ts + ├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages) + ├── kokoro-tts.provider.spec.ts + ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control) + ├── chatterbox-tts.provider.spec.ts + ├── piper-tts.provider.ts # Fallback tier (CPU, lightweight, Raspberry Pi) + ├── piper-tts.provider.spec.ts + ├── speaches-stt.provider.ts # STT provider (Whisper via Speaches) + ├── speaches-stt.provider.spec.ts + ├── tts-provider.factory.ts # Factory: 
creates providers from config + └── tts-provider.factory.spec.ts +``` + +## Codebase Patterns + +### Provider Pattern (BaseTTSProvider + Factory) + +All TTS providers extend `BaseTTSProvider`: + +```typescript +export class MyNewProvider extends BaseTTSProvider { + readonly name = "my-provider"; + readonly tier: SpeechTier = "default"; // or "premium" or "fallback" + + constructor(baseURL: string) { + super(baseURL, "default-voice-id", "mp3"); + } + + // Override listVoices() for custom voice catalog + override listVoices(): Promise { ... } + + // Override synthesize() only if non-standard API behavior is needed + // (see ChatterboxTTSProvider for example with extra body params) +} +``` + +The base class handles: + +- OpenAI SDK client creation with custom `baseURL` and `apiKey: "not-needed"` +- Standard `synthesize()` via `client.audio.speech.create()` +- Default `listVoices()` returning just the default voice +- `isHealthy()` via GET to the `/v1/models` endpoint + +### Config Pattern + +Config follows the existing pattern (`auth.config.ts`, `federation.config.ts`): + +- Export `isSttEnabled()`, `isTtsEnabled()`, etc. (boolean checks from env) +- Export `validateSpeechConfig()` (called at module init, throws on missing required vars) +- Export `getSpeechConfig()` (typed config object with defaults) +- Export `speechConfig = registerAs("speech", ...)` for NestJS ConfigModule + +Boolean env parsing: `value === "true" || value === "1"`. No default-true. 
+ +### Conditional Provider Registration + +In `speech.module.ts`: + +- STT provider uses `isSttEnabled()` at module definition time to decide whether to register +- TTS providers use a factory function injected with `ConfigService` +- `@Optional()` decorator on `SpeechService`'s `sttProvider` handles the case where STT is disabled + +### Injection Tokens + +```typescript +// speech.constants.ts +export const STT_PROVIDER = Symbol("STT_PROVIDER"); // ISTTProvider +export const TTS_PROVIDERS = Symbol("TTS_PROVIDERS"); // Map +``` + +### Fallback Chain + +TTS fallback order: `premium` -> `default` -> `fallback` + +- Chain starts at the requested tier and goes downward +- Only tiers that are both enabled AND have a registered provider are attempted +- `ServiceUnavailableException` if all providers fail + +### WebSocket Gateway + +- Separate `/speech` namespace (not on the main gateway) +- Authentication mirrors the main WS gateway pattern (token extraction from handshake) +- One session per client, accumulates audio chunks in memory +- Chunks concatenated and transcribed on `stop-transcription` +- Session cleanup on disconnect + +## How to Add a New TTS Provider + +1. **Create the provider class** in `providers/`: + +```typescript +// providers/my-tts.provider.ts +import { BaseTTSProvider } from "./base-tts.provider"; +import type { SpeechTier } from "../interfaces/speech-types"; + +export class MyTtsProvider extends BaseTTSProvider { + readonly name = "my-provider"; + readonly tier: SpeechTier = "default"; // Choose tier + + constructor(baseURL: string) { + super(baseURL, "default-voice", "mp3"); + } + + override listVoices(): Promise { + // Return your voice catalog + } +} +``` + +2. **Add env vars** to `speech.config.ts`: + - Add enabled check function + - Add URL to validation in `validateSpeechConfig()` + - Add config section in `getSpeechConfig()` + +3. 
**Register in factory** (`tts-provider.factory.ts`): + +```typescript +if (config.tts.myTier.enabled) { + const provider = new MyTtsProvider(config.tts.myTier.url); + providers.set("myTier", provider); +} +``` + +4. **Add env vars** to `.env.example` + +5. **Write tests** following existing patterns (mock OpenAI SDK, test synthesis + listVoices + isHealthy) + +## How to Add a New STT Provider + +1. **Implement `ISTTProvider`** (does not use a base class -- STT has only one implementation currently) +2. **Add config section** similar to `stt` in `speech.config.ts` +3. **Register** in `speech.module.ts` providers array with `STT_PROVIDER` token +4. **Write tests** following `speaches-stt.provider.spec.ts` pattern + +## Common Gotchas + +- **OpenAI SDK `apiKey`**: Self-hosted services do not require an API key. Use `apiKey: "not-needed"` when creating the OpenAI client. +- **`toFile()` import**: The `toFile` helper is imported from `"openai"` (not from a subpath). Used in the STT provider to convert Buffer to a File-like object for multipart upload. +- **Health check URL**: `BaseTTSProvider.isHealthy()` calls `GET /v1/models`. The base URL is expected to end with `/v1`. +- **Voice ID prefix parsing**: Kokoro voice IDs encode language + gender in first two characters. See `parseVoicePrefix()` in `kokoro-tts.provider.ts`. +- **Chatterbox extra body params**: The `reference_audio` (base64) and `exaggeration` fields are passed via the OpenAI SDK by casting the request body. This works because the SDK passes through unknown fields. +- **WebSocket auth**: The gateway checks `auth.token`, then `query.token`, then `Authorization` header (in that order). Match this in test setup. +- **Config validation timing**: `validateSpeechConfig()` runs at module init (`onModuleInit`), not at provider construction. This means a misconfigured provider will fail at startup, not at first request. + +## Test Patterns + +### Mocking OpenAI SDK + +All provider tests mock the OpenAI SDK. 
Pattern: + +```typescript +vi.mock("openai", () => ({ + default: vi.fn().mockImplementation(() => ({ + audio: { + speech: { + create: vi.fn().mockResolvedValue({ + arrayBuffer: () => Promise.resolve(new ArrayBuffer(10)), + }), + }, + transcriptions: { + create: vi.fn().mockResolvedValue({ + text: "transcribed text", + language: "en", + duration: 3.5, + }), + }, + }, + models: { list: vi.fn().mockResolvedValue({ data: [] }) }, + })), +})); +``` + +### Mocking Config Injection + +```typescript +const mockConfig: SpeechConfig = { + stt: { enabled: true, baseUrl: "http://test:8000/v1", model: "test-model", language: "en" }, + tts: { + default: { enabled: true, url: "http://test:8880/v1", voice: "af_heart", format: "mp3" }, + premium: { enabled: false, url: "" }, + fallback: { enabled: false, url: "" }, + }, + limits: { maxUploadSize: 25000000, maxDurationSeconds: 600, maxTextLength: 4096 }, +}; +``` + +### Config Test Pattern + +`speech.config.spec.ts` saves and restores `process.env` around each test: + +```typescript +let savedEnv: NodeJS.ProcessEnv; +beforeEach(() => { + savedEnv = { ...process.env }; +}); +afterEach(() => { + process.env = savedEnv; +}); +``` + +## Key Files + +| File | Purpose | +| ----------------------------------- | ------------------------------------------------------------------------ | +| `speech.module.ts` | Module registration with conditional providers | +| `speech.config.ts` | All speech env vars + validation (51 tests) | +| `speech.service.ts` | Core service: transcribe, synthesize (with fallback), listVoices | +| `speech.controller.ts` | REST endpoints: POST transcribe, POST synthesize, GET voices, GET health | +| `speech.gateway.ts` | WebSocket streaming transcription (/speech namespace) | +| `providers/base-tts.provider.ts` | Abstract base for all TTS providers (OpenAI SDK wrapper) | +| `providers/tts-provider.factory.ts` | Creates provider instances from config | +| `interfaces/speech-types.ts` | All shared types: SpeechTier, 
AudioFormat, options, results | diff --git a/docs/SPEECH.md b/docs/SPEECH.md new file mode 100644 index 0000000..3ea7dd4 --- /dev/null +++ b/docs/SPEECH.md @@ -0,0 +1,929 @@ +# Speech Services + +Mosaic Stack provides integrated speech-to-text (STT) and text-to-speech (TTS) services through a provider abstraction layer. Speech services are optional and modular -- each component can be independently enabled, disabled, or pointed at external infrastructure. + +## Table of Contents + +- [Architecture Overview](#architecture-overview) +- [Provider Abstraction](#provider-abstraction) +- [TTS Tier System and Fallback Chain](#tts-tier-system-and-fallback-chain) +- [API Endpoint Reference](#api-endpoint-reference) +- [WebSocket Streaming Protocol](#websocket-streaming-protocol) +- [Environment Variable Reference](#environment-variable-reference) +- [Provider Configuration](#provider-configuration) +- [Voice Cloning Setup (Chatterbox)](#voice-cloning-setup-chatterbox) +- [Docker Compose Setup](#docker-compose-setup) +- [GPU VRAM Budget](#gpu-vram-budget) +- [Frontend Integration](#frontend-integration) + +--- + +## Architecture Overview + +``` + +-------------------+ + | SpeechController | + | (REST endpoints) | + +--------+----------+ + | + +--------------+--------------+ + | SpeechService | + | (provider selection, | + | fallback orchestration) | + +---------+----------+---------+ + | | + +------------+ +-----+-------+ + | | | + +------+------+ +-----+-----+ +-----+-----+ + | STT Provider| |TTS Provider| |TTS Provider| + | (Speaches) | |Map | |Map | + +------+------+ +-----+-----+ +-----+-----+ + | | | + +------+------+ +-----+-----+ +-----+-----+ + | Speaches | | Kokoro | | Chatterbox | + | (Whisper) | | (default) | | (premium) | + +-------------+ +-----------+ +-----+------+ + | + +-----+-----+ + | Piper | + | (fallback)| + +-----------+ + + +-------------------+ + | SpeechGateway | + | (WebSocket /speech) + +--------+----------+ + | + Uses SpeechService.transcribe() 
+``` + +The speech module (`apps/api/src/speech/`) is a self-contained NestJS module consisting of: + +| Component | File | Purpose | +| ---------- | ---------------------- | ------------------------------------------ | +| Module | `speech.module.ts` | Registers providers, controllers, gateway | +| Config | `speech.config.ts` | Environment validation and typed config | +| Service | `speech.service.ts` | High-level speech operations with fallback | +| Controller | `speech.controller.ts` | REST API endpoints | +| Gateway | `speech.gateway.ts` | WebSocket streaming transcription | +| Constants | `speech.constants.ts` | NestJS injection tokens | + +### Key Design Decisions + +1. **OpenAI-compatible APIs**: All providers (Speaches, Kokoro, Chatterbox, Piper/OpenedAI) expose OpenAI-compatible endpoints. The official OpenAI SDK is used as the HTTP client with a custom `baseURL`. + +2. **Provider abstraction**: STT and TTS providers implement well-defined interfaces (`ISTTProvider`, `ITTSProvider`). New providers can be added without modifying the service layer. + +3. **Conditional registration**: Providers are only instantiated when their corresponding `*_ENABLED` flag is `true`. The STT provider uses NestJS `@Optional()` injection. + +4. **Fail-fast validation**: Configuration is validated at module initialization. If a service is enabled but its URL is missing, the application fails on startup with a descriptive error. + +--- + +## Provider Abstraction + +### STT Provider Interface + +```typescript +interface ISTTProvider { + readonly name: string; + transcribe(audio: Buffer, options?: TranscribeOptions): Promise; + isHealthy(): Promise; +} +``` + +Currently implemented by `SpeachesSttProvider` which connects to a Speaches (faster-whisper) server. 
+ +### TTS Provider Interface + +```typescript +interface ITTSProvider { + readonly name: string; + readonly tier: SpeechTier; + synthesize(text: string, options?: SynthesizeOptions): Promise; + listVoices(): Promise; + isHealthy(): Promise; +} +``` + +All TTS providers extend `BaseTTSProvider`, an abstract class that implements common OpenAI-compatible synthesis logic. Concrete providers only need to set `name` and `tier` and optionally override `listVoices()` or `synthesize()`. + +### Provider Registration + +Providers are created by the `TTS Provider Factory` (`providers/tts-provider.factory.ts`) based on configuration: + +| Tier | Provider Class | Engine | Requirements | +| ---------- | ----------------------- | ------------------------- | ------------ | +| `default` | `KokoroTtsProvider` | Kokoro-FastAPI | CPU only | +| `premium` | `ChatterboxTTSProvider` | Chatterbox TTS Server | NVIDIA GPU | +| `fallback` | `PiperTtsProvider` | Piper via OpenedAI Speech | CPU only | + +--- + +## TTS Tier System and Fallback Chain + +TTS uses a tiered architecture with automatic fallback: + +``` +Request with tier="premium" + | + v +[premium] Chatterbox available? --yes--> Use Chatterbox + | | + no (success/fail) + | + v +[default] Kokoro available? ------yes--> Use Kokoro + | | + no (success/fail) + | + v +[fallback] Piper available? -----yes--> Use Piper + | | + no (success/fail) + | + v +ServiceUnavailableException +``` + +**Fallback order:** `premium` -> `default` -> `fallback` + +The fallback chain starts from the requested tier and proceeds downward. A tier is only attempted if: + +1. It is enabled in configuration (`TTS_ENABLED`, `TTS_PREMIUM_ENABLED`, `TTS_FALLBACK_ENABLED`) +2. A provider is registered for that tier + +If no tier is specified in the request, `default` is used as the starting point. 
+ +--- + +## API Endpoint Reference + +All speech endpoints are under `/api/speech/` and require authentication (Bearer token) plus workspace context (`x-workspace-id` header). + +### POST /api/speech/transcribe + +Transcribe an uploaded audio file to text. + +**Authentication:** Bearer token + workspace membership +**Content-Type:** `multipart/form-data` + +**Form Fields:** + +| Field | Type | Required | Description | +| ------------- | ------ | -------- | ------------------------------------------------------ | +| `file` | File | Yes | Audio file (max 25 MB) | +| `language` | string | No | Language code (e.g., "en", "fr"). Default: from config | +| `model` | string | No | Whisper model override. Default: from config | +| `prompt` | string | No | Prompt to guide transcription (max 1000 chars) | +| `temperature` | number | No | Temperature 0.0-1.0. Lower = more deterministic | + +**Accepted Audio Formats:** +`audio/wav`, `audio/mp3`, `audio/mpeg`, `audio/webm`, `audio/ogg`, `audio/flac`, `audio/x-m4a` + +**Response:** + +```json +{ + "data": { + "text": "Hello, this is a transcription test.", + "language": "en", + "durationSeconds": 3.5, + "confidence": 0.95, + "segments": [ + { + "text": "Hello, this is a transcription test.", + "start": 0.0, + "end": 3.5, + "confidence": 0.95 + } + ] + } +} +``` + +**Example:** + +```bash +curl -X POST http://localhost:3001/api/speech/transcribe \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" \ + -F "file=@recording.wav" \ + -F "language=en" +``` + +### POST /api/speech/synthesize + +Synthesize text to audio using TTS providers. + +**Authentication:** Bearer token + workspace membership +**Content-Type:** `application/json` + +**Request Body:** + +| Field | Type | Required | Description | +| -------- | ------ | -------- | ----------------------------------------------------------- | +| `text` | string | Yes | Text to synthesize (max 4096 chars) | +| `voice` | string | No | Voice ID. 
Default: from config (e.g., "af_heart") | +| `speed` | number | No | Speed multiplier 0.5-2.0. Default: 1.0 | +| `format` | string | No | Output format: mp3, wav, opus, flac, aac, pcm. Default: mp3 | +| `tier` | string | No | Provider tier: default, premium, fallback. Default: default | + +**Response:** Binary audio data with appropriate `Content-Type` header. + +| Format | Content-Type | +| ------ | ------------ | +| mp3 | `audio/mpeg` | +| wav | `audio/wav` | +| opus | `audio/opus` | +| flac | `audio/flac` | +| aac | `audio/aac` | +| pcm | `audio/pcm` | + +**Example:** + +```bash +curl -X POST http://localhost:3001/api/speech/synthesize \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" \ + -H "Content-Type: application/json" \ + -d '{"text": "Hello world", "voice": "af_heart", "format": "mp3"}' \ + --output speech.mp3 +``` + +### GET /api/speech/voices + +List available TTS voices across all tiers. + +**Authentication:** Bearer token + workspace access +**Query Parameters:** + +| Parameter | Type | Required | Description | +| --------- | ------ | -------- | ------------------------------------------ | +| `tier` | string | No | Filter by tier: default, premium, fallback | + +**Response:** + +```json +{ + "data": [ + { + "id": "af_heart", + "name": "Heart (American Female)", + "language": "en-US", + "tier": "default", + "isDefault": true + }, + { + "id": "am_adam", + "name": "Adam (American Male)", + "language": "en-US", + "tier": "default", + "isDefault": false + } + ] +} +``` + +**Example:** + +```bash +curl -X GET 'http://localhost:3001/api/speech/voices?tier=default' \ + -H "Authorization: Bearer YOUR_TOKEN" \ + -H "x-workspace-id: WORKSPACE_ID" +``` + +### GET /api/speech/health + +Check availability of STT and TTS providers. 
+ +**Authentication:** Bearer token + workspace access + +**Response:** + +```json +{ + "data": { + "stt": { "available": true }, + "tts": { "available": true } + } +} +``` + +--- + +## WebSocket Streaming Protocol + +The speech module provides a WebSocket gateway at namespace `/speech` for real-time streaming transcription. Audio chunks are accumulated on the server and transcribed when the session is stopped. + +### Connection + +Connect to the `/speech` namespace with authentication: + +```typescript +import { io } from "socket.io-client"; + +const socket = io("http://localhost:3001/speech", { + auth: { token: "YOUR_SESSION_TOKEN" }, +}); +``` + +**Authentication methods** (checked in order): + +1. `auth.token` in handshake +2. `query.token` in handshake URL +3. `Authorization: Bearer ` header + +Connection is rejected if: + +- No valid token is provided +- Session verification fails +- User has no workspace membership + +**Connection timeout:** 5 seconds for authentication. + +### Protocol Flow + +``` +Client Server + | | + |--- connect (with token) ----->| + | | (authenticate, check workspace) + |<--- connected ----------------| + | | + |--- start-transcription ------>| { language?: "en" } + |<--- transcription-started ----| { sessionId, language } + | | + |--- audio-chunk -------------->| (Buffer/Uint8Array) + |--- audio-chunk -------------->| (Buffer/Uint8Array) + |--- audio-chunk -------------->| (Buffer/Uint8Array) + | | + |--- stop-transcription ------->| + | | (concatenate chunks, transcribe) + |<--- transcription-final ------| { text, language, durationSeconds, ... 
} + | | +``` + +### Client Events (emit) + +| Event | Payload | Description | +| --------------------- | ------------------------ | ---------------------------------------- | +| `start-transcription` | `{ language?: string }` | Begin a new transcription session | +| `audio-chunk` | `Buffer` or `Uint8Array` | Send audio data chunk | +| `stop-transcription` | (none) | Stop recording and trigger transcription | + +### Server Events (listen) + +| Event | Payload | Description | +| ----------------------- | ----------------------------------------------------------- | -------------------------- | +| `transcription-started` | `{ sessionId, language }` | Session created | +| `transcription-final` | `{ text, language, durationSeconds, confidence, segments }` | Transcription result | +| `transcription-error` | `{ message }` | Error during transcription | + +### Session Management + +- One active transcription session per client connection +- Starting a new session replaces any existing session +- Sessions are cleaned up on client disconnect +- Audio chunks are accumulated in memory +- Total accumulated size is capped by `SPEECH_MAX_UPLOAD_SIZE` (default: 25 MB) + +### Example Client Usage + +```typescript +import { io } from "socket.io-client"; + +const socket = io("http://localhost:3001/speech", { + auth: { token: sessionToken }, +}); + +// Start recording +socket.emit("start-transcription", { language: "en" }); + +socket.on("transcription-started", ({ sessionId }) => { + console.log("Session started:", sessionId); +}); + +// Stream audio chunks from MediaRecorder +mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + event.data.arrayBuffer().then((buffer) => { + socket.emit("audio-chunk", new Uint8Array(buffer)); + }); + } +}; + +// Stop and get result +socket.emit("stop-transcription"); + +socket.on("transcription-final", (result) => { + console.log("Transcription:", result.text); + console.log("Duration:", result.durationSeconds, "seconds"); +}); + 
+socket.on("transcription-error", ({ message }) => { + console.error("Transcription error:", message); +}); +``` + +--- + +## Environment Variable Reference + +### Speech-to-Text (STT) + +| Variable | Default | Description | +| -------------- | --------------------------------------- | ---------------------------------------------------- | +| `STT_ENABLED` | `false` | Enable speech-to-text transcription | +| `STT_BASE_URL` | `http://speaches:8000/v1` | Speaches server URL (required when STT_ENABLED=true) | +| `STT_MODEL` | `Systran/faster-whisper-large-v3-turbo` | Whisper model for transcription | +| `STT_LANGUAGE` | `en` | Default language code | + +### Text-to-Speech (TTS) - Default Engine (Kokoro) + +| Variable | Default | Description | +| -------------------- | --------------------------- | --------------------------------------------------- | +| `TTS_ENABLED` | `false` | Enable default TTS engine | +| `TTS_DEFAULT_URL` | `http://kokoro-tts:8880/v1` | Kokoro-FastAPI URL (required when TTS_ENABLED=true) | +| `TTS_DEFAULT_VOICE` | `af_heart` | Default Kokoro voice ID | +| `TTS_DEFAULT_FORMAT` | `mp3` | Default audio output format | + +### Text-to-Speech (TTS) - Premium Engine (Chatterbox) + +| Variable | Default | Description | +| --------------------- | ------------------------------- | ----------------------------------------------------------- | +| `TTS_PREMIUM_ENABLED` | `false` | Enable premium TTS engine | +| `TTS_PREMIUM_URL` | `http://chatterbox-tts:8881/v1` | Chatterbox TTS URL (required when TTS_PREMIUM_ENABLED=true) | + +### Text-to-Speech (TTS) - Fallback Engine (Piper/OpenedAI) + +| Variable | Default | Description | +| ---------------------- | -------------------------------- | ------------------------------------------------------------- | +| `TTS_FALLBACK_ENABLED` | `false` | Enable fallback TTS engine | +| `TTS_FALLBACK_URL` | `http://openedai-speech:8000/v1` | OpenedAI Speech URL (required when TTS_FALLBACK_ENABLED=true) | + +### Service Limits 
+ +| Variable | Default | Description | +| ----------------------------- | ---------- | ---------------------------------------------- | +| `SPEECH_MAX_UPLOAD_SIZE` | `25000000` | Maximum upload file size in bytes (25 MB) | +| `SPEECH_MAX_DURATION_SECONDS` | `600` | Maximum audio duration in seconds (10 minutes) | +| `SPEECH_MAX_TEXT_LENGTH` | `4096` | Maximum text length for TTS in characters | + +### Conditional Validation + +When a service is enabled, its URL variable is required. If missing, the application fails at startup with a message like: + +``` +STT is enabled (STT_ENABLED=true) but required environment variables are missing or empty: STT_BASE_URL. +Either set these variables or disable by setting STT_ENABLED=false. +``` + +Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values default to `false`. + +--- + +## Provider Configuration + +### Kokoro (Default Tier) + +**Engine:** [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI) +**License:** Apache 2.0 +**Requirements:** CPU only +**Docker Image:** `ghcr.io/remsky/kokoro-fastapi:latest-cpu` + +**Capabilities:** + +- 54 built-in voices across 8 languages +- Speed control: 0.25x to 4.0x +- Output formats: mp3, wav, opus, flac +- Voice metadata derived from ID prefix (language, gender, accent) + +**Voice ID Format:** `{lang}{gender}_{name}` + +- First character: language/accent (a=American, b=British, e=Spanish, f=French, h=Hindi, j=Japanese, p=Portuguese, z=Chinese) +- Second character: gender (f=Female, m=Male) + +**Example voices:** +| Voice ID | Name | Language | Gender | +|----------|------|----------|--------| +| `af_heart` | Heart | en-US | Female | +| `am_adam` | Adam | en-US | Male | +| `bf_alice` | Alice | en-GB | Female | +| `bm_daniel` | Daniel | en-GB | Male | +| `ef_dora` | Dora | es | Female | +| `ff_camille` | Camille | fr | Female | +| `jf_alpha` | Alpha | ja | Female | +| `zf_xiaobei` | Xiaobei | zh | Female | + +### Chatterbox (Premium Tier) + +**Engine:** 
[Chatterbox TTS Server](https://github.com/devnen/chatterbox-tts-server) +**License:** Proprietary +**Requirements:** NVIDIA GPU with CUDA +**Docker Image:** `devnen/chatterbox-tts-server:latest` + +**Capabilities:** + +- Voice cloning via reference audio sample +- Emotion exaggeration control (0.0 - 1.0) +- Cross-language voice transfer (23 languages) +- Higher quality synthesis than default tier + +**Supported Languages:** +en, fr, de, es, it, pt, nl, pl, ru, uk, ja, zh, ko, ar, hi, tr, sv, da, fi, no, cs, el, ro + +**Extended Options (Chatterbox-specific):** + +| Option | Type | Description | +| --------------------- | ------ | --------------------------------------------------------- | +| `referenceAudio` | Buffer | Audio sample for voice cloning (5-30 seconds recommended) | +| `emotionExaggeration` | number | Emotion intensity 0.0-1.0 (clamped) | + +These are passed as extra body parameters to the OpenAI-compatible endpoint. Reference audio is base64-encoded before sending. + +### Piper (Fallback Tier) + +**Engine:** [Piper](https://github.com/rhasspy/piper) via [OpenedAI Speech](https://github.com/matatonic/openedai-speech) +**License:** GPL (OpenedAI Speech) +**Requirements:** CPU only (runs on Raspberry Pi) +**Docker Image:** Use OpenedAI Speech image + +**Capabilities:** + +- 100+ voices across 40+ languages +- 6 standard OpenAI voice names (mapped to Piper voices) +- Output formats: mp3, wav, opus, flac +- Ultra-lightweight, designed for low-resource environments + +**Standard Voice Mapping:** + +| OpenAI Voice | Piper Voice | Gender | Description | +| ------------ | -------------------- | ------ | --------------------- | +| `alloy` | en_US-amy-medium | Female | Warm, balanced | +| `echo` | en_US-ryan-medium | Male | Clear, articulate | +| `fable` | en_GB-alan-medium | Male | British narrator | +| `onyx` | en_US-danny-low | Male | Deep, resonant | +| `nova` | en_US-lessac-medium | Female | Expressive, versatile | +| `shimmer` | en_US-kristin-medium | 
Female | Bright, energetic | + +### Speaches (STT) + +**Engine:** [Speaches](https://github.com/speaches-ai/speaches) (faster-whisper backend) +**License:** MIT +**Requirements:** CPU (GPU optional for faster inference) +**Docker Image:** `ghcr.io/speaches-ai/speaches:latest` + +**Capabilities:** + +- OpenAI-compatible `/v1/audio/transcriptions` endpoint +- Whisper models via faster-whisper +- Verbose JSON response with segments and timestamps +- Language detection + +**Default model:** `Systran/faster-whisper-large-v3-turbo` + +--- + +## Voice Cloning Setup (Chatterbox) + +Voice cloning is available through the Chatterbox premium TTS provider. + +### Prerequisites + +1. NVIDIA GPU with CUDA support +2. `nvidia-container-toolkit` installed on the Docker host +3. Docker runtime configured for GPU access +4. TTS premium tier enabled (`TTS_PREMIUM_ENABLED=true`) + +### Basic Voice Cloning + +Provide a reference audio sample (WAV or MP3, 5-30 seconds) when calling synthesize: + +```typescript +import { SpeechService } from "./speech.service"; +import type { ChatterboxSynthesizeOptions } from "./interfaces/speech-types"; + +const options: ChatterboxSynthesizeOptions = { + tier: "premium", + referenceAudio: myAudioBuffer, // 5-30 second audio sample + emotionExaggeration: 0.5, // 0.0 = neutral, 1.0 = maximum emotion +}; + +const result = await speechService.synthesize("Hello, this is my cloned voice!", options); +``` + +### Voice Cloning Tips + +- **Audio quality:** Use clean recordings without background noise +- **Duration:** 5-30 seconds works best; shorter clips may produce lower quality +- **Format:** WAV provides the best quality; MP3 is also accepted +- **Emotion:** Start with 0.5 (moderate) and adjust from there +- **Cross-language:** You can clone a voice in one language and synthesize in another + +--- + +## Docker Compose Setup + +### Development (Local) + +Speech services are defined in a separate overlay file `docker-compose.speech.yml`. 
This keeps them optional and separate from core services. + +**Start basic speech services (STT + default TTS):** + +```bash +# Using docker compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d + +# Using Makefile +make speech-up +``` + +**Start with premium TTS (requires NVIDIA GPU):** + +```bash +docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d +``` + +**Stop speech services:** + +```bash +# Using docker compose directly +docker compose -f docker-compose.yml -f docker-compose.speech.yml down --remove-orphans + +# Using Makefile +make speech-down +``` + +**View logs:** + +```bash +make speech-logs +``` + +### Development Services + +| Service | Container | Port | Image | +| -------------- | --------------------- | ------------------------------- | ------------------------------------------ | +| Speaches (STT) | mosaic-speaches | 8090 (host) -> 8000 (container) | `ghcr.io/speaches-ai/speaches:latest` | +| Kokoro TTS | mosaic-kokoro-tts | 8880 (host) -> 8880 (container) | `ghcr.io/remsky/kokoro-fastapi:latest-cpu` | +| Chatterbox TTS | mosaic-chatterbox-tts | 8881 (host) -> 8000 (container) | `devnen/chatterbox-tts-server:latest` | + +### Production (Docker Swarm) + +For production deployments, use `docker/docker-compose.sample.speech.yml`. This file is designed for Docker Swarm with Traefik integration. 
+ +**Required environment variables:** + +```bash +STT_DOMAIN=stt.example.com +TTS_DOMAIN=tts.example.com +``` + +**Optional environment variables:** + +```bash +WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo +CHATTERBOX_TTS_DOMAIN=tts-premium.example.com +TRAEFIK_ENTRYPOINT=websecure +TRAEFIK_CERTRESOLVER=letsencrypt +TRAEFIK_DOCKER_NETWORK=traefik-public +TRAEFIK_TLS_ENABLED=true +``` + +**Deploy:** + +```bash +docker stack deploy -c docker/docker-compose.sample.speech.yml speech +``` + +**Connecting to Mosaic Stack:** Set the speech URLs in your Mosaic Stack `.env`: + +```bash +# Same Docker network +STT_BASE_URL=http://speaches:8000/v1 +TTS_DEFAULT_URL=http://kokoro-tts:8880/v1 + +# External / different network +STT_BASE_URL=https://stt.example.com/v1 +TTS_DEFAULT_URL=https://tts.example.com/v1 +``` + +### Health Checks + +All speech containers include health checks: + +| Service | Endpoint | Interval | Start Period | +| -------------- | ------------------------------ | -------- | ------------ | +| Speaches | `http://localhost:8000/health` | 30s | 120s | +| Kokoro TTS | `http://localhost:8880/health` | 30s | 120s | +| Chatterbox TTS | `http://localhost:8000/health` | 30s | 180s | + +Chatterbox has a longer start period (180s) because GPU model loading takes additional time. + +--- + +## GPU VRAM Budget + +Only Chatterbox requires GPU resources. The other providers (Speaches, Kokoro, Piper) are CPU-only. 
+ +### Chatterbox VRAM Requirements + +| Component | Approximate VRAM | +| ----------------------- | ------------------ | +| Chatterbox TTS model | ~2-4 GB | +| Voice cloning inference | ~1-2 GB additional | +| **Total recommended** | **4-6 GB** | + +### Shared GPU Considerations + +If running multiple GPU services (e.g., Ollama for LLM + Chatterbox for TTS): + +| Service | VRAM Usage | Notes | +| -------------------- | ----------- | --------------------------------- | +| Ollama (7B model) | ~4-6 GB | Depends on model size | +| Ollama (13B model) | ~8-10 GB | Larger models need more | +| Chatterbox TTS | ~4-6 GB | Voice cloning is memory-intensive | +| **Combined minimum** | **8-12 GB** | For 7B LLM + Chatterbox | + +**Recommendations:** + +- 8 GB VRAM: Adequate for small LLM + Chatterbox (may need to alternate) +- 12 GB VRAM: Comfortable for 7B LLM + Chatterbox simultaneously +- 24 GB VRAM: Supports larger LLMs + Chatterbox with headroom + +If VRAM is limited, consider: + +1. Disabling Chatterbox (`TTS_PREMIUM_ENABLED=false`) and using Kokoro (CPU) as default +2. Using the fallback chain so Kokoro handles requests when Chatterbox is busy +3. Running Chatterbox on a separate GPU host + +### Docker Swarm GPU Scheduling + +For Docker Swarm deployments with GPU, configure generic resources on the node: + +```json +// /etc/docker/daemon.json +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime" + } + }, + "node-generic-resources": ["NVIDIA-GPU=0"] +} +``` + +See the [Docker GPU Swarm documentation](https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm) for details. + +--- + +## Frontend Integration + +Speech services are consumed from the frontend through the REST API and WebSocket gateway. 
+ +### REST API Usage + +**Transcribe audio:** + +```typescript +async function transcribeAudio(file: File, token: string, workspaceId: string) { + const formData = new FormData(); + formData.append("file", file); + formData.append("language", "en"); + + const response = await fetch("/api/speech/transcribe", { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + body: formData, + }); + + const { data } = await response.json(); + return data.text; +} +``` + +**Synthesize speech:** + +```typescript +async function synthesizeSpeech( + text: string, + token: string, + workspaceId: string, + voice = "af_heart" +) { + const response = await fetch("/api/speech/synthesize", { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + "Content-Type": "application/json", + }, + body: JSON.stringify({ text, voice, format: "mp3" }), + }); + + const audioBlob = await response.blob(); + const audioUrl = URL.createObjectURL(audioBlob); + const audio = new Audio(audioUrl); + audio.play(); +} +``` + +**List voices:** + +```typescript +async function listVoices(token: string, workspaceId: string, tier?: string) { + const url = tier ? 
`/api/speech/voices?tier=${tier}` : "/api/speech/voices"; + + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + }); + + const { data } = await response.json(); + return data; // VoiceInfo[] +} +``` + +### WebSocket Streaming Usage + +For real-time transcription using the browser's MediaRecorder API: + +```typescript +import { io } from "socket.io-client"; + +function createSpeechSocket(token: string) { + const socket = io("/speech", { + auth: { token }, + }); + + let mediaRecorder: MediaRecorder | null = null; + + async function startRecording() { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaRecorder = new MediaRecorder(stream, { + mimeType: "audio/webm;codecs=opus", + }); + + socket.emit("start-transcription", { language: "en" }); + + mediaRecorder.ondataavailable = (event) => { + if (event.data.size > 0) { + event.data.arrayBuffer().then((buffer) => { + socket.emit("audio-chunk", new Uint8Array(buffer)); + }); + } + }; + + mediaRecorder.start(250); // Send chunks every 250ms + } + + async function stopRecording(): Promise { + return new Promise((resolve, reject) => { + socket.once("transcription-final", (result) => { + resolve(result.text); + }); + + socket.once("transcription-error", ({ message }) => { + reject(new Error(message)); + }); + + if (mediaRecorder) { + mediaRecorder.stop(); + mediaRecorder.stream.getTracks().forEach((track) => track.stop()); + mediaRecorder = null; + } + + socket.emit("stop-transcription"); + }); + } + + return { socket, startRecording, stopRecording }; +} +``` + +### Check Speech Availability + +Before showing speech UI elements, check provider availability: + +```typescript +async function checkSpeechHealth(token: string, workspaceId: string) { + const response = await fetch("/api/speech/health", { + headers: { + Authorization: `Bearer ${token}`, + "x-workspace-id": workspaceId, + }, + }); + + const { data } = await 
response.json(); + return { + canTranscribe: data.stt.available, + canSynthesize: data.tts.available, + }; +} +``` From d2c76024304ee56957c4cf94cb2ffbdf2fe2f25f Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:26:05 -0600 Subject: [PATCH 17/19] test(#405): add E2E integration tests for speech services Adds comprehensive integration tests covering all 9 required scenarios: 1. REST transcription (POST /speech/transcribe) 2. REST synthesis (POST /speech/synthesize) 3. Provider fallback (premium -> default -> fallback chain) 4. WebSocket streaming transcription lifecycle 5. Audio MIME type validation (reject invalid formats) 6. File size limit enforcement (25 MB max) 7. Authentication on all endpoints (401 without token) 8. Voice listing with tier filtering (GET /speech/voices) 9. Health check status (GET /speech/health) Uses NestJS testing module with mocked providers (CI-compatible). 30 test cases, all passing. Fixes #405 --- .../api/src/speech/speech.integration.spec.ts | 933 ++++++++++++++++++ 1 file changed, 933 insertions(+) create mode 100644 apps/api/src/speech/speech.integration.spec.ts diff --git a/apps/api/src/speech/speech.integration.spec.ts b/apps/api/src/speech/speech.integration.spec.ts new file mode 100644 index 0000000..033a4e9 --- /dev/null +++ b/apps/api/src/speech/speech.integration.spec.ts @@ -0,0 +1,933 @@ +/** + * Speech Services E2E Integration Tests + * + * Tests the full speech pipeline from API endpoints through to mocked external providers. + * Covers REST transcription, synthesis, provider fallback, WebSocket streaming, + * audio validation, file size limits, authentication, voice listing, and health checks. + * + * Uses NestJS testing module with supertest for HTTP testing and direct gateway + * invocation for WebSocket streaming tests. 
+ * + * Issue #405 + */ + +import { describe, it, expect, beforeAll, beforeEach, afterAll, vi } from "vitest"; +import { Test } from "@nestjs/testing"; +import { + type INestApplication, + type CanActivate, + type ExecutionContext, + UnauthorizedException, + ValidationPipe, +} from "@nestjs/common"; +import request from "supertest"; +import type { App } from "supertest/types"; + +import { SpeechController } from "./speech.controller"; +import { SpeechService } from "./speech.service"; +import { SpeechGateway } from "./speech.gateway"; +import { STT_PROVIDER, TTS_PROVIDERS } from "./speech.constants"; +import { speechConfig } from "./speech.config"; +import type { SpeechConfig } from "./speech.config"; +import type { ISTTProvider } from "./interfaces/stt-provider.interface"; +import type { ITTSProvider } from "./interfaces/tts-provider.interface"; +import type { + TranscriptionResult, + SynthesisResult, + VoiceInfo, + SpeechTier, +} from "./interfaces/speech-types"; +import { AuthGuard } from "../auth/guards/auth.guard"; +import { WorkspaceGuard, PermissionGuard } from "../common/guards"; +import { AuthService } from "../auth/auth.service"; +import { PrismaService } from "../prisma/prisma.service"; + +// ========================================== +// Test Fixtures +// ========================================== + +/** + * Small WAV file header (44 bytes) + minimal data. + * Not a real audio file, but has the correct structure for testing. 
+ */ +const TEST_AUDIO_BUFFER = Buffer.alloc(1024, 0); + +const MOCK_WORKSPACE_ID = "550e8400-e29b-41d4-a716-446655440001"; +const MOCK_USER_ID = "550e8400-e29b-41d4-a716-446655440002"; + +const MOCK_USER = { + id: MOCK_USER_ID, + email: "test@example.com", + name: "Test User", + workspaceId: MOCK_WORKSPACE_ID, +}; + +const MOCK_TRANSCRIPTION_RESULT: TranscriptionResult = { + text: "Hello, this is a test transcription.", + language: "en", + durationSeconds: 3.2, + confidence: 0.97, + segments: [ + { text: "Hello, this is a test transcription.", start: 0, end: 3.2, confidence: 0.97 }, + ], +}; + +const MOCK_SYNTHESIS_RESULT: SynthesisResult = { + audio: Buffer.from("fake-synthesized-audio-data-mp3"), + format: "mp3", + voice: "af_heart", + tier: "default" as SpeechTier, + durationSeconds: 2.1, +}; + +const MOCK_VOICES: VoiceInfo[] = [ + { id: "af_heart", name: "Heart", language: "en", tier: "default", isDefault: true }, + { id: "af_sky", name: "Sky", language: "en", tier: "default", isDefault: false }, + { + id: "chatterbox-default", + name: "Chatterbox", + language: "en", + tier: "premium", + isDefault: true, + }, +]; + +const MOCK_SPEECH_CONFIG: SpeechConfig = { + stt: { + enabled: true, + baseUrl: "http://speaches:8000/v1", + model: "test-model", + language: "en", + }, + tts: { + default: { enabled: true, url: "http://kokoro:8880/v1", voice: "af_heart", format: "mp3" }, + premium: { enabled: true, url: "http://chatterbox:8881/v1" }, + fallback: { enabled: true, url: "http://openedai:8000/v1" }, + }, + limits: { + maxUploadSize: 25_000_000, + maxDurationSeconds: 600, + maxTextLength: 4096, + }, +}; + +// ========================================== +// Mock Providers +// ========================================== + +function createMockSTTProvider(): ISTTProvider { + return { + name: "mock-stt", + transcribe: vi.fn().mockResolvedValue(MOCK_TRANSCRIPTION_RESULT), + isHealthy: vi.fn().mockResolvedValue(true), + }; +} + +function createMockTTSProvider(tier: SpeechTier, 
name: string): ITTSProvider {
+  const voices = MOCK_VOICES.filter((v) => v.tier === tier);
+  return {
+    name,
+    tier,
+    synthesize: vi.fn().mockResolvedValue({
+      ...MOCK_SYNTHESIS_RESULT,
+      tier,
+    }),
+    listVoices: vi.fn().mockResolvedValue(voices),
+    isHealthy: vi.fn().mockResolvedValue(true),
+  };
+}
+
+// ==========================================
+// Test Guards
+// ==========================================
+
+/**
+ * Conditional auth guard for testing.
+ * Authenticates requests that carry `Authorization: Bearer test-token`.
+ * Rejects all others with UnauthorizedException.
+ */
+class TestAuthGuard implements CanActivate {
+  canActivate(context: ExecutionContext): boolean {
+    const req = context.switchToHttp().getRequest<{
+      headers: Record<string, string | undefined>;
+      user?: typeof MOCK_USER;
+      cookies?: Record<string, string>;
+    }>();
+    const authHeader = req.headers.authorization;
+    const cookieToken = req.cookies?.["better-auth.session_token"];
+
+    if (authHeader === "Bearer test-token" || cookieToken === "test-token") {
+      req.user = { ...MOCK_USER };
+      return true;
+    }
+
+    throw new UnauthorizedException("No authentication token provided");
+  }
+}
+
+/**
+ * Test workspace guard that attaches a mock workspace to the request.
+ */
+class TestWorkspaceGuard implements CanActivate {
+  canActivate(context: ExecutionContext): boolean {
+    const req = context.switchToHttp().getRequest<{
+      workspace?: { id: string };
+      headers: Record<string, string | undefined>;
+    }>();
+    const workspaceId = req.headers["x-workspace-id"] ?? MOCK_WORKSPACE_ID;
+    req.workspace = { id: workspaceId as string };
+    return true;
+  }
+}
+
+/**
+ * Test permission guard that always allows access.
+ */
+class TestPermissionGuard implements CanActivate {
+  canActivate(): boolean {
+    return true;
+  }
+}
+
+// ==========================================
+// Tests
+// ==========================================
+
+describe("Speech Services E2E Integration", () => {
+  let app: INestApplication;
+  let mockSTTProvider: ISTTProvider;
+  let defaultTTSProvider: ITTSProvider;
+  let premiumTTSProvider: ITTSProvider;
+  let fallbackTTSProvider: ITTSProvider;
+  let ttsProvidersMap: Map<SpeechTier, ITTSProvider>;
+
+  // WebSocket gateway test dependencies
+  let speechGateway: SpeechGateway;
+  let mockSpeechService: SpeechService;
+
+  beforeAll(async () => {
+    // Create mock providers
+    mockSTTProvider = createMockSTTProvider();
+    defaultTTSProvider = createMockTTSProvider("default", "mock-kokoro");
+    premiumTTSProvider = createMockTTSProvider("premium", "mock-chatterbox");
+    fallbackTTSProvider = createMockTTSProvider("fallback", "mock-piper");
+
+    ttsProvidersMap = new Map([
+      ["default", defaultTTSProvider],
+      ["premium", premiumTTSProvider],
+      ["fallback", fallbackTTSProvider],
+    ]);
+
+    const moduleRef = await Test.createTestingModule({
+      controllers: [SpeechController],
+      providers: [
+        SpeechService,
+        {
+          provide: speechConfig.KEY,
+          useValue: MOCK_SPEECH_CONFIG,
+        },
+        {
+          provide: STT_PROVIDER,
+          useValue: mockSTTProvider,
+        },
+        {
+          provide: TTS_PROVIDERS,
+          useValue: ttsProvidersMap,
+        },
+        // Gateway dependencies (not tested via HTTP but needed for DI)
+        {
+          provide: SpeechGateway,
+          useFactory: (
+            authService: AuthService,
+            prisma: PrismaService,
+            speechService: SpeechService,
+            config: SpeechConfig
+          ): SpeechGateway => {
+            return new SpeechGateway(authService, prisma, speechService, config);
+          },
+          inject: [AuthService, PrismaService, SpeechService, speechConfig.KEY],
+        },
+        {
+          provide: AuthService,
+          useValue: {
+            verifySession: vi.fn().mockResolvedValue({
+              user: { id: MOCK_USER_ID, email: "test@example.com", name: "Test User" },
+              session: { id: "test-session" },
+            }),
+          },
+        },
+ { + provide: PrismaService, + useValue: { + workspaceMember: { + findFirst: vi.fn().mockResolvedValue({ + userId: MOCK_USER_ID, + workspaceId: MOCK_WORKSPACE_ID, + role: "MEMBER", + }), + }, + }, + }, + ], + }) + .overrideGuard(AuthGuard) + .useClass(TestAuthGuard) + .overrideGuard(WorkspaceGuard) + .useClass(TestWorkspaceGuard) + .overrideGuard(PermissionGuard) + .useClass(TestPermissionGuard) + .compile(); + + app = moduleRef.createNestApplication(); + app.useGlobalPipes(new ValidationPipe({ transform: true, whitelist: true })); + await app.init(); + + // Capture references for WebSocket tests + speechGateway = moduleRef.get(SpeechGateway); + mockSpeechService = moduleRef.get(SpeechService); + }); + + beforeEach(() => { + vi.clearAllMocks(); + + // Reset default mock behaviors + (mockSTTProvider.transcribe as ReturnType).mockResolvedValue( + MOCK_TRANSCRIPTION_RESULT + ); + (defaultTTSProvider.synthesize as ReturnType).mockResolvedValue({ + ...MOCK_SYNTHESIS_RESULT, + tier: "default", + }); + (premiumTTSProvider.synthesize as ReturnType).mockResolvedValue({ + ...MOCK_SYNTHESIS_RESULT, + tier: "premium", + }); + (fallbackTTSProvider.synthesize as ReturnType).mockResolvedValue({ + ...MOCK_SYNTHESIS_RESULT, + tier: "fallback", + }); + (defaultTTSProvider.listVoices as ReturnType).mockResolvedValue( + MOCK_VOICES.filter((v) => v.tier === "default") + ); + (premiumTTSProvider.listVoices as ReturnType).mockResolvedValue( + MOCK_VOICES.filter((v) => v.tier === "premium") + ); + (fallbackTTSProvider.listVoices as ReturnType).mockResolvedValue([]); + }); + + afterAll(async () => { + if (app) { + await app.close(); + } + }); + + // ========================================== + // Scenario 1: REST Transcription + // ========================================== + describe("Scenario 1: REST Transcription (POST /speech/transcribe)", () => { + it("should transcribe an uploaded audio file and return the transcription result", async () => { + const response = await 
request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", TEST_AUDIO_BUFFER, { + filename: "test.wav", + contentType: "audio/wav", + }) + .expect(201); + + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toMatchObject({ + text: MOCK_TRANSCRIPTION_RESULT.text, + language: MOCK_TRANSCRIPTION_RESULT.language, + durationSeconds: MOCK_TRANSCRIPTION_RESULT.durationSeconds, + confidence: MOCK_TRANSCRIPTION_RESULT.confidence, + }); + expect(response.body.data.segments).toBeDefined(); + expect(response.body.data.segments).toHaveLength(1); + + expect(mockSTTProvider.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({ mimeType: "audio/wav" }) + ); + }); + + it("should pass optional transcription parameters to the service", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", TEST_AUDIO_BUFFER, { + filename: "test.mp3", + contentType: "audio/mpeg", + }) + .field("language", "fr") + .field("model", "whisper-large-v3") + .field("prompt", "Meeting transcript") + .field("temperature", "0.3") + .expect(201); + + expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text); + + expect(mockSTTProvider.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({ + mimeType: "audio/mpeg", + language: "fr", + model: "whisper-large-v3", + prompt: "Meeting transcript", + temperature: 0.3, + }) + ); + }); + + it("should reject request without an audio file", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .expect(400); + + expect(response.body).toHaveProperty("message"); + }); + }); + + // ========================================== + // Scenario 2: REST Synthesis + // ========================================== + 
describe("Scenario 2: REST Synthesis (POST /speech/synthesize)", () => { + it("should synthesize text and return audio binary response", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({ text: "Hello, world!" }) + .expect(201); + + // Response should be binary audio + expect(response.headers["content-type"]).toContain("audio/mpeg"); + expect(response.headers["content-disposition"]).toContain("attachment"); + expect(response.headers["content-disposition"]).toContain("speech.mp3"); + expect(response.body).toBeDefined(); + expect(Buffer.isBuffer(response.body) || response.body instanceof Buffer).toBe(true); + }); + + it("should pass voice, speed, format, and tier options to the service", async () => { + (defaultTTSProvider.synthesize as ReturnType).mockResolvedValue({ + audio: Buffer.from("wav-audio-data"), + format: "wav", + voice: "af_sky", + tier: "default", + durationSeconds: 1.5, + }); + + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({ + text: "Test with options", + voice: "af_sky", + speed: 1.5, + format: "wav", + }) + .expect(201); + + expect(response.headers["content-type"]).toContain("audio/wav"); + expect(response.headers["content-disposition"]).toContain("speech.wav"); + }); + + it("should accept empty text (validation delegated to service)", async () => { + // The SynthesizeDto allows empty strings (no @IsNotEmpty decorator). + // The service/provider handles empty text semantics. 
+ const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({ text: "" }) + .expect(201); + + expect(response.headers["content-type"]).toContain("audio/mpeg"); + }); + + it("should reject missing text field", async () => { + await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({}) + .expect(400); + }); + }); + + // ========================================== + // Scenario 3: Provider Fallback + // ========================================== + describe("Scenario 3: Provider Fallback", () => { + it("should fall back from premium to default when premium fails", async () => { + // Make premium provider fail + (premiumTTSProvider.synthesize as ReturnType).mockRejectedValue( + new Error("Premium provider unavailable") + ); + + // Default provider should succeed + (defaultTTSProvider.synthesize as ReturnType).mockResolvedValue({ + audio: Buffer.from("fallback-audio"), + format: "mp3", + voice: "af_heart", + tier: "default", + }); + + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({ text: "Fallback test", tier: "premium" }) + .expect(201); + + // Premium was attempted first + expect(premiumTTSProvider.synthesize).toHaveBeenCalled(); + // Then default succeeded + expect(defaultTTSProvider.synthesize).toHaveBeenCalled(); + expect(response.headers["content-type"]).toContain("audio/mpeg"); + }); + + it("should fall back through entire chain: premium -> default -> fallback", async () => { + // Make premium and default fail + (premiumTTSProvider.synthesize as ReturnType).mockRejectedValue( + new Error("Premium down") + ); + (defaultTTSProvider.synthesize as ReturnType).mockRejectedValue( + new Error("Default down") + ); + + // Fallback should succeed + (fallbackTTSProvider.synthesize as ReturnType).mockResolvedValue({ + audio: 
Buffer.from("fallback-piper-audio"), + format: "mp3", + voice: "piper-default", + tier: "fallback", + }); + + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({ text: "Full fallback chain test", tier: "premium" }) + .expect(201); + + expect(premiumTTSProvider.synthesize).toHaveBeenCalled(); + expect(defaultTTSProvider.synthesize).toHaveBeenCalled(); + expect(fallbackTTSProvider.synthesize).toHaveBeenCalled(); + expect(response.headers["content-type"]).toContain("audio/mpeg"); + }); + + it("should return 503 when all TTS providers fail", async () => { + (premiumTTSProvider.synthesize as ReturnType).mockRejectedValue( + new Error("Premium down") + ); + (defaultTTSProvider.synthesize as ReturnType).mockRejectedValue( + new Error("Default down") + ); + (fallbackTTSProvider.synthesize as ReturnType).mockRejectedValue( + new Error("Fallback down") + ); + + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .set("Authorization", "Bearer test-token") + .send({ text: "All providers down", tier: "premium" }) + .expect(503); + + expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("All TTS providers failed"); + }); + }); + + // ========================================== + // Scenario 4: WebSocket Streaming Transcription + // ========================================== + describe("Scenario 4: WebSocket Streaming Transcription", () => { + interface MockSocket { + id: string; + join: ReturnType; + leave: ReturnType; + emit: ReturnType; + disconnect: ReturnType; + data: { userId?: string; workspaceId?: string }; + handshake: { + auth: Record; + query: Record; + headers: Record; + }; + } + + function createTestSocket(overrides?: Partial): MockSocket { + return { + id: "e2e-test-socket", + join: vi.fn(), + leave: vi.fn(), + emit: vi.fn(), + disconnect: vi.fn(), + data: {}, + handshake: { + auth: { token: 
"valid-token" }, + query: {}, + headers: {}, + }, + ...overrides, + }; + } + + it("should complete the full streaming transcription lifecycle", async () => { + const client = createTestSocket(); + // Authenticate the client + await speechGateway.handleConnection(client as never); + + expect(client.data.userId).toBe(MOCK_USER_ID); + expect(client.data.workspaceId).toBe(MOCK_WORKSPACE_ID); + expect(client.disconnect).not.toHaveBeenCalled(); + + // Start transcription session + speechGateway.handleStartTranscription(client as never, { language: "en" }); + + expect(client.emit).toHaveBeenCalledWith( + "transcription-started", + expect.objectContaining({ sessionId: "e2e-test-socket" }) + ); + + // Send audio chunks + const chunk1 = Buffer.from("audio-data-chunk-1"); + const chunk2 = Buffer.from("audio-data-chunk-2"); + const chunk3 = Buffer.from("audio-data-chunk-3"); + + speechGateway.handleAudioChunk(client as never, chunk1); + speechGateway.handleAudioChunk(client as never, chunk2); + speechGateway.handleAudioChunk(client as never, chunk3); + + // No errors should have been emitted for chunks + const errorCalls = client.emit.mock.calls.filter( + (call: unknown[]) => call[0] === "transcription-error" + ); + expect(errorCalls).toHaveLength(0); + + vi.clearAllMocks(); + (mockSTTProvider.transcribe as ReturnType).mockResolvedValue( + MOCK_TRANSCRIPTION_RESULT + ); + + // Stop transcription - should trigger the full transcription pipeline + await speechGateway.handleStopTranscription(client as never); + + // Verify transcription was called with concatenated audio + expect(mockSTTProvider.transcribe).toHaveBeenCalledWith( + expect.any(Buffer), + expect.objectContaining({ language: "en" }) + ); + + // Verify the final result was emitted + expect(client.emit).toHaveBeenCalledWith( + "transcription-final", + expect.objectContaining({ + text: MOCK_TRANSCRIPTION_RESULT.text, + language: "en", + durationSeconds: 3.2, + confidence: 0.97, + }) + ); + }); + + it("should clean up 
session on disconnect", async () => { + const client = createTestSocket({ id: "disconnect-test" }); + await speechGateway.handleConnection(client as never); + + speechGateway.handleStartTranscription(client as never, {}); + speechGateway.handleAudioChunk(client as never, Buffer.from("data")); + + // Disconnect + speechGateway.handleDisconnect(client as never); + + // Trying to send more chunks should fail (session cleaned up) + vi.clearAllMocks(); + speechGateway.handleAudioChunk(client as never, Buffer.from("more-data")); + + expect(client.emit).toHaveBeenCalledWith( + "transcription-error", + expect.objectContaining({ + message: expect.stringContaining("No active transcription session"), + }) + ); + }); + + it("should reject unauthenticated WebSocket clients", async () => { + const client = createTestSocket({ + id: "unauth-ws-client", + handshake: { auth: {}, query: {}, headers: {} }, + }); + + await speechGateway.handleConnection(client as never); + + expect(client.disconnect).toHaveBeenCalled(); + expect(client.data.userId).toBeUndefined(); + }); + }); + + // ========================================== + // Scenario 5: Audio Validation (Invalid MIME Type) + // ========================================== + describe("Scenario 5: Audio Validation", () => { + it("should reject files with unsupported MIME types", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", Buffer.from("not-audio"), { + filename: "document.pdf", + contentType: "application/pdf", + }) + .expect(400); + + expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("Unsupported audio format"); + expect(response.body.message).toContain("application/pdf"); + }); + + it("should reject files with text/plain MIME type", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer 
test-token") + .attach("file", Buffer.from("plain text content"), { + filename: "notes.txt", + contentType: "text/plain", + }) + .expect(400); + + expect(response.body.message).toContain("Unsupported audio format"); + }); + + it("should reject video MIME types", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", Buffer.from("video-data"), { + filename: "video.mp4", + contentType: "video/mp4", + }) + .expect(400); + + expect(response.body.message).toContain("Unsupported audio format"); + }); + + it("should accept valid audio MIME types", async () => { + const validMimeTypes = [ + { mime: "audio/wav", ext: "wav" }, + { mime: "audio/mpeg", ext: "mp3" }, + { mime: "audio/webm", ext: "webm" }, + { mime: "audio/ogg", ext: "ogg" }, + { mime: "audio/flac", ext: "flac" }, + ]; + + for (const { mime, ext } of validMimeTypes) { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", TEST_AUDIO_BUFFER, { + filename: `test.${ext}`, + contentType: mime, + }) + .expect(201); + + expect(response.body).toHaveProperty("data"); + expect(response.body.data.text).toBe(MOCK_TRANSCRIPTION_RESULT.text); + } + }); + }); + + // ========================================== + // Scenario 6: File Size Limits + // ========================================== + describe("Scenario 6: File Size Limits", () => { + it("should reject files exceeding the maximum upload size (25 MB)", async () => { + // Create a buffer slightly over the 25 MB limit + const oversizedBuffer = Buffer.alloc(25_000_001, 0); + + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", oversizedBuffer, { + filename: "large-audio.wav", + contentType: "audio/wav", + }) + .expect(400); + + 
expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("exceeds maximum allowed size"); + }); + + it("should accept files within the size limit", async () => { + // Create a buffer at the exact limit + const maxBuffer = Buffer.alloc(1024, 0); + + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .set("Authorization", "Bearer test-token") + .attach("file", maxBuffer, { + filename: "acceptable-audio.wav", + contentType: "audio/wav", + }) + .expect(201); + + expect(response.body).toHaveProperty("data"); + }); + }); + + // ========================================== + // Scenario 7: Authentication + // ========================================== + describe("Scenario 7: Authentication", () => { + it("should reject POST /speech/transcribe without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/transcribe") + .attach("file", TEST_AUDIO_BUFFER, { + filename: "test.wav", + contentType: "audio/wav", + }) + .expect(401); + + expect(response.body).toHaveProperty("message"); + expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject POST /speech/synthesize without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .post("/speech/synthesize") + .send({ text: "Hello" }) + .expect(401); + + expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject GET /speech/voices without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices") + .expect(401); + + expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject GET /speech/health without authentication", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/health") + .expect(401); + + 
expect(response.body.message).toContain("No authentication token provided"); + }); + + it("should reject requests with an invalid token", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices") + .set("Authorization", "Bearer invalid-token-xyz") + .expect(401); + + expect(response.body.message).toContain("No authentication token provided"); + }); + }); + + // ========================================== + // Scenario 8: Voice Listing + // ========================================== + describe("Scenario 8: Voice Listing (GET /speech/voices)", () => { + it("should return all voices when no tier filter is provided", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices") + .set("Authorization", "Bearer test-token") + .expect(200); + + expect(response.body).toHaveProperty("data"); + expect(Array.isArray(response.body.data)).toBe(true); + + // Should have voices from all providers that returned voices + const voices = response.body.data as VoiceInfo[]; + expect(voices.length).toBeGreaterThan(0); + + // Verify voice structure + for (const voice of voices) { + expect(voice).toHaveProperty("id"); + expect(voice).toHaveProperty("name"); + expect(voice).toHaveProperty("tier"); + } + }); + + it("should filter voices by tier when tier query param is provided", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices?tier=default") + .set("Authorization", "Bearer test-token") + .expect(200); + + const voices = response.body.data as VoiceInfo[]; + expect(voices.length).toBeGreaterThan(0); + + for (const voice of voices) { + expect(voice.tier).toBe("default"); + } + + expect(defaultTTSProvider.listVoices).toHaveBeenCalled(); + }); + + it("should return empty array for tier with no voices", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices?tier=fallback") + .set("Authorization", "Bearer test-token") + 
.expect(200); + + expect(response.body.data).toEqual([]); + }); + + it("should include voice metadata (id, name, language, tier, isDefault)", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/voices?tier=default") + .set("Authorization", "Bearer test-token") + .expect(200); + + const voices = response.body.data as VoiceInfo[]; + const defaultVoice = voices.find((v) => v.isDefault === true); + + expect(defaultVoice).toBeDefined(); + expect(defaultVoice).toMatchObject({ + id: "af_heart", + name: "Heart", + language: "en", + tier: "default", + isDefault: true, + }); + }); + }); + + // ========================================== + // Scenario 9: Health Check + // ========================================== + describe("Scenario 9: Health Check (GET /speech/health)", () => { + it("should return health status for both STT and TTS providers", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/health") + .set("Authorization", "Bearer test-token") + .expect(200); + + expect(response.body).toHaveProperty("data"); + expect(response.body.data).toHaveProperty("stt"); + expect(response.body.data).toHaveProperty("tts"); + + expect(response.body.data.stt).toHaveProperty("available"); + expect(response.body.data.tts).toHaveProperty("available"); + + // Both should be available since we have mock providers registered and config enabled + expect(response.body.data.stt.available).toBe(true); + expect(response.body.data.tts.available).toBe(true); + }); + + it("should return consistent health check format", async () => { + const response = await request(app.getHttpServer() as App) + .get("/speech/health") + .set("Authorization", "Bearer test-token") + .expect(200); + + // Verify the response matches the expected shape + expect(response.body).toEqual({ + data: { + stt: { available: expect.any(Boolean) }, + tts: { available: expect.any(Boolean) }, + }, + }); + }); + }); +}); From 
dcbc8d10533f20396e11c1928296cbf487fdc6d6 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:27:21 -0600 Subject: [PATCH 18/19] =?UTF-8?q?chore(orchestrator):=20finalize=20M13-Spe?= =?UTF-8?q?echServices=20tasks.md=20=E2=80=94=20all=2018/18=20done?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All tasks completed successfully across 7 phases: - Phase 1: Config + Module foundation (2/2) - Phase 2: STT + TTS providers (5/5) - Phase 3: Middleware + REST endpoints (3/3) - Phase 4: WebSocket streaming (1/1) - Phase 5: Docker/DevOps (2/2) - Phase 6: Frontend components (3/3) - Phase 7: E2E tests + Documentation (2/2) Total: ~500+ tests across API and web packages. Co-Authored-By: Claude Opus 4.6 --- docs/tasks.md | 64 +++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 431cfec..fdaa28c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -8,53 +8,53 @@ ## Phase 1: Foundation (Config + Module + Providers) -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-CFG-001 | not-started | #401: Speech services environment variables and ConfigModule integration | #401 | api | feature/m13-speech-services | | SP-MOD-001,SP-DOC-001 | | | | 15K | | | -| SP-MOD-001 | not-started | #389: Create SpeechModule with provider abstraction layer | #389 | api | feature/m13-speech-services | SP-CFG-001 | SP-STT-001,SP-TTS-001,SP-MID-001 | | | | 25K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| ---------- | ------ | ------------------------------------------------------------------------ | ----- | ---- | --------------------------- | ---------- | -------------------------------- | -------- | 
----------------- | ----------------- | -------- | ---- | ----------------- | +| SP-CFG-001 | done | #401: Speech services environment variables and ConfigModule integration | #401 | api | feature/m13-speech-services | | SP-MOD-001,SP-DOC-001 | worker-1 | 2026-02-15T06:00Z | 2026-02-15T06:07Z | 15K | 15K | 51 tests, 4cc43be | +| SP-MOD-001 | done | #389: Create SpeechModule with provider abstraction layer | #389 | api | feature/m13-speech-services | SP-CFG-001 | SP-STT-001,SP-TTS-001,SP-MID-001 | worker-2 | 2026-02-15T06:08Z | 2026-02-15T06:14Z | 25K | 25K | 27 tests, c40373f | ## Phase 2: Providers (STT + TTS) -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-STT-001 | not-started | #390: Implement STT provider with Speaches/faster-whisper integration | #390 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-WS-001 | | | | 20K | | | -| SP-TTS-001 | not-started | #391: Implement tiered TTS provider architecture | #391 | api | feature/m13-speech-services | SP-MOD-001 | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-EP-002 | | | | 20K | | | -| SP-TTS-002 | not-started | #393: Implement Kokoro-FastAPI TTS provider (default tier) | #393 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | | | | 15K | | | -| SP-TTS-003 | not-started | #394: Implement Chatterbox TTS provider (premium tier, voice cloning) | #394 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | | | | 15K | | | -| SP-TTS-004 | not-started | #395: Implement Piper TTS provider via OpenedAI Speech (fallback tier) | #395 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | | | | 12K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| ---------- | ------ | ---------------------------------------------------------------------- | 
----- | ---- | --------------------------- | ---------- | ------------------------------------------ | -------- | ----------------- | ----------------- | -------- | ---- | ----------------- | +| SP-STT-001 | done | #390: Implement STT provider with Speaches/faster-whisper integration | #390 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-WS-001 | worker-4 | 2026-02-15T06:15Z | 2026-02-15T06:25Z | 20K | 50K | 27 tests, 3ae9e53 | +| SP-TTS-001 | done | #391: Implement tiered TTS provider architecture | #391 | api | feature/m13-speech-services | SP-MOD-001 | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-EP-002 | worker-5 | 2026-02-15T06:15Z | 2026-02-15T06:25Z | 20K | 35K | 30 tests, b5edb4f | +| SP-TTS-002 | done | #393: Implement Kokoro-FastAPI TTS provider (default tier) | #393 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | worker-6 | 2026-02-15T06:26Z | 2026-02-15T06:33Z | 15K | 25K | 48 tests, 79b1d81 | +| SP-TTS-003 | done | #394: Implement Chatterbox TTS provider (premium tier, voice cloning) | #394 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | worker-7 | 2026-02-15T06:26Z | 2026-02-15T06:34Z | 15K | 25K | 26 tests, d37c78f | +| SP-TTS-004 | done | #395: Implement Piper TTS provider via OpenedAI Speech (fallback tier) | #395 | api | feature/m13-speech-services | SP-TTS-001 | SP-EP-002 | worker-8 | 2026-02-15T06:35Z | 2026-02-15T06:44Z | 12K | 15K | 37 tests, 6c46556 | ## Phase 3: Middleware + REST Endpoints -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-MID-001 | not-started | #398: Audio format validation and preprocessing middleware | #398 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-EP-002 | | | | 15K | | | -| SP-EP-001 | not-started | #392: Create /api/speech/transcribe REST endpoint | #392 | api | feature/m13-speech-services | 
SP-STT-001,SP-MID-001 | SP-WS-001,SP-FE-001 | | | | 20K | | | -| SP-EP-002 | not-started | #396: Create /api/speech/synthesize REST endpoint | #396 | api | feature/m13-speech-services | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-MID-001 | SP-FE-002 | | | | 20K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| ---------- | ------ | ---------------------------------------------------------- | ----- | ---- | --------------------------- | ------------------------------------------- | ------------------- | --------- | ----------------- | ----------------- | -------- | ---- | ----------------- | +| SP-MID-001 | done | #398: Audio format validation and preprocessing middleware | #398 | api | feature/m13-speech-services | SP-MOD-001 | SP-EP-001,SP-EP-002 | worker-9 | 2026-02-15T06:35Z | 2026-02-15T06:42Z | 15K | 25K | 36 tests, 7b4fda6 | +| SP-EP-001 | done | #392: Create /api/speech/transcribe REST endpoint | #392 | api | feature/m13-speech-services | SP-STT-001,SP-MID-001 | SP-WS-001,SP-FE-001 | worker-10 | 2026-02-15T06:45Z | 2026-02-15T06:52Z | 20K | 25K | 10 tests, 527262a | +| SP-EP-002 | done | #396: Create /api/speech/synthesize REST endpoint | #396 | api | feature/m13-speech-services | SP-TTS-002,SP-TTS-003,SP-TTS-004,SP-MID-001 | SP-FE-002 | worker-11 | 2026-02-15T06:45Z | 2026-02-15T06:53Z | 20K | 35K | 17 tests, 527262a | ## Phase 4: WebSocket Streaming -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-WS-001 | not-started | #397: Implement WebSocket streaming transcription endpoint | #397 | api | feature/m13-speech-services | SP-STT-001,SP-EP-001 | SP-FE-001 | | | | 20K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| 
--------- | ------ | ---------------------------------------------------------- | ----- | ---- | --------------------------- | -------------------- | --------- | --------- | ----------------- | ----------------- | -------- | ---- | ----------------- | +| SP-WS-001 | done | #397: Implement WebSocket streaming transcription endpoint | #397 | api | feature/m13-speech-services | SP-STT-001,SP-EP-001 | SP-FE-001 | worker-12 | 2026-02-15T06:54Z | 2026-02-15T07:00Z | 20K | 30K | 29 tests, 28c9e6f | ## Phase 5: Docker/DevOps -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-DOC-001 | not-started | #399: Docker Compose dev overlay for speech services | #399 | devops | feature/m13-speech-services | SP-CFG-001 | SP-DOC-002 | | | | 10K | | | -| SP-DOC-002 | not-started | #400: Docker Compose swarm/prod deployment for speech services | #400 | devops | feature/m13-speech-services | SP-DOC-001 | | | | | 10K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| ---------- | ------ | -------------------------------------------------------------- | ----- | ------ | --------------------------- | ---------- | ---------- | --------- | ----------------- | ----------------- | -------- | ---- | ------- | +| SP-DOC-001 | done | #399: Docker Compose dev overlay for speech services | #399 | devops | feature/m13-speech-services | SP-CFG-001 | SP-DOC-002 | worker-3 | 2026-02-15T06:08Z | 2026-02-15T06:10Z | 10K | 15K | 52553c8 | +| SP-DOC-002 | done | #400: Docker Compose swarm/prod deployment for speech services | #400 | devops | feature/m13-speech-services | SP-DOC-001 | | worker-13 | 2026-02-15T06:54Z | 2026-02-15T06:56Z | 10K | 8K | b3d6d73 | ## Phase 6: Frontend -| id | status | description | issue | repo | branch | depends_on | blocks | agent | 
started_at | completed_at | estimate | used | notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-FE-001 | not-started | #402: Frontend voice input component (microphone capture + transcription) | #402 | web | feature/m13-speech-services | SP-EP-001,SP-WS-001 | SP-FE-003 | | | | 25K | | | -| SP-FE-002 | not-started | #403: Frontend audio playback component for TTS output | #403 | web | feature/m13-speech-services | SP-EP-002 | SP-FE-003 | | | | 20K | | | -| SP-FE-003 | not-started | #404: Frontend speech settings page (provider selection, voice config) | #404 | web | feature/m13-speech-services | SP-FE-001,SP-FE-002 | SP-E2E-001 | | | | 20K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| --------- | ------ | ------------------------------------------------------------------------- | ----- | ---- | --------------------------- | ------------------- | ---------- | --------- | ----------------- | ----------------- | -------- | ---- | ----------------- | +| SP-FE-001 | done | #402: Frontend voice input component (microphone capture + transcription) | #402 | web | feature/m13-speech-services | SP-EP-001,SP-WS-001 | SP-FE-003 | worker-14 | 2026-02-15T07:01Z | 2026-02-15T07:12Z | 25K | 50K | 34 tests, 74d6c10 | +| SP-FE-002 | done | #403: Frontend audio playback component for TTS output | #403 | web | feature/m13-speech-services | SP-EP-002 | SP-FE-003 | worker-15 | 2026-02-15T07:01Z | 2026-02-15T07:11Z | 20K | 50K | 32 tests, 74d6c10 | +| SP-FE-003 | done | #404: Frontend speech settings page (provider selection, voice config) | #404 | web | feature/m13-speech-services | SP-FE-001,SP-FE-002 | SP-E2E-001 | worker-16 | 2026-02-15T07:13Z | 2026-02-15T07:22Z | 20K | 35K | 30 tests, bc86947 | ## Phase 7: Testing + Documentation -| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | 
notes | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| SP-E2E-001 | not-started | #405: E2E integration tests for speech services | #405 | api | feature/m13-speech-services | SP-EP-001,SP-EP-002,SP-WS-001,SP-FE-003 | SP-DOCS-001 | | | | 25K | | | -| SP-DOCS-001 | not-started | #406: Documentation - Speech services architecture, API, and deployment | #406 | docs | feature/m13-speech-services | SP-E2E-001 | | | | | 15K | | | +| id | status | description | issue | repo | branch | depends_on | blocks | agent | started_at | completed_at | estimate | used | notes | +| ----------- | ------ | ----------------------------------------------------------------------- | ----- | ---- | --------------------------- | --------------------------------------- | ----------- | --------- | ----------------- | ----------------- | -------- | ---- | ----------------- | +| SP-E2E-001 | done | #405: E2E integration tests for speech services | #405 | api | feature/m13-speech-services | SP-EP-001,SP-EP-002,SP-WS-001,SP-FE-003 | SP-DOCS-001 | worker-17 | 2026-02-15T07:23Z | 2026-02-15T07:32Z | 25K | 35K | 30 tests, d2c7602 | +| SP-DOCS-001 | done | #406: Documentation - Speech services architecture, API, and deployment | #406 | docs | feature/m13-speech-services | SP-E2E-001 | | worker-18 | 2026-02-15T07:23Z | 2026-02-15T07:29Z | 15K | 35K | 24065aa | From af9c5799af36abbc7f54d366c98d2530eaf4fb75 Mon Sep 17 00:00:00 2001 From: Jason Woltje Date: Sun, 15 Feb 2026 03:44:33 -0600 Subject: [PATCH 19/19] =?UTF-8?q?fix(#388):=20address=20PR=20review=20find?= =?UTF-8?q?ings=20=E2=80=94=20fix=20WebSocket/REST=20bugs,=20improve=20err?= =?UTF-8?q?or=20handling,=20fix=20types=20and=20comments?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fixes: - Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor - Add /speech namespace to WebSocket connection URL - Pass auth token in WebSocket handshake options - Wrap 
audio.play() in try-catch for NotAllowedError and DOMException handling - Replace bare catch block with named error parameter and descriptive message - Add connect_error and disconnect event handlers to WebSocket - Update JSDoc to accurately describe batch transcription (not real-time partial) Important fixes: - Emit transcription-error before disconnect in gateway auth failures - Capture MediaRecorder error details and clean up media tracks on error - Change TtsDefaultConfig.format type from string to AudioFormat - Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth - Fix voice count from 54 to 53 in provider, AGENTS.md, and docs - Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe) Co-Authored-By: Claude Opus 4.6 --- apps/api/src/speech/AGENTS.md | 2 +- apps/api/src/speech/dto/synthesize.dto.ts | 30 +++-------- apps/api/src/speech/interfaces/index.ts | 1 + .../api/src/speech/interfaces/speech-types.ts | 10 ++-- .../interfaces/stt-provider.interface.ts | 2 +- .../speech/providers/kokoro-tts.provider.ts | 6 +-- .../speech/providers/piper-tts.provider.ts | 2 +- .../speech/providers/tts-provider.factory.ts | 4 +- apps/api/src/speech/speech.config.ts | 5 +- apps/api/src/speech/speech.gateway.ts | 15 ++++++ .../components/speech/TextToSpeechButton.tsx | 2 +- apps/web/src/hooks/useTextToSpeech.ts | 13 ++++- apps/web/src/hooks/useVoiceInput.ts | 50 ++++++++++++++----- docs/SPEECH.md | 2 +- 14 files changed, 91 insertions(+), 53 deletions(-) diff --git a/apps/api/src/speech/AGENTS.md b/apps/api/src/speech/AGENTS.md index 04b6d97..c3553b6 100644 --- a/apps/api/src/speech/AGENTS.md +++ b/apps/api/src/speech/AGENTS.md @@ -34,7 +34,7 @@ speech/ └── providers/ ├── base-tts.provider.ts # Abstract base class (OpenAI SDK + common logic) ├── base-tts.provider.spec.ts - ├── kokoro-tts.provider.ts # Default tier (CPU, 54 voices, 8 languages) + ├── kokoro-tts.provider.ts # Default tier (CPU, 53 voices, 8 languages) ├── 
kokoro-tts.provider.spec.ts ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control) ├── chatterbox-tts.provider.spec.ts diff --git a/apps/api/src/speech/dto/synthesize.dto.ts b/apps/api/src/speech/dto/synthesize.dto.ts index 171dc0e..4b2c1e7 100644 --- a/apps/api/src/speech/dto/synthesize.dto.ts +++ b/apps/api/src/speech/dto/synthesize.dto.ts @@ -2,7 +2,7 @@ * SynthesizeDto * * DTO for text-to-speech synthesis requests. - * The text field is validated by TextValidationPipe for length/emptiness. + * Text and option fields are validated by class-validator decorators. * Additional options control voice, speed, format, and tier selection. * * Issue #398 @@ -10,29 +10,13 @@ import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator"; import { Type } from "class-transformer"; +import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types"; import type { AudioFormat, SpeechTier } from "../interfaces/speech-types"; -/** - * Valid audio output formats for TTS synthesis. - */ -const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [ - "mp3", - "wav", - "opus", - "flac", - "aac", - "pcm", -] as const; - -/** - * Valid TTS tiers for provider selection. - */ -const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const; - export class SynthesizeDto { /** * Text to convert to speech. - * Validated separately by TextValidationPipe for length and emptiness. + * Validated by class-validator decorators for type and maximum length. 
*/ @IsString({ message: "text must be a string" }) @MaxLength(4096, { message: "text must not exceed 4096 characters" }) @@ -66,8 +50,8 @@ export class SynthesizeDto { */ @IsOptional() @IsString({ message: "format must be a string" }) - @IsIn(VALID_AUDIO_FORMATS, { - message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`, + @IsIn(AUDIO_FORMATS, { + message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`, }) format?: AudioFormat; @@ -78,8 +62,8 @@ export class SynthesizeDto { */ @IsOptional() @IsString({ message: "tier must be a string" }) - @IsIn(VALID_SPEECH_TIERS, { - message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`, + @IsIn(SPEECH_TIERS, { + message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`, }) tier?: SpeechTier; } diff --git a/apps/api/src/speech/interfaces/index.ts b/apps/api/src/speech/interfaces/index.ts index ded8bd2..5674169 100644 --- a/apps/api/src/speech/interfaces/index.ts +++ b/apps/api/src/speech/interfaces/index.ts @@ -6,6 +6,7 @@ export type { ISTTProvider } from "./stt-provider.interface"; export type { ITTSProvider } from "./tts-provider.interface"; +export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types"; export type { SpeechTier, AudioFormat, diff --git a/apps/api/src/speech/interfaces/speech-types.ts b/apps/api/src/speech/interfaces/speech-types.ts index c3b93c1..a472eae 100644 --- a/apps/api/src/speech/interfaces/speech-types.ts +++ b/apps/api/src/speech/interfaces/speech-types.ts @@ -12,19 +12,21 @@ // ========================================== /** - * TTS provider tier. + * Canonical array of TTS provider tiers. * Determines which TTS engine is used for synthesis. 
* * - default: Primary TTS engine (e.g., Kokoro) * - premium: Higher quality TTS engine (e.g., Chatterbox) * - fallback: Backup TTS engine (e.g., Piper/OpenedAI) */ -export type SpeechTier = "default" | "premium" | "fallback"; +export const SPEECH_TIERS = ["default", "premium", "fallback"] as const; +export type SpeechTier = (typeof SPEECH_TIERS)[number]; /** - * Audio output format for TTS synthesis. + * Canonical array of audio output formats for TTS synthesis. */ -export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm"; +export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const; +export type AudioFormat = (typeof AUDIO_FORMATS)[number]; // ========================================== // STT Types diff --git a/apps/api/src/speech/interfaces/stt-provider.interface.ts b/apps/api/src/speech/interfaces/stt-provider.interface.ts index 871fdd1..8f36ce2 100644 --- a/apps/api/src/speech/interfaces/stt-provider.interface.ts +++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts @@ -16,7 +16,7 @@ import type { TranscribeOptions, TranscriptionResult } from "./speech-types"; * * @example * ```typescript - * class SpeachesProvider implements ISTTProvider { + * class SpeachesSttProvider implements ISTTProvider { * readonly name = "speaches"; * * async transcribe(audio: Buffer, options?: TranscribeOptions): Promise { diff --git a/apps/api/src/speech/providers/kokoro-tts.provider.ts b/apps/api/src/speech/providers/kokoro-tts.provider.ts index ac1b7d3..a7a0800 100644 --- a/apps/api/src/speech/providers/kokoro-tts.provider.ts +++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts @@ -5,7 +5,7 @@ * CPU-based, always available, Apache 2.0 license. 
* * Features: - * - 54 built-in voices across 8 languages + * - 53 built-in voices across 8 languages * - Speed control: 0.25x to 4.0x * - Output formats: mp3, wav, opus, flac * - Voice metadata derived from ID prefix (language, gender, accent) @@ -222,7 +222,7 @@ export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata { /** * Kokoro-FastAPI TTS provider (default tier). * - * CPU-based text-to-speech engine with 54 built-in voices across 8 languages. + * CPU-based text-to-speech engine with 53 built-in voices across 8 languages. * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI. * * @example @@ -254,7 +254,7 @@ export class KokoroTtsProvider extends BaseTTSProvider { /** * List all available Kokoro voices with metadata. * - * Returns the full catalog of 54 built-in voices with language, gender, + * Returns the full catalog of 53 built-in voices with language, gender, * and accent information derived from voice ID prefixes. * * @returns Array of VoiceInfo objects for all Kokoro voices diff --git a/apps/api/src/speech/providers/piper-tts.provider.ts b/apps/api/src/speech/providers/piper-tts.provider.ts index 40e4638..c86ffc4 100644 --- a/apps/api/src/speech/providers/piper-tts.provider.ts +++ b/apps/api/src/speech/providers/piper-tts.provider.ts @@ -9,7 +9,7 @@ * - OpenAI-compatible API via OpenedAI Speech server * - 100+ Piper voices across 40+ languages * - 6 standard OpenAI voice names mapped to Piper voices - * - Output formats: mp3, wav, opus, flac, aac, pcm + * - Output formats: mp3, wav, opus, flac * - CPU-only, no GPU required * - GPL license (via OpenedAI Speech) * diff --git a/apps/api/src/speech/providers/tts-provider.factory.ts b/apps/api/src/speech/providers/tts-provider.factory.ts index 5a1f69f..21d7b32 100644 --- a/apps/api/src/speech/providers/tts-provider.factory.ts +++ b/apps/api/src/speech/providers/tts-provider.factory.ts @@ -18,7 +18,7 @@ import { ChatterboxTTSProvider } from "./chatterbox-tts.provider"; import { 
KokoroTtsProvider } from "./kokoro-tts.provider"; import { PiperTtsProvider } from "./piper-tts.provider"; import type { ITTSProvider } from "../interfaces/tts-provider.interface"; -import type { SpeechTier, AudioFormat } from "../interfaces/speech-types"; +import type { SpeechTier } from "../interfaces/speech-types"; import type { SpeechConfig } from "../speech.config"; // ========================================== @@ -44,7 +44,7 @@ export function createTTSProviders(config: SpeechConfig): Map { if (!authenticatedClient.data.userId) { this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`); + authenticatedClient.emit("transcription-error", { + message: "Authentication timed out.", + }); authenticatedClient.disconnect(); } }, this.CONNECTION_TIMEOUT_MS); @@ -109,6 +112,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { if (!token) { this.logger.warn(`Client ${authenticatedClient.id} connected without token`); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: no token provided.", + }); authenticatedClient.disconnect(); clearTimeout(timeoutId); return; @@ -118,6 +124,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { if (!sessionData) { this.logger.warn(`Client ${authenticatedClient.id} has invalid token`); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: invalid or expired token.", + }); authenticatedClient.disconnect(); clearTimeout(timeoutId); return; @@ -133,6 +142,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect { if (!workspaceMembership) { this.logger.warn(`User ${userId} has no workspace access`); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: no workspace access.", + }); authenticatedClient.disconnect(); clearTimeout(timeoutId); return; @@ -151,6 +163,9 @@ export class SpeechGateway implements OnGatewayConnection, 
OnGatewayDisconnect { `Authentication failed for speech client ${authenticatedClient.id}:`, error instanceof Error ? error.message : "Unknown error" ); + authenticatedClient.emit("transcription-error", { + message: "Authentication failed: an unexpected error occurred.", + }); authenticatedClient.disconnect(); } } diff --git a/apps/web/src/components/speech/TextToSpeechButton.tsx b/apps/web/src/components/speech/TextToSpeechButton.tsx index a8f97f7..e208296 100644 --- a/apps/web/src/components/speech/TextToSpeechButton.tsx +++ b/apps/web/src/components/speech/TextToSpeechButton.tsx @@ -19,7 +19,7 @@ export interface TextToSpeechButtonProps { text: string; /** Optional voice ID to use */ voice?: string; - /** Optional tier (e.g. "standard", "premium") */ + /** Optional tier (e.g. "default", "premium", "fallback") */ tier?: string; /** Optional className for the container */ className?: string; diff --git a/apps/web/src/hooks/useTextToSpeech.ts b/apps/web/src/hooks/useTextToSpeech.ts index cc04cc4..c1152fa 100644 --- a/apps/web/src/hooks/useTextToSpeech.ts +++ b/apps/web/src/hooks/useTextToSpeech.ts @@ -173,8 +173,17 @@ export function useTextToSpeech(): UseTextToSpeechReturn { const play = useCallback(async (): Promise => { const audio = audioRef.current; if (audio) { - await audio.play(); - setIsPlaying(true); + try { + await audio.play(); + setIsPlaying(true); + } catch (err) { + const message = + err instanceof DOMException && err.name === "NotAllowedError" + ? "Playback was blocked by the browser. Try interacting with the page first." + : "Unable to play audio. The format may not be supported."; + setError(message); + setIsPlaying(false); + } } }, []); diff --git a/apps/web/src/hooks/useVoiceInput.ts b/apps/web/src/hooks/useVoiceInput.ts index 24e792d..46506a5 100644 --- a/apps/web/src/hooks/useVoiceInput.ts +++ b/apps/web/src/hooks/useVoiceInput.ts @@ -1,8 +1,8 @@ /** * useVoiceInput hook * - * Custom hook for microphone capture and real-time transcription. 
- * Supports WebSocket streaming for real-time partial transcriptions + * Custom hook for microphone capture and speech-to-text transcription. + * Supports WebSocket streaming with batch transcription on stop, * with REST upload fallback when WebSocket is unavailable. */ @@ -20,6 +20,8 @@ export interface UseVoiceInputOptions { useWebSocket?: boolean; /** Audio sample rate in Hz (default: 16000) */ sampleRate?: number; + /** Authentication token for WebSocket connection */ + token?: string; } /** Return type for the useVoiceInput hook */ @@ -75,14 +77,14 @@ function getAudioMimeType(): string { } /** - * Hook for microphone capture and real-time speech-to-text transcription. + * Hook for microphone capture and speech-to-text transcription. * - * Uses WebSocket streaming by default for real-time partial transcriptions. + * Uses WebSocket streaming by default with batch transcription on stop. * Falls back to REST upload (POST /api/speech/transcribe) if WebSocket * is disabled or unavailable. */ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInputReturn { - const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000 } = options; + const { onTranscript, useWebSocket: useWs = true, sampleRate = 16000, token } = options; const [isRecording, setIsRecording] = useState(false); const [transcript, setTranscript] = useState(""); @@ -143,9 +145,12 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput }; animationFrameRef.current = requestAnimationFrame(updateLevel); - } catch { + } catch (err) { // Audio analysis is non-critical; continue without it - console.warn("Audio analysis not available"); + console.warn( + "Audio level visualization unavailable:", + err instanceof Error ? 
err.message : String(err) + ); } }, []); @@ -169,11 +174,14 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput * Connect to the speech WebSocket namespace */ const connectSocket = useCallback((): Socket => { - const socket = io(API_BASE_URL, { + const socket = io(`${API_BASE_URL}/speech`, { path: "/socket.io", transports: ["websocket", "polling"], + ...(token ? { auth: { token } } : {}), }); + // Future use: the gateway does not currently emit transcription-partial, + // but the listener is registered for when real-time partial transcription is added. socket.on("transcription-partial", (data: TranscriptionPartialPayload) => { setPartialTranscript(data.text); }); @@ -188,9 +196,19 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput setError(data.message); }); + socket.on("connect_error", (err: Error) => { + setError(`WebSocket connection failed: ${err.message}`); + }); + + socket.on("disconnect", (reason: string) => { + if (reason !== "io client disconnect") { + setError(`WebSocket disconnected unexpectedly: ${reason}`); + } + }); + socketRef.current = socket; return socket; - }, []); + }, [token]); /** * Disconnect the WebSocket @@ -200,6 +218,8 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput socketRef.current.off("transcription-partial"); socketRef.current.off("transcription-final"); socketRef.current.off("transcription-error"); + socketRef.current.off("connect_error"); + socketRef.current.off("disconnect"); socketRef.current.disconnect(); socketRef.current = null; } @@ -211,7 +231,7 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput const sendAudioViaRest = useCallback(async (audioBlob: Blob): Promise => { try { const formData = new FormData(); - formData.append("audio", audioBlob, "recording.webm"); + formData.append("file", audioBlob, "recording.webm"); const response = await apiPostFormData( "/api/speech/transcribe", @@ -315,10 
+335,16 @@ export function useVoiceInput(options: UseVoiceInputOptions = {}): UseVoiceInput }); // Handle errors - mediaRecorder.addEventListener("error", () => { - setError("Recording encountered an issue. Please try again."); + mediaRecorder.addEventListener("error", (event: Event) => { + let errorMessage = "Recording encountered an issue. Please try again."; + if ("error" in event && event.error instanceof DOMException) { + errorMessage = `Recording error: ${event.error.name} - ${event.error.message}`; + } + setError(errorMessage); setIsRecording(false); isRecordingRef.current = false; + stopMediaTracks(); + cleanupAudioAnalysis(); }); // Start recording with timeslice for streaming chunks (250ms intervals) diff --git a/docs/SPEECH.md b/docs/SPEECH.md index 3ea7dd4..2f2b078 100644 --- a/docs/SPEECH.md +++ b/docs/SPEECH.md @@ -494,7 +494,7 @@ Boolean parsing: `value === "true"` or `value === "1"`. Unset or empty values de **Capabilities:** -- 54 built-in voices across 8 languages +- 53 built-in voices across 8 languages - Speed control: 0.25x to 4.0x - Output formats: mp3, wav, opus, flac - Voice metadata derived from ID prefix (language, gender, accent)