fix(#388): address PR review findings — fix WebSocket/REST bugs, improve error handling, fix types and comments

Critical fixes:
- Fix FormData field name mismatch (audio -> file) to match backend FileInterceptor
- Add /speech namespace to WebSocket connection URL
- Pass auth token in WebSocket handshake options
- Wrap audio.play() in try-catch for NotAllowedError and DOMException handling
- Replace bare catch block with named error parameter and descriptive message
- Add connect_error and disconnect event handlers to WebSocket
- Update JSDoc to accurately describe batch transcription (not real-time partial)

Important fixes:
- Emit transcription-error before disconnect in gateway auth failures
- Capture MediaRecorder error details and clean up media tracks on error
- Change TtsDefaultConfig.format type from string to AudioFormat
- Define canonical SPEECH_TIERS and AUDIO_FORMATS arrays as single source of truth
- Fix voice count from 54 to 53 in provider, AGENTS.md, and docs
- Fix inaccurate comments (Piper formats, tier prop, SpeachesProvider, TextValidationPipe)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 03:44:33 -06:00
parent dcbc8d1053
commit af9c5799af
14 changed files with 91 additions and 53 deletions
--- a/apps/api/src/speech/AGENTS.md
+++ b/apps/api/src/speech/AGENTS.md
@@ -34,7 +34,7 @@ speech/
 └── providers/
    ├── base-tts.provider.ts       # Abstract base class (OpenAI SDK + common logic)
    ├── base-tts.provider.spec.ts
-    ├── kokoro-tts.provider.ts     # Default tier (CPU, 54 voices, 8 languages)
+    ├── kokoro-tts.provider.ts     # Default tier (CPU, 53 voices, 8 languages)
    ├── kokoro-tts.provider.spec.ts
    ├── chatterbox-tts.provider.ts # Premium tier (GPU, voice cloning, emotion control)
    ├── chatterbox-tts.provider.spec.ts
--- a/apps/api/src/speech/dto/synthesize.dto.ts
+++ b/apps/api/src/speech/dto/synthesize.dto.ts
@@ -2,7 +2,7 @@
 * SynthesizeDto
 *
 * DTO for text-to-speech synthesis requests.
- * The text field is validated by TextValidationPipe for length/emptiness.
+ * Text and option fields are validated by class-validator decorators.
 * Additional options control voice, speed, format, and tier selection.
 *
 * Issue #398
@@ -10,29 +10,13 @@

 import { IsString, IsOptional, IsNumber, IsIn, Min, Max, MaxLength } from "class-validator";
 import { Type } from "class-transformer";
+import { AUDIO_FORMATS, SPEECH_TIERS } from "../interfaces/speech-types";
 import type { AudioFormat, SpeechTier } from "../interfaces/speech-types";

-/**
- * Valid audio output formats for TTS synthesis.
- */
-const VALID_AUDIO_FORMATS: readonly AudioFormat[] = [
-  "mp3",
-  "wav",
-  "opus",
-  "flac",
-  "aac",
-  "pcm",
-] as const;
-
-/**
- * Valid TTS tiers for provider selection.
- */
-const VALID_SPEECH_TIERS: readonly SpeechTier[] = ["default", "premium", "fallback"] as const;
-
 export class SynthesizeDto {
  /**
   * Text to convert to speech.
-   * Validated separately by TextValidationPipe for length and emptiness.
+   * Validated by class-validator decorators for type and maximum length.
   */
  @IsString({ message: "text must be a string" })
  @MaxLength(4096, { message: "text must not exceed 4096 characters" })
@@ -66,8 +50,8 @@ export class SynthesizeDto {
   */
  @IsOptional()
  @IsString({ message: "format must be a string" })
-  @IsIn(VALID_AUDIO_FORMATS, {
-    message: `format must be one of: ${VALID_AUDIO_FORMATS.join(", ")}`,
+  @IsIn(AUDIO_FORMATS, {
+    message: `format must be one of: ${AUDIO_FORMATS.join(", ")}`,
  })
  format?: AudioFormat;

@@ -78,8 +62,8 @@ export class SynthesizeDto {
   */
  @IsOptional()
  @IsString({ message: "tier must be a string" })
-  @IsIn(VALID_SPEECH_TIERS, {
-    message: `tier must be one of: ${VALID_SPEECH_TIERS.join(", ")}`,
+  @IsIn(SPEECH_TIERS, {
+    message: `tier must be one of: ${SPEECH_TIERS.join(", ")}`,
  })
  tier?: SpeechTier;
 }
--- a/apps/api/src/speech/interfaces/index.ts
+++ b/apps/api/src/speech/interfaces/index.ts
@@ -6,6 +6,7 @@

 export type { ISTTProvider } from "./stt-provider.interface";
 export type { ITTSProvider } from "./tts-provider.interface";
+export { SPEECH_TIERS, AUDIO_FORMATS } from "./speech-types";
 export type {
  SpeechTier,
  AudioFormat,
--- a/apps/api/src/speech/interfaces/speech-types.ts
+++ b/apps/api/src/speech/interfaces/speech-types.ts
@@ -12,19 +12,21 @@
 // ==========================================

 /**
- * TTS provider tier.
+ * Canonical array of TTS provider tiers.
 * Determines which TTS engine is used for synthesis.
 *
 * - default: Primary TTS engine (e.g., Kokoro)
 * - premium: Higher quality TTS engine (e.g., Chatterbox)
 * - fallback: Backup TTS engine (e.g., Piper/OpenedAI)
 */
-export type SpeechTier = "default" | "premium" | "fallback";
+export const SPEECH_TIERS = ["default", "premium", "fallback"] as const;
+export type SpeechTier = (typeof SPEECH_TIERS)[number];

 /**
- * Audio output format for TTS synthesis.
+ * Canonical array of audio output formats for TTS synthesis.
 */
-export type AudioFormat = "mp3" | "wav" | "opus" | "flac" | "aac" | "pcm";
+export const AUDIO_FORMATS = ["mp3", "wav", "opus", "flac", "aac", "pcm"] as const;
+export type AudioFormat = (typeof AUDIO_FORMATS)[number];

 // ==========================================
 // STT Types
--- a/apps/api/src/speech/interfaces/stt-provider.interface.ts
+++ b/apps/api/src/speech/interfaces/stt-provider.interface.ts
@@ -16,7 +16,7 @@ import type { TranscribeOptions, TranscriptionResult } from "./speech-types";
 *
 * @example
 * ```typescript
- * class SpeachesProvider implements ISTTProvider {
+ * class SpeachesSttProvider implements ISTTProvider {
 *   readonly name = "speaches";
 *
 *   async transcribe(audio: Buffer, options?: TranscribeOptions): Promise<TranscriptionResult> {
--- a/apps/api/src/speech/providers/kokoro-tts.provider.ts
+++ b/apps/api/src/speech/providers/kokoro-tts.provider.ts
@@ -5,7 +5,7 @@
 * CPU-based, always available, Apache 2.0 license.
 *
 * Features:
- * - 54 built-in voices across 8 languages
+ * - 53 built-in voices across 8 languages
 * - Speed control: 0.25x to 4.0x
 * - Output formats: mp3, wav, opus, flac
 * - Voice metadata derived from ID prefix (language, gender, accent)
@@ -222,7 +222,7 @@ export function parseVoicePrefix(voiceId: string): VoicePrefixMetadata {
 /**
 * Kokoro-FastAPI TTS provider (default tier).
 *
- * CPU-based text-to-speech engine with 54 built-in voices across 8 languages.
+ * CPU-based text-to-speech engine with 53 built-in voices across 8 languages.
 * Uses the OpenAI-compatible API exposed by Kokoro-FastAPI.
 *
 * @example
@@ -254,7 +254,7 @@ export class KokoroTtsProvider extends BaseTTSProvider {
  /**
   * List all available Kokoro voices with metadata.
   *
-   * Returns the full catalog of 54 built-in voices with language, gender,
+   * Returns the full catalog of 53 built-in voices with language, gender,
   * and accent information derived from voice ID prefixes.
   *
   * @returns Array of VoiceInfo objects for all Kokoro voices
--- a/apps/api/src/speech/providers/piper-tts.provider.ts
+++ b/apps/api/src/speech/providers/piper-tts.provider.ts
@@ -9,7 +9,7 @@
 * - OpenAI-compatible API via OpenedAI Speech server
 * - 100+ Piper voices across 40+ languages
 * - 6 standard OpenAI voice names mapped to Piper voices
- * - Output formats: mp3, wav, opus, flac, aac, pcm
+ * - Output formats: mp3, wav, opus, flac
 * - CPU-only, no GPU required
 * - GPL license (via OpenedAI Speech)
 *
--- a/apps/api/src/speech/providers/tts-provider.factory.ts
+++ b/apps/api/src/speech/providers/tts-provider.factory.ts
@@ -18,7 +18,7 @@ import { ChatterboxTTSProvider } from "./chatterbox-tts.provider";
 import { KokoroTtsProvider } from "./kokoro-tts.provider";
 import { PiperTtsProvider } from "./piper-tts.provider";
 import type { ITTSProvider } from "../interfaces/tts-provider.interface";
-import type { SpeechTier, AudioFormat } from "../interfaces/speech-types";
+import type { SpeechTier } from "../interfaces/speech-types";
 import type { SpeechConfig } from "../speech.config";

 // ==========================================
@@ -44,7 +44,7 @@ export function createTTSProviders(config: SpeechConfig): Map<SpeechTier, ITTSPr
    const provider = new KokoroTtsProvider(
      config.tts.default.url,
      config.tts.default.voice,
-      config.tts.default.format as AudioFormat
+      config.tts.default.format
    );
    providers.set("default", provider);
    logger.log(`Registered default TTS provider: kokoro at ${config.tts.default.url}`);
--- a/apps/api/src/speech/speech.config.ts
+++ b/apps/api/src/speech/speech.config.ts
@@ -12,6 +12,7 @@
 */

 import { registerAs } from "@nestjs/config";
+import type { AudioFormat } from "./interfaces/speech-types";

 // ==========================================
 // Default values
@@ -58,7 +59,7 @@ export interface TtsDefaultConfig {
  enabled: boolean;
  url: string;
  voice: string;
-  format: string;
+  format: AudioFormat;
 }

 export interface TtsPremiumConfig {
@@ -247,7 +248,7 @@ export function getSpeechConfig(): SpeechConfig {
        enabled: isTtsEnabled(),
        url: process.env.TTS_DEFAULT_URL ?? TTS_DEFAULT_DEFAULTS.url,
        voice: process.env.TTS_DEFAULT_VOICE ?? TTS_DEFAULT_DEFAULTS.voice,
-        format: process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format,
+        format: (process.env.TTS_DEFAULT_FORMAT ?? TTS_DEFAULT_DEFAULTS.format) as AudioFormat,
      },
      premium: {
        enabled: isTtsPremiumEnabled(),
--- a/apps/api/src/speech/speech.gateway.ts
+++ b/apps/api/src/speech/speech.gateway.ts
@@ -100,6 +100,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
    const timeoutId = setTimeout(() => {
      if (!authenticatedClient.data.userId) {
        this.logger.warn(`Client ${authenticatedClient.id} timed out during authentication`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication timed out.",
+        });
        authenticatedClient.disconnect();
      }
    }, this.CONNECTION_TIMEOUT_MS);
@@ -109,6 +112,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {

      if (!token) {
        this.logger.warn(`Client ${authenticatedClient.id} connected without token`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication failed: no token provided.",
+        });
        authenticatedClient.disconnect();
        clearTimeout(timeoutId);
        return;
@@ -118,6 +124,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {

      if (!sessionData) {
        this.logger.warn(`Client ${authenticatedClient.id} has invalid token`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication failed: invalid or expired token.",
+        });
        authenticatedClient.disconnect();
        clearTimeout(timeoutId);
        return;
@@ -133,6 +142,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {

      if (!workspaceMembership) {
        this.logger.warn(`User ${userId} has no workspace access`);
+        authenticatedClient.emit("transcription-error", {
+          message: "Authentication failed: no workspace access.",
+        });
        authenticatedClient.disconnect();
        clearTimeout(timeoutId);
        return;
@@ -151,6 +163,9 @@ export class SpeechGateway implements OnGatewayConnection, OnGatewayDisconnect {
        `Authentication failed for speech client ${authenticatedClient.id}:`,
        error instanceof Error ? error.message : "Unknown error"
      );
+      authenticatedClient.emit("transcription-error", {
+        message: "Authentication failed: an unexpected error occurred.",
+      });
      authenticatedClient.disconnect();
    }
  }