feat(#400): add Docker Compose swarm/prod deployment for speech services

Add docker/docker-compose.sample.speech.yml for standalone speech services deployment in Docker Swarm with Portainer compatibility: - Speaches (STT + basic TTS) with Whisper model configuration - Kokoro TTS (default high-quality TTS) always deployed - Chatterbox TTS (premium, GPU) commented out as optional - Traefik labels for reverse proxy routing with TLS - Health checks on all services - Volume persistence for Whisper models - GPU reservation via Swarm generic resources for Chatterbox - Environment variable substitution for Portainer - Comprehensive header documentation Fixes #400
2026-02-15 02:51:13 -06:00
parent 527262af38
commit b3d6d73348
1 changed files with 164 additions and 0 deletions
--- a/docker/docker-compose.sample.speech.yml
+++ b/docker/docker-compose.sample.speech.yml
@@ -0,0 +1,164 @@
+# ==============================================
+# Speech Services - Sample Swarm Deployment
+# ==============================================
+#
+# Standalone speech services deployment for use with Mosaic Stack.
+# This is SEPARATE infrastructure — not part of the Mosaic Stack itself.
+# Mosaic connects to it via SPEACHES_URL and TTS_URL environment variables.
+#
+# Provides:
+#   - Speaches: Speech-to-Text (Whisper) + basic TTS fallback
+#   - Kokoro TTS: Default high-quality text-to-speech
+#   - Chatterbox TTS: Premium TTS with voice cloning (optional, requires GPU)
+#
+# Usage (Docker Swarm via Portainer):
+#   1. Create a new stack in Portainer
+#   2. Paste this file or point to the repo
+#   3. Set environment variables in Portainer's env var section
+#   4. Deploy the stack
+#
+# Usage (Docker Swarm CLI):
+#   1. Create .env file with variables below and export it (stack deploy ignores .env): set -a; . ./.env; set +a
+#   2. docker stack deploy -c docker-compose.sample.speech.yml speech
+#
+# Required Environment Variables:
+#   STT_DOMAIN=stt.example.com              # Domain for Speaches (STT + basic TTS)
+#   TTS_DOMAIN=tts.example.com              # Domain for Kokoro TTS (default TTS)
+#
+# Optional Environment Variables:
+#   WHISPER_MODEL=Systran/faster-whisper-large-v3-turbo  # Whisper model for STT
+#   CHATTERBOX_TTS_DOMAIN=tts-premium.example.com       # Domain for Chatterbox (premium TTS)
+#   TRAEFIK_ENTRYPOINT=websecure            # Traefik entrypoint name
+#   TRAEFIK_CERTRESOLVER=letsencrypt        # Traefik cert resolver
+#   TRAEFIK_DOCKER_NETWORK=traefik-public   # Traefik network name
+#   TRAEFIK_TLS_ENABLED=true                # Enable TLS on Traefik routers
+#
+# Connecting to Mosaic Stack:
+#   Add to your Mosaic Stack .env:
+#     SPEACHES_URL=http://speaches:8000      (if same Docker network)
+#     SPEACHES_URL=https://stt.example.com   (if external)
+#     TTS_URL=http://kokoro-tts:8880         (if same Docker network)
+#     TTS_URL=https://tts.example.com        (if external)
+#
+# GPU Requirements (Chatterbox only):
+#   - NVIDIA GPU with CUDA support
+#   - nvidia-container-toolkit installed on Docker host
+#   - Docker runtime configured for GPU access
+#   - Note: Docker Swarm requires "generic resources" for GPU scheduling.
+#     See: https://docs.docker.com/engine/daemon/nvidia-gpu/#configure-gpus-for-docker-swarm
+#
+# ==============================================
+
+services:
+  # ======================
+  # Speaches (STT + basic TTS)
+  # ======================
+  # Primary speech-to-text service using Whisper; also provides basic TTS as a fallback.
+  # STT_DOMAIN is required: the deploy aborts with an error if it is unset or empty.
+  speaches:
+    image: ghcr.io/speaches-ai/speaches:latest
+    environment:
+      WHISPER__MODEL: "${WHISPER_MODEL:-Systran/faster-whisper-large-v3-turbo}"
+    volumes:
+      - speaches-models:/root/.cache/huggingface  # model cache volume; NOTE(review): confirm the image's cache path
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 120s  # failures inside this window do not count toward retries
+    networks:
+      - internal
+      - traefik-public
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 10s
+      labels:  # placed under deploy: so they are attached to the Swarm service
+        - "traefik.enable=true"
+        - "traefik.http.routers.speech-stt.rule=Host(`${STT_DOMAIN:?STT_DOMAIN must be set}`)"
+        - "traefik.http.routers.speech-stt.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
+        - "traefik.http.routers.speech-stt.tls=${TRAEFIK_TLS_ENABLED:-true}"
+        - "traefik.http.routers.speech-stt.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}"
+        - "traefik.http.services.speech-stt.loadbalancer.server.port=8000"
+        - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}"
+
+  # ======================
+  # Kokoro TTS (Default TTS)
+  # ======================
+  # High-quality TTS, always deployed alongside Speaches. Deploy aborts if TTS_DOMAIN is unset or empty.
+  kokoro-tts:
+    image: ghcr.io/remsky/kokoro-fastapi:latest-cpu  # CPU variant, per the image tag
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:8880/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 120s  # failures inside this window do not count toward retries
+    networks:
+      - internal
+      - traefik-public
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 10s
+      labels:  # placed under deploy: so they are attached to the Swarm service
+        - "traefik.enable=true"
+        - "traefik.http.routers.speech-tts.rule=Host(`${TTS_DOMAIN:?TTS_DOMAIN must be set}`)"
+        - "traefik.http.routers.speech-tts.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
+        - "traefik.http.routers.speech-tts.tls=${TRAEFIK_TLS_ENABLED:-true}"
+        - "traefik.http.routers.speech-tts.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}"
+        - "traefik.http.services.speech-tts.loadbalancer.server.port=8880"
+        - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}"
+
+  # ======================
+  # Chatterbox TTS (Premium TTS - Optional)
+  # ======================
+  # Premium TTS with voice cloning capabilities. Requires NVIDIA GPU.
+  #
+  # To enable: Uncomment this service and set CHATTERBOX_TTS_DOMAIN.
+  #
+  # For Docker Swarm GPU scheduling, configure generic resources on the node:
+  #   /etc/docker/daemon.json:
+  #     { "runtimes": { "nvidia": { ... } },
+  #       "node-generic-resources": ["NVIDIA-GPU=<GPU-UUID from nvidia-smi -L>"] }
+  #
+  # chatterbox-tts:
+  #   image: devnen/chatterbox-tts-server:latest
+  #   healthcheck:
+  #     test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
+  #     interval: 30s
+  #     timeout: 10s
+  #     retries: 5
+  #     start_period: 180s
+  #   networks:
+  #     - internal
+  #     - traefik-public
+  #   deploy:
+  #     restart_policy:
+  #       condition: on-failure
+  #       delay: 10s
+  #     resources:
+  #       reservations:
+  #         generic_resources:
+  #           - discrete_resource_spec:
+  #               kind: "NVIDIA-GPU"
+  #               value: 1
+  #     labels:
+  #       - "traefik.enable=true"
+  #       - "traefik.http.routers.speech-tts-premium.rule=Host(`${CHATTERBOX_TTS_DOMAIN}`)"
+  #       - "traefik.http.routers.speech-tts-premium.entrypoints=${TRAEFIK_ENTRYPOINT:-websecure}"
+  #       - "traefik.http.routers.speech-tts-premium.tls=${TRAEFIK_TLS_ENABLED:-true}"
+  #       - "traefik.http.routers.speech-tts-premium.tls.certresolver=${TRAEFIK_CERTRESOLVER:-}"
+  #       - "traefik.http.services.speech-tts-premium.loadbalancer.server.port=8000"
+  #       - "traefik.docker.network=${TRAEFIK_DOCKER_NETWORK:-traefik-public}"
+
+volumes:
+  speaches-models: {}  # named volume with default settings; mounted by the speaches service
+
+networks:
+  internal:  # overlay network created for this stack
+    driver: "overlay"
+  traefik-public:  # not created by this stack; must already exist under the name below
+    name: "${TRAEFIK_DOCKER_NETWORK:-traefik-public}"
+    external: true