# stack/docker-compose.speech.yml

# ==============================================
# Speech Services - Docker Compose Dev Overlay
# ==============================================
#
# Adds STT and TTS services for local development.
#
# Usage:
#   Basic (STT + default TTS):
#     docker compose -f docker-compose.yml -f docker-compose.speech.yml up -d
#
#   With premium TTS (requires GPU):
#     docker compose -f docker-compose.yml -f docker-compose.speech.yml --profile premium-tts up -d
#
#   Or use Makefile targets:
#     make speech-up              # Basic speech services
#     make speech-down            # Stop speech services
#     make speech-logs            # View speech service logs
# ==============================================

services:
  # ======================
  # Speaches (STT + basic TTS)
  # ======================
  speaches:
    # Speaches container; the labels below describe it as Whisper STT plus basic TTS.
    container_name: mosaic-speaches
    image: ghcr.io/speaches-ai/speaches:latest
    restart: unless-stopped
    labels:
      com.mosaic.service: "speech-stt"
      com.mosaic.description: "Speaches STT (Whisper) and basic TTS"
    networks:
      - mosaic-internal
    ports:
      # Host port comes from SPEACHES_PORT (default 8090); container port is 8000.
      - "${SPEACHES_PORT:-8090}:8000"
    environment:
      # Override the model with SPEACHES_WHISPER_MODEL; otherwise the default below is used.
      WHISPER__MODEL: "${SPEACHES_WHISPER_MODEL:-Systran/faster-whisper-large-v3-turbo}"
    volumes:
      # Named volume mounted at /root/.cache/huggingface
      # (presumably where model downloads are cached - confirm against the image).
      - speaches_models:/root/.cache/huggingface
    healthcheck:
      # Runs inside the container, so the image must provide curl.
      test:
        - "CMD-SHELL"
        - "curl -f http://localhost:8000/health || exit 1"
      # Failures during the first 120s do not count toward the 5 retries.
      start_period: 120s
      interval: 30s
      timeout: 10s
      retries: 5

  # ======================
  # Kokoro TTS (Default TTS)
  # ======================
  kokoro-tts:
    # Kokoro FastAPI TTS engine, using the image's CPU tag.
    container_name: mosaic-kokoro-tts
    image: ghcr.io/remsky/kokoro-fastapi:latest-cpu
    restart: unless-stopped
    labels:
      com.mosaic.service: "speech-tts"
      com.mosaic.description: "Kokoro FastAPI TTS engine"
    networks:
      - mosaic-internal
    ports:
      # Host port comes from KOKORO_TTS_PORT (default 8880); container port is 8880.
      - "${KOKORO_TTS_PORT:-8880}:8880"
    healthcheck:
      # Runs inside the container, so the image must provide curl.
      test:
        - "CMD-SHELL"
        - "curl -f http://localhost:8880/health || exit 1"
      # Failures during the first 120s do not count toward the 5 retries.
      start_period: 120s
      interval: 30s
      timeout: 10s
      retries: 5

  # ======================
  # Chatterbox TTS (Premium TTS - Optional)
  # ======================
  # Only starts when the profile is enabled: --profile premium-tts
  # Requires an NVIDIA GPU and the Docker NVIDIA runtime
  chatterbox-tts:
    # Optional service: only created when the premium-tts profile is active.
    profiles:
      - premium-tts
    container_name: mosaic-chatterbox-tts
    image: devnen/chatterbox-tts-server:latest
    restart: unless-stopped
    labels:
      com.mosaic.service: "speech-tts-premium"
      com.mosaic.description: "Chatterbox premium TTS with voice cloning (GPU)"
    networks:
      - mosaic-internal
    ports:
      # Host port comes from CHATTERBOX_TTS_PORT (default 8881); container port is 8000.
      # NOTE(review): this mapping and the healthcheck both assume the server
      # listens on 8000 inside the container - confirm against the image.
      - "${CHATTERBOX_TTS_PORT:-8881}:8000"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    healthcheck:
      # Runs inside the container, so the image must provide curl.
      test:
        - "CMD-SHELL"
        - "curl -f http://localhost:8000/health || exit 1"
      # Longer grace period than the other speech services (180s vs 120s).
      start_period: 180s
      interval: 30s
      timeout: 10s
      retries: 5

# ======================
# Volumes
# ======================
volumes:
  # Mounted by the speaches service at /root/.cache/huggingface.
  # The explicit name keeps the volume from being prefixed with the project name.
  speaches_models:
    driver: local
    name: mosaic-speaches-models

# ======================
# Networks
# ======================
networks:
  # External network: Compose does not create it, so it must already exist
  # before these services are started.
  mosaic-internal:
    name: mosaic-internal
    external: true