docs: add Coolify deployment guide and compose file

Adds docker-compose.coolify.yml (core stack: postgres, valkey, api, web,
coordinator, orchestrator) and deployment documentation with Coolify API
patterns, architecture diagram, and known issues.

Related: #440, #441, #442, #443

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-22 01:22:37 -06:00
parent 17144b1c42
commit 0e2c7e88e6
2 changed files with 417 additions and 0 deletions

280
docker-compose.coolify.yml Normal file
View File

@@ -0,0 +1,280 @@
# ==============================================
# Mosaic Stack — Coolify Core Deployment
# ==============================================
#
# Core services only. For Matrix, speech, and other optional
# services, deploy them as separate Coolify services or extend
# this file.
#
# Usage (Coolify):
# 1. New Resource -> Docker Compose
# 2. Paste this file
# 3. Set environment variables in Coolify UI
# 4. Configure domains for web + api in Coolify UI
# 5. Deploy
#
# NOTE: Traefik labels are NOT included here. Coolify manages
# routing and TLS via its own proxy integration. Configure
# domains in the Coolify service settings.
#
# ==============================================
services:
  # ======================
  # PostgreSQL Database
  # ======================
  # Primary datastore. Only reachable on the internal network; data is kept
  # in the postgres_data named volume.
  postgres:
    image: git.mosaicstack.dev/mosaic/stack-postgres:${IMAGE_TAG:-latest}
    restart: unless-stopped
    environment:
      - POSTGRES_USER=${POSTGRES_USER}
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
      - POSTGRES_DB=${POSTGRES_DB}
      - POSTGRES_SHARED_BUFFERS=${POSTGRES_SHARED_BUFFERS:-256MB}
      - POSTGRES_EFFECTIVE_CACHE_SIZE=${POSTGRES_EFFECTIVE_CACHE_SIZE:-1GB}
      - POSTGRES_MAX_CONNECTIONS=${POSTGRES_MAX_CONNECTIONS:-100}
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      # api waits on this check (depends_on: condition: service_healthy).
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    networks:
      - internal

  # ======================
  # Valkey Cache
  # ======================
  # Redis-protocol cache used by api and orchestrator (VALKEY_URL).
  # Runs with append-only persistence into the valkey_data volume.
  valkey:
    image: valkey/valkey:8-alpine
    restart: unless-stopped
    command:
      - valkey-server
      - --maxmemory ${VALKEY_MAXMEMORY:-256mb}
      - --maxmemory-policy noeviction
      - --appendonly yes
    volumes:
      - valkey_data:/data
    healthcheck:
      test: ["CMD", "valkey-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s
    networks:
      - internal

  # ======================
  # Mosaic API
  # ======================
  # Backend API. Starts only after postgres and valkey report healthy.
  api:
    image: git.mosaicstack.dev/mosaic/stack-api:${IMAGE_TAG:-latest}
    restart: unless-stopped
    environment:
      # Coolify domain assignment (magic variable — tells Coolify this service gets a domain on port 3001)
      - SERVICE_FQDN_API_3001
      - NODE_ENV=production
      - PORT=${API_PORT:-3001}
      - API_HOST=${API_HOST:-0.0.0.0}
      # Database
      - DATABASE_URL=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB}
      # Cache
      - VALKEY_URL=redis://valkey:6379
      # Auth (external Authentik — optional)
      - OIDC_ENABLED=${OIDC_ENABLED:-false}
      - OIDC_ISSUER=${OIDC_ISSUER:-}
      - OIDC_CLIENT_ID=${OIDC_CLIENT_ID:-}
      - OIDC_CLIENT_SECRET=${OIDC_CLIENT_SECRET:-}
      - OIDC_REDIRECT_URI=${OIDC_REDIRECT_URI:-}
      # JWT
      - JWT_SECRET=${JWT_SECRET}
      - JWT_EXPIRATION=${JWT_EXPIRATION:-24h}
      # Better Auth
      - BETTER_AUTH_SECRET=${BETTER_AUTH_SECRET}
      # Required in production (see docs/COOLIFY-DEPLOYMENT.md): the API will
      # not start when BETTER_AUTH_URL is left empty.
      - BETTER_AUTH_URL=${BETTER_AUTH_URL:-}
      - CSRF_SECRET=${CSRF_SECRET}
      - COOKIE_DOMAIN=${COOKIE_DOMAIN:-}
      # Encryption
      - ENCRYPTION_KEY=${ENCRYPTION_KEY}
      # External services (optional — leave empty to disable)
      - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-}
      - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.2}
      - OPENBAO_ADDR=${OPENBAO_ADDR:-}
      # Knowledge module
      - KNOWLEDGE_CACHE_ENABLED=${KNOWLEDGE_CACHE_ENABLED:-true}
      - KNOWLEDGE_CACHE_TTL=${KNOWLEDGE_CACHE_TTL:-300}
      - SEMANTIC_SEARCH_SIMILARITY_THRESHOLD=${SEMANTIC_SEARCH_SIMILARITY_THRESHOLD:-0.5}
      # Rate limiting
      - RATE_LIMIT_TTL=${RATE_LIMIT_TTL:-60}
      - RATE_LIMIT_GLOBAL_LIMIT=${RATE_LIMIT_GLOBAL_LIMIT:-100}
      - RATE_LIMIT_STORAGE=${RATE_LIMIT_STORAGE:-redis}
      # Speech services (disabled — not in core stack)
      - STT_ENABLED=${STT_ENABLED:-false}
      - TTS_ENABLED=${TTS_ENABLED:-false}
      # Matrix bridge (disabled — not in core stack)
      - MATRIX_ACCESS_TOKEN=${MATRIX_ACCESS_TOKEN:-}
      # Telemetry (disabled by default)
      - MOSAIC_TELEMETRY_ENABLED=${MOSAIC_TELEMETRY_ENABLED:-false}
      - MOSAIC_TELEMETRY_SERVER_URL=${MOSAIC_TELEMETRY_SERVER_URL:-}
      - MOSAIC_TELEMETRY_API_KEY=${MOSAIC_TELEMETRY_API_KEY:-}
      - MOSAIC_TELEMETRY_INSTANCE_ID=${MOSAIC_TELEMETRY_INSTANCE_ID:-}
      - MOSAIC_TELEMETRY_DRY_RUN=${MOSAIC_TELEMETRY_DRY_RUN:-false}
      # Frontend URLs (for CORS and auth redirects)
      - NEXT_PUBLIC_APP_URL=${NEXT_PUBLIC_APP_URL}
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
      - TRUSTED_ORIGINS=${TRUSTED_ORIGINS:-}
    depends_on:
      postgres:
        condition: service_healthy
      valkey:
        condition: service_healthy
    healthcheck:
      # Healthy only when GET /health returns HTTP 200. The probed port follows
      # API_PORT so it stays in step with PORT above.
      test:
        [
          "CMD-SHELL",
          'node -e "require(''http'').get(''http://localhost:${API_PORT:-3001}/health'', (r) => {process.exit(r.statusCode === 200 ? 0 : 1)})"',
        ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    networks:
      - internal

  # ======================
  # Mosaic Web
  # ======================
  # Web frontend. Starts only after api reports healthy.
  web:
    image: git.mosaicstack.dev/mosaic/stack-web:${IMAGE_TAG:-latest}
    restart: unless-stopped
    environment:
      # Coolify domain assignment (magic variable — tells Coolify this service gets a domain on port 3000)
      - SERVICE_FQDN_WEB_3000
      - NODE_ENV=production
      - PORT=${WEB_PORT:-3000}
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
      - NEXT_PUBLIC_APP_URL=${NEXT_PUBLIC_APP_URL}
      - NEXT_PUBLIC_ORCHESTRATOR_URL=${NEXT_PUBLIC_ORCHESTRATOR_URL:-}
      - NEXT_PUBLIC_AUTH_MODE=${NEXT_PUBLIC_AUTH_MODE:-real}
      - ORCHESTRATOR_API_KEY=${ORCHESTRATOR_API_KEY:-}
    depends_on:
      api:
        condition: service_healthy
    healthcheck:
      # Healthy only when GET / returns HTTP 200 on WEB_PORT.
      test:
        [
          "CMD-SHELL",
          'node -e "require(''http'').get(''http://localhost:${WEB_PORT:-3000}'', (r) => {process.exit(r.statusCode === 200 ? 0 : 1)})"',
        ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    networks:
      - internal

  # ======================
  # Mosaic Coordinator
  # ======================
  # Coordinator service listening on 0.0.0.0:8000. Declares no depends_on,
  # so it starts independently of the other services.
  coordinator:
    image: git.mosaicstack.dev/mosaic/stack-coordinator:${IMAGE_TAG:-latest}
    restart: unless-stopped
    environment:
      - GITEA_WEBHOOK_SECRET=${GITEA_WEBHOOK_SECRET:-}
      - GITEA_URL=${GITEA_URL:-}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - LOG_LEVEL=${LOG_LEVEL:-info}
      - HOST=0.0.0.0
      - PORT=8000
      - COORDINATOR_POLL_INTERVAL=${COORDINATOR_POLL_INTERVAL:-5.0}
      - COORDINATOR_MAX_CONCURRENT_AGENTS=${COORDINATOR_MAX_CONCURRENT_AGENTS:-10}
      - COORDINATOR_ENABLED=${COORDINATOR_ENABLED:-true}
      # Telemetry
      - MOSAIC_TELEMETRY_ENABLED=${MOSAIC_TELEMETRY_ENABLED:-false}
      - MOSAIC_TELEMETRY_SERVER_URL=${MOSAIC_TELEMETRY_SERVER_URL:-}
      - MOSAIC_TELEMETRY_API_KEY=${MOSAIC_TELEMETRY_API_KEY:-}
      - MOSAIC_TELEMETRY_INSTANCE_ID=${MOSAIC_TELEMETRY_INSTANCE_ID:-}
      - MOSAIC_TELEMETRY_DRY_RUN=${MOSAIC_TELEMETRY_DRY_RUN:-false}
    healthcheck:
      # Exec form (no shell): probes /health on the fixed port 8000 set above.
      test:
        [
          "CMD",
          "python",
          "-c",
          "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')",
        ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 5s
    networks:
      - internal

  # ======================
  # Mosaic Orchestrator
  # ======================
  # Orchestrator service. Runs as UID/GID 1000 with all capabilities dropped
  # except NET_BIND_SERVICE, no-new-privileges, and a noexec tmpfs /tmp.
  # Starts only after valkey and api report healthy.
  orchestrator:
    image: git.mosaicstack.dev/mosaic/stack-orchestrator:${IMAGE_TAG:-latest}
    restart: unless-stopped
    user: "1000:1000"
    environment:
      - NODE_ENV=production
      - ORCHESTRATOR_PORT=3001
      - AI_PROVIDER=${AI_PROVIDER:-ollama}
      - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-}
      - OLLAMA_MODEL=${OLLAMA_MODEL:-llama3.2}
      - VALKEY_URL=redis://valkey:6379
      - VALKEY_HOST=valkey
      - VALKEY_PORT=6379
      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
      - ORCHESTRATOR_API_KEY=${ORCHESTRATOR_API_KEY:-}
      - DOCKER_SOCKET=/var/run/docker.sock
      - GIT_USER_NAME=Mosaic Orchestrator
      - GIT_USER_EMAIL=orchestrator@mosaicstack.dev
      - KILLSWITCH_ENABLED=true
      - SANDBOX_ENABLED=true
    volumes:
      # NOTE(review): ":ro" makes the bind mount read-only; it presumably does
      # not limit which Docker API calls can be sent over the socket — confirm
      # this matches the intended sandboxing. Also verify that UID/GID 1000 is
      # permitted to open the socket on the Coolify host.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - orchestrator_workspace:/workspace
    depends_on:
      valkey:
        condition: service_healthy
      api:
        condition: service_healthy
    healthcheck:
      # Port is hard-coded to match ORCHESTRATOR_PORT=3001 above.
      test:
        [
          "CMD-SHELL",
          'node -e "require(''http'').get(''http://localhost:3001/health'', (r) => {process.exit(r.statusCode === 200 ? 0 : 1)})"',
        ]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    networks:
      - internal
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
    tmpfs:
      - /tmp:noexec,nosuid,size=100m

# ======================
# Volumes
# ======================
# Named volumes with default driver settings.
volumes:
  postgres_data:
  valkey_data:
  orchestrator_workspace:

# ======================
# Networks
# ======================
# Single bridge network shared by every service in this stack.
networks:
  internal:
    driver: bridge

137
docs/COOLIFY-DEPLOYMENT.md Normal file
View File

@@ -0,0 +1,137 @@
# Mosaic Stack — Coolify Deployment
## Overview
Coolify deployment on VM `10.1.1.44` (Proxmox). Replaces the Docker Swarm deployment on w-docker0 (`10.1.1.45`).
## Architecture
```
Internet → Cloudflare → Public IP (174.137.97.162)
→ Main Traefik (10.1.1.43) — TCP TLS passthrough for *.woltje.com
→ Coolify Traefik (10.1.1.44) — terminates TLS via Cloudflare DNS-01 wildcard certs
→ Service containers
```
## Services (Core Stack)
| Service | Image | Internal Port | External Domain |
| ------------ | ----------------------------------------------- | --------------- | ----------------------- |
| postgres | `git.mosaicstack.dev/mosaic/stack-postgres` | 5432 | — |
| valkey | `valkey/valkey:8-alpine` | 6379 | — |
| api | `git.mosaicstack.dev/mosaic/stack-api` | 3001 | `api.mosaic.woltje.com` |
| web | `git.mosaicstack.dev/mosaic/stack-web` | 3000 | `mosaic.woltje.com` |
| coordinator | `git.mosaicstack.dev/mosaic/stack-coordinator` | 8000 | — |
| orchestrator | `git.mosaicstack.dev/mosaic/stack-orchestrator` | 3001 (internal) | — |
Matrix (synapse, element-web) and speech services (speaches, kokoro-tts) are NOT included in the core stack. Deploy separately if needed.
## Compose File
`docker-compose.coolify.yml` in the repo root. This is the Coolify-compatible version of the deployment compose.
Key differences from the Swarm compose (`docker-compose.swarm.portainer.yml`):
- No `deploy:` blocks (Swarm-only)
- No Traefik labels (Coolify manages routing)
- Bridge network instead of overlay
- `restart: unless-stopped` instead of Swarm restart policies
- `SERVICE_FQDN_*` magic environment variables for Coolify domain assignment
- List-style environment syntax (required for Coolify magic vars)
## Coolify IDs
| Resource | UUID |
| ----------- | -------------------------- |
| Project | `rs04g008kgkkw4s0wgsk40w4` |
| Environment | `gko8csc804g8og0oosc8ccs8` |
| Service | `ug0ssok4g44wocok8kws8gg8` |
| Server | `as8kcogk08skskkcsok888g4` |
### Application UUIDs
| App | UUID |
| ------------ | --------------------------- |
| postgres | `jcw0ogskkw040os48ggkgkc8` |
| valkey | `skssgwcggc0c8owoogcso8og` |
| api | `mc40cgwwo8okwwoko84408k4k` |
| web | `c48gcwgc40ok44scscowc8cc` |
| coordinator | `s8gwog4c44w08c8sgkcg04k8` |
| orchestrator | `uo4wkg88co0ckc4c4k44sowc` |
## Coolify API
Base URL: `http://10.1.1.44:8000/api/v1`
Auth: Bearer token from `credentials.json` → `coolify.app_token`
### Patterns & Gotchas
- **Compose must be base64-encoded** when sending via `docker_compose_raw` field
- **`SERVICE_FQDN_*` magic vars**: Coolify reads these from the compose to auto-assign domains. Format: `SERVICE_FQDN_{NAME}_{PORT}` (e.g., `SERVICE_FQDN_API_3001`). Must use list-style env syntax (`- SERVICE_FQDN_API_3001`), NOT dict-style.
- **FQDN updates on sub-applications**: Coolify API doesn't support updating FQDNs on compose service sub-apps via REST. Workaround: update directly in Coolify's PostgreSQL DB (`coolify-db` container, `service_applications` table).
- **Environment variable management**: Use `PATCH /api/v1/services/{uuid}/envs` with `{ "key": "VAR_NAME", "value": "val", "is_preview": false }`
- **Service start**: `POST /api/v1/services/{uuid}/start`
- **Coolify uses PostgreSQL** (not SQLite) for its internal database — container `coolify-db`
### DB Access (for workarounds)
```bash
ssh localadmin@10.1.1.44
docker exec -it coolify-db psql -U coolify -d coolify
-- Check service app FQDNs
SELECT name, fqdn FROM service_applications WHERE service_id = (
SELECT id FROM services WHERE uuid = 'ug0ssok4g44wocok8kws8gg8'
);
```
## Environment Variables
All env vars are set via Coolify API and stored in `/data/coolify/services/{uuid}/.env` on the node.
Critical vars that were missing initially:
- `BETTER_AUTH_URL` — **Required** in production. API won't start without it. Set to `https://api.mosaic.woltje.com`.
## Current State (2026-02-22)
### Working
- All 6 containers running and healthy
- API health endpoint responds at `https://api.mosaic.woltje.com/health`
- Database migrations completed
- Inter-service networking (api→postgres, api→valkey) confirmed via health checks
### Issues
1. **DNS: `mosaic.woltje.com` points to wrong server**
- Resolves to `10.1.1.45` (old Swarm node) instead of through Cloudflare (`174.137.97.162`)
- `api.mosaic.woltje.com` resolves correctly through Cloudflare
- Fix: Update Cloudflare DNS A record for `mosaic.woltje.com`
2. **Coordinator: OTLP exporter noise**
- Trying to export traces to `localhost:4318` which doesn't exist
- Container is healthy, errors are non-critical
- Fix: Set `MOSAIC_TELEMETRY_ENABLED=false` in Coolify env vars, or deploy an OTLP collector
3. **Coolify managed lifecycle**
- CoolifyTask was failing when starting the service via API/UI
- Containers were started manually via `docker compose up -d` from the service directory
- Coolify recognizes the containers (correct naming convention) but may not properly manage restarts/redeploys
- Needs investigation: check Coolify task logs, verify compose processing
4. **Full connectivity verification needed**
- web→api communication untested (blocked by DNS issue)
- Orchestrator→valkey and orchestrator→api connectivity unverified
- Coordinator webhook endpoint untested
## SSH Access
```bash
ssh localadmin@10.1.1.44
# Note: localadmin cannot sudo without TTY/password
# Use docker to access files:
docker run --rm -v /data/coolify/services:/srv alpine cat /srv/{uuid}/docker-compose.yml
# Use docker exec for Coolify DB:
docker exec -it coolify-db psql -U coolify -d coolify
```