fix(federation): healthcheck + restart policy for federated-test stacks (#492)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/push/publish Pipeline was successful

This commit was merged in pull request #492.
This commit is contained in:
2026-04-22 02:56:40 +00:00
parent f2cda52e1a
commit bb24292cf7

View File

@@ -36,6 +36,12 @@
# tested locally — gateway boots, imports resolve, tier-detector runs.
# Update digest here when promoting a new build.
#
# HEALTHCHECK NOTE (2026-04-21)
# Switched from busybox wget to node http.get on 127.0.0.1 (not localhost) to
# avoid IPv6 resolution issues on Alpine. Retries increased from 3 to 5 and
# start_period from 20s to 60s to cover the NestJS/GC cold-start window
# (~40-50s).
# restart_policy condition changed from `on-failure` to `any` so that a
# SIGTERM/clean exit also triggers a restart.
#
# NOTE: This is a TEST template — production deployments use a separate
# parameterised template with stricter resource limits and secrets.
@@ -76,7 +82,7 @@ services:
deploy:
replicas: 1
restart_policy:
condition: on-failure
condition: any
delay: 5s
max_attempts: 3
labels:
@@ -88,11 +94,15 @@ services:
- 'traefik.http.routers.${STACK_NAME}.tls.certresolver=letsencrypt'
- 'traefik.http.services.${STACK_NAME}.loadbalancer.server.port=3000'
healthcheck:
test: ['CMD', 'wget', '-qO-', 'http://localhost:3000/health']
test:
- 'CMD'
- 'node'
- '-e'
- "require('http').get('http://127.0.0.1:3000/health',r=>process.exit(r.statusCode===200?0:1)).on('error',()=>process.exit(1))"
interval: 30s
timeout: 5s
retries: 3
start_period: 20s
retries: 5
start_period: 60s
depends_on:
- postgres
- valkey