fix(federation): healthcheck + restart policy for federated-test stacks (#492)
This commit was merged in pull request #492.
This commit is contained in:
@@ -36,6 +36,12 @@
|
|||||||
# tested locally — gateway boots, imports resolve, tier-detector runs.
|
# tested locally — gateway boots, imports resolve, tier-detector runs.
|
||||||
# Update digest here when promoting a new build.
|
# Update digest here when promoting a new build.
|
||||||
#
|
#
|
||||||
|
# HEALTHCHECK NOTE (2026-04-21)
|
||||||
|
# Switched from busybox wget to node http.get on 127.0.0.1 (not localhost) to
|
||||||
|
# avoid IPv6 resolution issues on Alpine. Retries increased to 5 and
|
||||||
|
# start_period to 60s to cover the NestJS/GC cold-start window (~40-50s).
|
||||||
|
# restart_policy condition set to `any` so SIGTERM/clean exits also trigger a restart.
|
||||||
|
#
|
||||||
# NOTE: This is a TEST template — production deployments use a separate
|
# NOTE: This is a TEST template — production deployments use a separate
|
||||||
# parameterised template with stricter resource limits and secrets.
|
# parameterised template with stricter resource limits and secrets.
|
||||||
|
|
||||||
@@ -76,7 +82,7 @@ services:
|
|||||||
deploy:
|
deploy:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
restart_policy:
|
restart_policy:
|
||||||
condition: on-failure
|
condition: any
|
||||||
delay: 5s
|
delay: 5s
|
||||||
max_attempts: 3
|
max_attempts: 3
|
||||||
labels:
|
labels:
|
||||||
@@ -88,11 +94,15 @@ services:
|
|||||||
- 'traefik.http.routers.${STACK_NAME}.tls.certresolver=letsencrypt'
|
- 'traefik.http.routers.${STACK_NAME}.tls.certresolver=letsencrypt'
|
||||||
- 'traefik.http.services.${STACK_NAME}.loadbalancer.server.port=3000'
|
- 'traefik.http.services.${STACK_NAME}.loadbalancer.server.port=3000'
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ['CMD', 'wget', '-qO-', 'http://localhost:3000/health']
|
test:
|
||||||
|
- 'CMD'
|
||||||
|
- 'node'
|
||||||
|
- '-e'
|
||||||
|
- "require('http').get('http://127.0.0.1:3000/health',r=>process.exit(r.statusCode===200?0:1)).on('error',()=>process.exit(1))"
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 3
|
retries: 5
|
||||||
start_period: 20s
|
start_period: 60s
|
||||||
depends_on:
|
depends_on:
|
||||||
- postgres
|
- postgres
|
||||||
- valkey
|
- valkey
|
||||||
|
|||||||
Reference in New Issue
Block a user