From bb24292cf71ff7091a617e923d969e19d4fa9fc2 Mon Sep 17 00:00:00 2001
From: "jason.woltje"
Date: Wed, 22 Apr 2026 02:56:40 +0000
Subject: [PATCH] fix(federation): healthcheck + restart policy for federated-test stacks (#492)

---
Reviewer notes (free-form text after the three-dash separator; not part of
the commit message and not part of the diff):

  * Hunk 1 only adds a dated HEALTHCHECK NOTE to the stack file's header
    comment, describing the changes made by hunks 2 and 3.
  * Hunk 2 changes deploy.restart_policy.condition from on-failure to any.
    delay (5s) and max_attempts (3) are left untouched.
  * Hunk 3 replaces the wget probe with an inline node script that requests
    http://127.0.0.1:3000/health and exits 0 only when the status code is
    200; any other status, or a request error, exits 1. It also raises
    retries from 3 to 5 and start_period from 20s to 60s. interval (30s)
    and timeout (5s) are unchanged.
  * NOTE(review): with condition any, the unchanged max_attempts: 3
    presumably also limits restarts after clean exits; confirm that this
    restart budget is intended.
  * NOTE(review): the leading whitespace of the YAML lines below was
    reconstructed from a whitespace-collapsed copy of this patch; verify
    the indentation against the target file before applying.

 deploy/portainer/federated-test.stack.yml | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/deploy/portainer/federated-test.stack.yml b/deploy/portainer/federated-test.stack.yml
index 7604fad..76b0478 100644
--- a/deploy/portainer/federated-test.stack.yml
+++ b/deploy/portainer/federated-test.stack.yml
@@ -36,6 +36,12 @@
 # tested locally — gateway boots, imports resolve, tier-detector runs.
 # Update digest here when promoting a new build.
 #
+# HEALTHCHECK NOTE (2026-04-21)
+# Switched from busybox wget to node http.get on 127.0.0.1 (not localhost) to
+# avoid IPv6 resolution issues on Alpine. Retries increased to 5 and
+# start_period to 60s to cover the NestJS/GC cold-start window (~40-50s).
+# restart_policy set to `any` so SIGTERM/clean-exit also triggers restart.
+#
 # NOTE: This is a TEST template — production deployments use a separate
 # parameterised template with stricter resource limits and secrets.
 
@@ -76,7 +82,7 @@ services:
     deploy:
       replicas: 1
       restart_policy:
-        condition: on-failure
+        condition: any
         delay: 5s
         max_attempts: 3
       labels:
@@ -88,11 +94,15 @@ services:
         - 'traefik.http.routers.${STACK_NAME}.tls.certresolver=letsencrypt'
         - 'traefik.http.services.${STACK_NAME}.loadbalancer.server.port=3000'
     healthcheck:
-      test: ['CMD', 'wget', '-qO-', 'http://localhost:3000/health']
+      test:
+        - 'CMD'
+        - 'node'
+        - '-e'
+        - "require('http').get('http://127.0.0.1:3000/health',r=>process.exit(r.statusCode===200?0:1)).on('error',()=>process.exit(1))"
       interval: 30s
       timeout: 5s
-      retries: 3
-      start_period: 20s
+      retries: 5
+      start_period: 60s
     depends_on:
       - postgres
       - valkey