feat(fleet): launcher heartbeat sidecar — HB for all runtimes (pi/claude/codex)
Some checks failed
ci/woodpecker/push/ci Pipeline was canceled
ci/woodpecker/pr/ci Pipeline was canceled

Replace the terminal `exec tmux` with a plain `tmux new-session -d` so the
launcher continues running after creating the pane. The script then resolves
the pane PID via `tmux list-panes -F '#{pane_pid}'` (with a brief retry loop)
and spawns a detached, runtime-agnostic heartbeat sidecar via `setsid bash -c
... &` + `disown`. The sidecar loops while `kill -0 <pane_pid>` succeeds,
writing ~/.config/mosaic/fleet/run/<AGENT>.hb atomically (tmp + mv) every
MOSAIC_HEARTBEAT_INTERVAL seconds (default 15), then exits naturally when the
runtime process dies — making `mosaic fleet ps` show stale then dead.
HB_RUN_DIR and interval are configurable via env; sidecar startup is
best-effort (failures warn but do not abort the launch). Two new shell tests
cover pane-PID resolution (test 6, real tmux) and sidecar invocation
correctness (test 7, fake-tmux + fake-setsid shims).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
This commit is contained in:
Jarvis
2026-06-21 15:52:20 -05:00
parent afcbbb302f
commit b50a062021
2 changed files with 186 additions and 1 deletions

View File

@@ -50,6 +50,8 @@ grep -qF 'already running' /tmp/mosaic-start-agent-idempotent.out || fail "dupli
# - Intercepts 'new-session' calls and records its arguments to a file.
# - For 'has-session' calls, exits 1 (session does not exist) so the script
# proceeds to launch instead of printing "already running".
# - For 'list-panes' calls, returns a synthetic PID so the heartbeat sidecar
# path is exercised without needing a real tmux session.
# - For all other subcommands, exits 0.
#
# Assertions:
@@ -74,6 +76,10 @@ if [ "\$subcmd" = "new-session" ]; then
printf '%s\n' "\$@" > "$TMUX_ARGS_FILE"
exit 0
fi
if [ "\$subcmd" = "list-panes" ]; then
echo "99999" # synthetic pane PID for heartbeat path
exit 0
fi
exit 0
SHIM
chmod +x "$FAKE_BIN/tmux"
@@ -122,6 +128,10 @@ if [ "\$subcmd" = "new-session" ]; then
printf '%s\n' "\$@" > "$TMUX_ARGS_FILE2"
exit 0
fi
if [ "\$subcmd" = "list-panes" ]; then
echo "99998"
exit 0
fi
exit 0
SHIM2
chmod +x "$FAKE_BIN2/tmux"
@@ -171,6 +181,10 @@ if [ "\$subcmd" = "new-session" ]; then
printf '%s\n' "\$@" > "$TMUX_ARGS_FILE5"
exit 0
fi
if [ "\$subcmd" = "list-panes" ]; then
echo "99997"
exit 0
fi
exit 0
SHIM5
chmod +x "$FAKE_BIN5/tmux"
@@ -205,4 +219,123 @@ echo "$all_args5" | grep -qF "export PATH=" || \
echo "$all_args5" | grep -qF "$FAKE_RUNTIME_BIN5" || \
fail "test5: candidate dir (already on launcher PATH) was NOT baked into pane PATH — regression"
# ── Test 6: heartbeat sidecar — pane PID resolved + .hb file written ──────────
#
# Runs against the real tmux server on $SOCKET (the socket test 1 used) so
# that list-panes returns a real pane PID.  MOSAIC_HEARTBEAT_RUN_DIR is pointed
# at a temp dir and the interval is set to 1 second; we then poll for up to
# 5 s for the .hb file to appear and check its content.
HB_RUN_DIR=$(mktemp -d)
CLEANUP_DIRS+=("$HB_RUN_DIR")

# Launch a NEW agent name on the shared socket: this test needs the script to
# take the launch path so the heartbeat sidecar is started for a real pane.
AGENT6="agent6-$RANDOM"
MOSAIC_TMUX_SOCKET="$SOCKET" \
MOSAIC_AGENT_WORKDIR="$WORKDIR" \
MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR" \
MOSAIC_HEARTBEAT_INTERVAL="1" \
"$START" "$AGENT6"

HB_FILE="$HB_RUN_DIR/${AGENT6}.hb"

# Wait up to 5 seconds for the heartbeat file to appear.
# _waited counts half-second polls, not seconds, so the limit is 10 polls
# (10 x 0.5 s = 5 s).  A limit of 5 here would give up after only 2.5 s while
# the failure message below still reported "within 5s".
_waited=0
until [ -f "$HB_FILE" ] || [ "$_waited" -ge 10 ]; do
  sleep 0.5
  _waited=$((_waited + 1))
done
[ -f "$HB_FILE" ] || fail "test6: heartbeat file not written at $HB_FILE within 5s"

hb_content=$(cat "$HB_FILE")
echo "--- test 6: heartbeat file content ---"
echo "$hb_content"
echo "--- end test 6 ---"

# Verify required fields are present: an ISO-8601-style timestamp, a numeric
# pid, and the ok status marker.
echo "$hb_content" | grep -qE '^ts=[0-9]{4}-[0-9]{2}-[0-9]{2}T' || \
  fail "test6: heartbeat ts field missing or malformed"
echo "$hb_content" | grep -qE '^pid=[0-9]+' || \
  fail "test6: heartbeat pid field missing or malformed"
echo "$hb_content" | grep -qF 'status=ok' || \
  fail "test6: heartbeat status=ok missing"
# ── Test 7: heartbeat sidecar — targets correct .hb path per agent name ────────
#
# Uses the fake-tmux shim approach (like tests 3-5) to capture the sidecar
# invocation without needing a real session. A fake setsid shim records its
# arguments so we can assert the sidecar script targets the expected .hb path
# and uses the configured interval.
FAKE_BIN7=$(mktemp -d)
FAKE_RUNTIME_BIN7=$(mktemp -d)
SETSID_ARGS_FILE=$(mktemp)
HB_RUN_DIR7=$(mktemp -d)
CLEANUP_DIRS+=("$FAKE_BIN7" "$FAKE_RUNTIME_BIN7" "$HB_RUN_DIR7")
AGENT7="my-fleet-agent-$RANDOM"
INTERVAL7="42"

# Fake tmux: has-session → not found; new-session → ok; list-panes → known PID.
cat > "$FAKE_BIN7/tmux" <<SHIM7
#!/usr/bin/env bash
subcmd="\$3"
if [ "\$subcmd" = "has-session" ]; then exit 1; fi
if [ "\$subcmd" = "new-session" ]; then exit 0; fi
if [ "\$subcmd" = "list-panes" ]; then echo "88888"; exit 0; fi
exit 0
SHIM7
chmod +x "$FAKE_BIN7/tmux"

# Fake setsid: record the arguments it was given (bash -c <script> ...) for
# inspection and exit immediately; the sidecar itself is never run.
# The capture path is baked into the shim shell-quoted via printf %q, which
# avoids patching a placeholder with `sed -i` (not portable to BSD sed, and
# fragile if the path contains the sed delimiter, '&' or a backslash).
printf -v setsid_capture_q '%q' "$SETSID_ARGS_FILE"
cat > "$FAKE_BIN7/setsid" <<SETSID_SHIM
#!/usr/bin/env bash
# argv: bash -c <sidecar_script> [args...]
# Write every argument NUL-separated to a temp file, then rename it into
# place so the reader never observes a partially written capture.
printf '%s\0' "\$@" > ${setsid_capture_q}.tmp && mv -f ${setsid_capture_q}.tmp ${setsid_capture_q}
exit 0
SETSID_SHIM
chmod +x "$FAKE_BIN7/setsid"

SOCKET7="mosaic-agent-test7-$RANDOM-$$"
WORKDIR7=$(mktemp -d)
CLEANUP_DIRS+=("$WORKDIR7")
PATH="$FAKE_BIN7:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET7" \
MOSAIC_AGENT_WORKDIR="$WORKDIR7" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN7" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi" \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR7" \
MOSAIC_HEARTBEAT_INTERVAL="$INTERVAL7" \
"$START" "$AGENT7"

# The setsid shim runs in the background, so poll (up to ~5 s in 0.1 s steps)
# until the capture file is non-empty instead of relying on one fixed sleep.
# If the shim never runs, the assertions below report the failure.
_setsid_polls=0
until [ -s "$SETSID_ARGS_FILE" ] || [ "$_setsid_polls" -ge 50 ]; do
  sleep 0.1
  _setsid_polls=$((_setsid_polls + 1))
done

# NUL separators become newlines so each captured argument starts a new line.
setsid_args=$(tr '\0' '\n' < "$SETSID_ARGS_FILE" 2>/dev/null || true)
rm -f "$SETSID_ARGS_FILE" "$SETSID_ARGS_FILE.tmp"
rm -rf "$WORKDIR7"

echo "--- test 7: captured setsid args ---"
echo "$setsid_args"
echo "--- end test 7 ---"

# The sidecar script (bash -c <script>) must reference the correct .hb path.
expected_hb="${HB_RUN_DIR7}/${AGENT7}.hb"
echo "$setsid_args" | grep -qF "$expected_hb" || \
  fail "test7: sidecar script does not reference correct .hb path ($expected_hb)"

# The sidecar script must use the configured interval.
# The run dir (mktemp suffix) and the agent name ($RANDOM) can themselves
# contain the interval's digits, so strip both from the haystack and require
# the interval as a standalone number; otherwise this check could pass even if
# the interval were never propagated.
interval_haystack=${setsid_args//"$HB_RUN_DIR7"/}
interval_haystack=${interval_haystack//"$AGENT7"/}
echo "$interval_haystack" | grep -qE "(^|[^0-9])${INTERVAL7}([^0-9]|\$)" || \
  fail "test7: sidecar script does not reference configured interval ($INTERVAL7)"

echo "ok - start-agent-session"