diff --git a/packages/mosaic/framework/tools/fleet/start-agent-session.sh b/packages/mosaic/framework/tools/fleet/start-agent-session.sh
index dddf167..53782ff 100755
--- a/packages/mosaic/framework/tools/fleet/start-agent-session.sh
+++ b/packages/mosaic/framework/tools/fleet/start-agent-session.sh
@@ -6,6 +6,8 @@ MOSAIC_TMUX_SOCKET=${MOSAIC_TMUX_SOCKET:-mosaic-factory}
 MOSAIC_AGENT_RUNTIME=${MOSAIC_AGENT_RUNTIME:-pi}
 MOSAIC_AGENT_WORKDIR=${MOSAIC_AGENT_WORKDIR:-$HOME}
 MOSAIC_AGENT_COMMAND=${MOSAIC_AGENT_COMMAND:-}
+MOSAIC_HEARTBEAT_RUN_DIR=${MOSAIC_HEARTBEAT_RUN_DIR:-$HOME/.config/mosaic/fleet/run}
+MOSAIC_HEARTBEAT_INTERVAL=${MOSAIC_HEARTBEAT_INTERVAL:-15}
 
 if [ -z "$AGENT_NAME" ]; then
   echo "ERROR: agent name argument or MOSAIC_AGENT_NAME is required" >&2
@@ -96,5 +98,57 @@ else
 fi
 
 mkdir -p "$MOSAIC_AGENT_WORKDIR"
-exec tmux -L "$MOSAIC_TMUX_SOCKET" new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" \
+
+# ── Launch the tmux session (no exec — we continue to wire the heartbeat) ────
+tmux -L "$MOSAIC_TMUX_SOCKET" new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" \
   bash -c "$PANE_SHELL_SNIPPET"
+
+# ── Resolve the pane PID (retry briefly to let the session initialise) ────────
+# Target the session's current window ("name:") rather than ":0.0" so the lookup
+# still works when the user's tmux config sets base-index / pane-base-index.
+PANE_PID=""
+for _retry in 1 2 3 4 5; do
+  PANE_PID=$(tmux -L "$MOSAIC_TMUX_SOCKET" list-panes \
+    -t "=${AGENT_NAME}:" -F '#{pane_pid}' 2>/dev/null || true)
+  [ -n "$PANE_PID" ] && break
+  sleep 0.2
+done
+
+# ── Spawn the heartbeat sidecar (detached, best-effort) ──────────────────────
+# The sidecar writes $MOSAIC_HEARTBEAT_RUN_DIR/$AGENT_NAME.hb atomically while
+# the pane process is alive, then exits so the file goes stale (fleet ps shows
+# stale then PANE=dead). It is runtime-agnostic: it only cares about the pane PID.
+_start_heartbeat_sidecar() {
+  # Args: $1 agent name, $2 pane PID, $3 run directory, $4 interval (seconds).
+  local agent="$1" pane_pid="$2" run_dir="$3" interval="$4"
+  local hb_file="${run_dir}/${agent}.hb"
+
+  mkdir -p "$run_dir"
+
+  # Build the sidecar as a self-contained bash one-liner. The three values are
+  # embedded with %q so a path containing spaces or shell metacharacters stays
+  # one literal word instead of being re-parsed as code by `bash -c`. If sleep
+  # fails (e.g. a non-numeric interval) the loop exits rather than spinning.
+  local sidecar_script
+  sidecar_script=$(printf \
+    'hb=%q; pid=%q; iv=%q; mkdir -p "$(dirname "$hb")"; while kill -0 "$pid" 2>/dev/null; do tmp="$hb.tmp.$$"; printf "ts=%%s\npid=%%s\nstatus=ok\n" "$(date +%%Y-%%m-%%dT%%H:%%M:%%S%%z)" "$pid" > "$tmp" && mv "$tmp" "$hb"; sleep "$iv" || exit 1; done' \
+    "$hb_file" "$pane_pid" "$interval")
+
+  # setsid + disown ensures the sidecar survives this script exiting. All three
+  # standard streams are pointed at /dev/null; failures are non-fatal.
+  if command -v setsid >/dev/null 2>&1; then
+    setsid bash -c "$sidecar_script" < /dev/null > /dev/null 2>&1 &
+  else
+    bash -c "$sidecar_script" < /dev/null > /dev/null 2>&1 &
+  fi
+  disown $! 2>/dev/null || true
+}
+
+if [ -n "$PANE_PID" ]; then
+  # Guard: do not let sidecar startup failures abort the launcher (set -e).
+  _start_heartbeat_sidecar "$AGENT_NAME" "$PANE_PID" \
+    "$MOSAIC_HEARTBEAT_RUN_DIR" "$MOSAIC_HEARTBEAT_INTERVAL" || \
+    echo "WARNING: heartbeat sidecar could not be started for $AGENT_NAME" >&2
+else
+  echo "WARNING: could not resolve pane PID for $AGENT_NAME — heartbeat sidecar not started" >&2
+fi
diff --git a/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh b/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh
index 3325db0..df73747 100755
--- a/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh
+++ b/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh
@@ -50,6 +50,8 @@ grep -qF 'already running' /tmp/mosaic-start-agent-idempotent.out || fail "dupli
 # - Intercepts 'new-session' calls and records its arguments to a file.
 # - For 'has-session' calls, exits 1 (session does not exist) so the script
 #   proceeds to launch instead of printing "already running".
+# - For 'list-panes' calls, returns a synthetic PID so the heartbeat sidecar
+#   path is exercised without needing a real tmux session.
 # - For all other subcommands, exits 0.
 #
 # Assertions:
@@ -74,6 +76,10 @@ if [ "\$subcmd" = "new-session" ]; then
   printf '%s\n' "\$@" > "$TMUX_ARGS_FILE"
   exit 0
 fi
+if [ "\$subcmd" = "list-panes" ]; then
+  echo "99999" # synthetic pane PID for heartbeat path
+  exit 0
+fi
 exit 0
 SHIM
 chmod +x "$FAKE_BIN/tmux"
@@ -122,6 +128,10 @@ if [ "\$subcmd" = "new-session" ]; then
   printf '%s\n' "\$@" > "$TMUX_ARGS_FILE2"
   exit 0
 fi
+if [ "\$subcmd" = "list-panes" ]; then
+  echo "99998"
+  exit 0
+fi
 exit 0
 SHIM2
 chmod +x "$FAKE_BIN2/tmux"
@@ -171,6 +181,10 @@ if [ "\$subcmd" = "new-session" ]; then
   printf '%s\n' "\$@" > "$TMUX_ARGS_FILE5"
   exit 0
 fi
+if [ "\$subcmd" = "list-panes" ]; then
+  echo "99997"
+  exit 0
+fi
 exit 0
 SHIM5
 chmod +x "$FAKE_BIN5/tmux"
@@ -205,4 +219,123 @@ echo "$all_args5" | grep -qF "export PATH=" || \
 echo "$all_args5" | grep -qF "$FAKE_RUNTIME_BIN5" || \
   fail "test5: candidate dir (already on launcher PATH) was NOT baked into pane PATH — regression"
 
+# ── Test 6: heartbeat sidecar — pane PID resolved + .hb file written ──────────
+#
+# Uses a real tmux session (same socket as test 1 which already has $AGENT) so
+# list-panes returns a real pane PID. We override MOSAIC_HEARTBEAT_RUN_DIR to
+# a temp dir and set a 1-second interval, then poll briefly for the .hb file
+# to appear and check its content.
+
+HB_RUN_DIR=$(mktemp -d)
+CLEANUP_DIRS+=("$HB_RUN_DIR")
+
+# Re-use the socket from Test 1 ($SOCKET, where $AGENT is still alive), but
+# invoke the script for a NEW agent on that same socket so the heartbeat path
+# is exercised with a real pane PID.
+AGENT6="agent6-$RANDOM"
+MOSAIC_TMUX_SOCKET="$SOCKET" \
+MOSAIC_AGENT_WORKDIR="$WORKDIR" \
+MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \
+MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR" \
+MOSAIC_HEARTBEAT_INTERVAL="1" \
+  "$START" "$AGENT6"
+
+HB_FILE="$HB_RUN_DIR/${AGENT6}.hb"
+
+# Wait up to 5 seconds (10 polls x 0.5 s) for the heartbeat file to appear.
+_waited=0
+until [ -f "$HB_FILE" ] || [ "$_waited" -ge 10 ]; do
+  sleep 0.5
+  _waited=$((_waited + 1))
+done
+
+[ -f "$HB_FILE" ] || fail "test6: heartbeat file not written at $HB_FILE within 5s"
+
+hb_content=$(cat "$HB_FILE")
+echo "--- test 6: heartbeat file content ---"
+echo "$hb_content"
+echo "--- end test 6 ---"
+
+# Verify required fields are present.
+echo "$hb_content" | grep -qE '^ts=[0-9]{4}-[0-9]{2}-[0-9]{2}T' || \
+  fail "test6: heartbeat ts field missing or malformed"
+echo "$hb_content" | grep -qE '^pid=[0-9]+' || \
+  fail "test6: heartbeat pid field missing or malformed"
+echo "$hb_content" | grep -qF 'status=ok' || \
+  fail "test6: heartbeat status=ok missing"
+
+# ── Test 7: heartbeat sidecar — targets correct .hb path per agent name ────────
+#
+# Uses the fake-tmux shim approach (like tests 3-5) to capture the sidecar
+# invocation without needing a real session. A fake setsid shim records its
+# arguments so we can assert the sidecar script targets the expected .hb path
+# and uses the configured interval.
+
+FAKE_BIN7=$(mktemp -d)
+FAKE_RUNTIME_BIN7=$(mktemp -d)
+SETSID_ARGS_FILE=$(mktemp)
+HB_RUN_DIR7=$(mktemp -d)
+CLEANUP_DIRS+=("$FAKE_BIN7" "$FAKE_RUNTIME_BIN7" "$HB_RUN_DIR7")
+
+AGENT7="my-fleet-agent-$RANDOM"
+INTERVAL7="42"
+
+# Fake tmux: has-session → not found; new-session → ok; list-panes → known PID.
+cat > "$FAKE_BIN7/tmux" < argument for inspection, then
+# background an actual bash subshell so disown succeeds in the caller.
+cat > "$FAKE_BIN7/setsid" <<'SETSID_SHIM'
+#!/usr/bin/env bash
+# argv: setsid bash -c
+# Record the full argument list to the capture file, then exit cleanly.
+printf '%s\0' "$@" > __SETSID_ARGS_FILE__ +exit 0 +SETSID_SHIM +# Patch the placeholder with the real capture-file path (avoids heredoc expansion issues). +sed -i "s|__SETSID_ARGS_FILE__|${SETSID_ARGS_FILE}|g" "$FAKE_BIN7/setsid" +chmod +x "$FAKE_BIN7/setsid" + +SOCKET7="mosaic-agent-test7-$RANDOM-$$" +WORKDIR7=$(mktemp -d) +CLEANUP_DIRS+=("$WORKDIR7") + +PATH="$FAKE_BIN7:$PATH" \ +MOSAIC_TMUX_SOCKET="$SOCKET7" \ +MOSAIC_AGENT_WORKDIR="$WORKDIR7" \ +MOSAIC_AGENT_RUNTIME="pi" \ +MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN7" \ +MOSAIC_AGENT_COMMAND="mosaic yolo pi" \ +MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR7" \ +MOSAIC_HEARTBEAT_INTERVAL="$INTERVAL7" \ + "$START" "$AGENT7" + +# Give the background setsid shim a moment to finish writing the capture file. +sleep 0.5 + +setsid_args=$(cat "$SETSID_ARGS_FILE" 2>/dev/null | tr '\0' '\n' || true) +rm -f "$SETSID_ARGS_FILE" +rm -rf "$WORKDIR7" + +echo "--- test 7: captured setsid args ---" +echo "$setsid_args" +echo "--- end test 7 ---" + +# The sidecar script (bash -c