Compare commits

..

9 Commits

Author SHA1 Message Date
Jarvis
dd10f0046b fix(fleet): bounded-poll send --verify to eliminate false unverifiable on slow TUIs
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
Replace the single fixed 300ms capture-pane delay in `agent send --verify` with a
bounded polling loop. After sending, the loop polls `capture-pane` every 400ms
(VERIFY_POLL_INTERVAL_MS) up to a configurable total timeout (default 6000ms,
VERIFY_DEFAULT_TIMEOUT_MS). classifySendResult is called on each poll: accepted/draft
return immediately; unverifiable keeps polling until timeout, then fails closed with
the existing "no pane change after send" message.

New `--verify-timeout <ms>` option on `agent send` (default 6000ms documented).
Injectable SleepFn added to FleetCommandDeps for test isolation — no real sleeps in
tests. Exports VERIFY_POLL_INTERVAL_MS and VERIFY_DEFAULT_TIMEOUT_MS as constants.
classifySendResult and all other pure functions remain unchanged.

Tests: multi-poll acceptance on 2nd/3rd poll => exit 0; pane unchanged until timeout
=> exit 1; draft detected on first poll => exit 1. All 386 tests pass.

docs/fleet/PRD.md Known-limitations updated: verify now polls up to bounded timeout
(default ~6s, --verify-timeout); definitive acceptance still deferred to Phase-3
heartbeat-ack.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 23:07:08 -05:00
Jarvis
8466ca2d81 fix(fleet): verify via pane-change diff + non-resizing watch
Some checks failed
ci/woodpecker/push/ci Pipeline was canceled
ci/woodpecker/pr/ci Pipeline was canceled
Blocker fix: send --verify now captures a BEFORE snapshot immediately
before the send and an AFTER snapshot after the delay, then uses
classifySendResult(before, after) to classify. A wedged pane showing
stale non-empty content is no longer falsely reported as 'accepted' —
BEFORE==AFTER maps to 'unverifiable' (exit 1, "no pane change after
send"). Blank AFTER still fails closed as 'unverifiable'. Only
AFTER != BEFORE without a draft suffix counts as 'accepted' (exit 0).

Should-fix: agent watch now uses a GROUPED VIEWER SESSION instead of a
bare 'tmux attach -r' against the agent session. A bare attach lets the
viewer terminal shrink the agent's window; a grouped session has
independent sizing so the agent's window is never affected.
Sequence: new-session -d -t '=<agent>' -s '<agent>-watch-<pid>' (runner),
attach -r to viewer session (interactiveRunner), kill-session on detach
(runner). New builder functions exported: buildAgentWatchCreateViewerCommand,
buildAgentWatchAttachCommand, buildAgentWatchKillViewerCommand,
buildViewerSessionName. buildAgentWatchCommand kept but deprecated.

New exports: classifySendResult(before, after) — the testable classifier.

Tests added:
- classifySendResult unit suite (6 cases): accepted/draft/unverifiable/
  stale-pane/both-blank/before-blank-after-response
- send --verify regression: stale (before==after non-empty) => exit 1
- send --verify regression: blank AFTER => exit 1
- send --verify regression: draft after pane change => exit 1
- send --verify regression: changed non-draft => exit 0
- send --verify: 3-call sequence assertion (before-capture, send, after-capture)
- watch dispatch: grouped viewer session created/attached/killed; no bare
  attach against agent session; viewer name matches <agent>-watch-<pid>

PRD Known-limitations updated: pane-change check rationale, Phase-3
heartbeat-ack requirement, grouped-session watch design.

All gates pass: pnpm typecheck, pnpm lint, pnpm --filter @mosaicstack/mosaic test
(382 tests, 74 fleet), prettier --check.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:57:00 -05:00
Jarvis
aec560162b fix(fleet): verify fails-closed on unverifiable + interactive watch
Some checks failed
ci/woodpecker/push/ci Pipeline was canceled
ci/woodpecker/pr/ci Pipeline was canceled
- isSendAccepted now returns 'accepted' | 'draft' | 'unverifiable' (was bool)
- Blank/empty capture => 'unverifiable' => process.exitCode=1 with distinct
  "could not verify delivery (blank/no response captured)" message; previously
  blank was treated as success, violating FR-5 fail-closed semantics
- Draft line ('^> ') => process.exitCode=1 with "left as unsubmitted draft"
  message; distinct wording from unverifiable case
- agent watch now dispatched through injectable InteractiveRunner (stdio:inherit)
  instead of the capturing CommandRunner; tmux attach requires TTY passthrough
- Default spawnInteractive implementation uses node:child_process spawn with
  stdio:'inherit'; injectable via FleetCommandDeps.interactiveRunner for tests
- Removed buildSystemdIsActiveCommand (dead code — exported but unused)
- Tests: blank=>exitCode=1, draft=>exitCode=1, real response=>exitCode=0,
  watch dispatched through interactiveRunner not capturing runner
- PRD: added "Known limitations" section (heuristic verify, blank fails closed,
  non-pi/claude draft detection is best-effort, watch requires TTY passthrough)
- Code comment on isSendAccepted notes pi/claude-specific draft heuristic

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:45:22 -05:00
Jarvis
ddeb200fdf style(fleet): prettier-format workstream docs
Some checks failed
ci/woodpecker/push/ci Pipeline was canceled
ci/woodpecker/pr/ci Pipeline was canceled
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:32:14 -05:00
Jarvis
11c4dbe6f3 docs(fleet): session-2 log — heartbeat live + launch-path/env findings
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:30:34 -05:00
Jarvis
c154ced6e5 docs(fleet): mark Phase-2 CLI tasks done (fleet ps/watch/send --verify live-verified)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:30:34 -05:00
Jarvis
cf304eebc3 feat(fleet): phase-2 observability — fleet ps + watch + send --verify
FR-1 fleet ps: joins systemd show (ActiveState/SubState/UnitFileState),
tmux list-panes (pid/command/dead/activity), and file-based heartbeat
(~/.config/mosaic/fleet/run/<name>.hb) into one table per roster agent.
Flags DRIFT (roster runtime ≠ actual pane command) and BOOT-ENABLE
(active but UnitFileState=disabled). --json output includes tenant_id
and host on every record (FR-6 zero-foreclosure for multi-tenant/host).

FR-3 agent watch: read-only tmux attach (-r flag) so the operator can
observe any session without injecting keystrokes or resizing the window.
Registered as a new verb alongside tail/send/reset in registerFleetAgentCommands.

FR-5 agent send --verify: after keystroke injection, captures the last 5
pane lines and checks for draft heuristic (last non-empty line starts
with '> '). Exits non-zero and writes to stderr if the message appears
unsubmitted. Default send behavior is unchanged when --verify is omitted.

New pure exported helpers (all unit-testable without real tmux/systemd):
buildSystemdShowCommand, buildTmuxListPanesCommand, buildAgentWatchCommand,
buildAgentVerifyAcceptedCommand, parseHeartbeat, parseSystemdShow,
parseTmuxListPanes, detectDrift, getDefaultTenantAndHost, isSendAccepted,
heartbeatPath. Added 31 new spec cases (62 total) covering exact command
construction, JSON shape, heartbeat parsing, drift detection, and verify flow.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:30:34 -05:00
Jarvis
c740c59359 docs(fleet): record dual-engine review + worktree discipline decisions
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:30:34 -05:00
Jarvis
b2071dc898 docs(fleet): north star + Phase-2 observability PRD/tasks (W-FLEET)
Establish Fleet workstream doctrine under mvp-20260312: north star (incl.
fleet-as-means-of-production), Phase-2 observability PRD, workstream tasks,
and scratchpad. Collision-safe: scoped to docs/fleet/, touches none of the
MVP single-writer control-plane files.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01RMoEx7hfdFGjUiCHuN1RRi
2026-06-20 22:30:34 -05:00
10 changed files with 24 additions and 725 deletions

View File

@@ -115,11 +115,6 @@ Every artifact, starting Phase 2, MUST:
- Observation: **read-only default, opt-in takeover**. - Observation: **read-only default, opt-in takeover**.
- Multi-host: **designed-for from day one**; control plane **rides federation (W1)**. - Multi-host: **designed-for from day one**; control plane **rides federation (W1)**.
- Delivery: **CLI-first now**, dogfood against the live stub fleet; webUI deferred to Phase 5. - Delivery: **CLI-first now**, dogfood against the live stub fleet; webUI deferred to Phase 5.
- Runtimes: fleet agents default to **Codex / pi-on-Codex**; **Claude is reserved for Claude
Code only** (avoid alternate-harness API pricing). Validated durable recipe:
`mosaic yolo pi --model openai-codex/gpt-5.5:high`. Durable detached launch requires the
runtime-bin on PATH (baked into the pane command) + boot-survival (`enable` + linger),
which `fleet init` should automate.
## Assumptions (veto-able) ## Assumptions (veto-able)

View File

@@ -73,28 +73,3 @@ with a second agent on `dragon-lin`.
tmux session-name fallback; the systemd/tmux env handoff needs a real fix. tmux session-name fallback; the systemd/tmux env handoff needs a real fix.
- Next: rebase on merged main, open Phase-2 PR, dual-engine review, merge, close - Next: rebase on merged main, open Phase-2 PR, dual-engine review, merge, close
`fleet-observability-1`. Defer launch-path + env-propagation fixes to Phase 3. `fleet-observability-1`. Defer launch-path + env-propagation fixes to Phase 3.
- 2026-06-21 (session 3): Phase-2 PR #579 merged (3 dual-engine rounds hardened
verify+watch). Then closed the launch-path question with Jason's input — CORRECTING
earlier findings:
- The ad-hoc launch deaths were NOT a fundamental TTY blocker: (a) codex was a stale
version (Jason updated it); (b) pi was misconfigured to Claude auth (Jason removed it;
default is now Codex). The REAL durable-launch bug is **PATH**: the detached tmux
launch shell is login+non-interactive, so it misses `~/.npm-global/bin` (added only in
`~/.bashrc`) -> `mosaic: command not found` (127) -> pane dies. tmux panes inherit the
tmux _server_ env, so PATH must be baked into the pane command.
- **Durable real-agent recipe (validated live on gpt-5.5, Claude-free):**
`mosaic yolo pi --model openai-codex/gpt-5.5:high` — pi tolerates detached tmux; a raw
interactive TUI (codex CLI) exits without an attached client. Status line confirmed
`(openai-codex) gpt-5.5 • high`.
- PATH fix landed in `start-agent-session.sh` (commit 32efc13, branch
feat/fleet-launch-path): derive runtime-bin prefix (MOSAIC_RUNTIME_BIN | npm prefix |
~/.npm-global/bin | ~/.local/bin), bake `export PATH=...; exec <cmd>` into the pane;
`exec` also fixes the drift false-positive. Live-tested under stripped PATH -> durable.
- Boot-survival: Jason ran `systemctl --user enable` (+ linger). TODO: auto-enable in
**fleet init** so operators never have to remember it (agentic-enhancement cycle).
- Future custom Pi harness build: pi cannot self-report its model (track
runtime/model/effort as fleet metadata); drift detection should recognize `node` as
pi's pane command (a node-wrapped pane can currently read as drift).
- Findings recorded in AI Guide playbooks/tmux-fleet.md (aiguide PR #7, merged).
- Policy: avoid Claude outside Claude Code (API pricing for alt-harness use) — fleet
runtimes default to Codex / pi-on-Codex; Claude stays in Claude Code only.

View File

@@ -70,9 +70,6 @@ Skills, hooks, MCP, and plugins are force multipliers you MUST use when applicab
## Missing core file ## Missing core file
If `CONSTITUTION.md`, `AGENTS.md`, `SOUL.md`, or the runtime contract is missing, stop and report it. If `CONSTITUTION.md`, `AGENTS.md`, `SOUL.md`, or the runtime contract is missing, stop and report it.
This agent-facing strictness is intentional and stricter than the launcher: the launcher injects
`CONSTITUTION.md` tolerantly (skipping it if absent so pre-upgrade hosts keep working), but once a host
is re-seeded a genuinely missing core file is a stop-and-report condition — not something to proceed past.
## Session Closure ## Session Closure

View File

@@ -2,11 +2,8 @@
The irreducible, non-negotiable law for every Mosaic agent on every harness. The irreducible, non-negotiable law for every Mosaic agent on every harness.
**Framework-owned.** This file is overwritten verbatim on every upgrade — do not edit it. There is **Framework-owned.** This file is overwritten verbatim on every upgrade — do not edit it. To change
**no `CONSTITUTION.local.md`**: hard gates are not locally overridable. A lower layer may only make behavior, add a `.local.md` overlay or a `policy/` file (tighten-only; see `constitution/LAYER-MODEL.md`).
behavior _stricter_, never relax or override a gate (see Precedence). Operator customization lives in
other layers — `SOUL.md` / `USER.md` and the tighten-only overlays `STANDARDS.local.md` /
`SOUL.local.md` / `USER.local.md` / `policy/*.md` (see `constitution/LAYER-MODEL.md`).
Authored in **capability verbs**: where a gate names a capability ("structured reasoning", "queue Authored in **capability verbs**: where a gate names a capability ("structured reasoning", "queue
guard"), the runtime adapter binds it to a concrete tool and states whether absence is a hard stop. guard"), the runtime adapter binds it to a concrete tool and states whether absence is a hard stop.

View File

@@ -6,8 +6,6 @@ MOSAIC_TMUX_SOCKET=${MOSAIC_TMUX_SOCKET:-mosaic-factory}
MOSAIC_AGENT_RUNTIME=${MOSAIC_AGENT_RUNTIME:-pi} MOSAIC_AGENT_RUNTIME=${MOSAIC_AGENT_RUNTIME:-pi}
MOSAIC_AGENT_WORKDIR=${MOSAIC_AGENT_WORKDIR:-$HOME} MOSAIC_AGENT_WORKDIR=${MOSAIC_AGENT_WORKDIR:-$HOME}
MOSAIC_AGENT_COMMAND=${MOSAIC_AGENT_COMMAND:-} MOSAIC_AGENT_COMMAND=${MOSAIC_AGENT_COMMAND:-}
MOSAIC_HEARTBEAT_RUN_DIR=${MOSAIC_HEARTBEAT_RUN_DIR:-$HOME/.config/mosaic/fleet/run}
MOSAIC_HEARTBEAT_INTERVAL=${MOSAIC_HEARTBEAT_INTERVAL:-15}
if [ -z "$AGENT_NAME" ]; then if [ -z "$AGENT_NAME" ]; then
echo "ERROR: agent name argument or MOSAIC_AGENT_NAME is required" >&2 echo "ERROR: agent name argument or MOSAIC_AGENT_NAME is required" >&2
@@ -28,125 +26,5 @@ if [ -z "$MOSAIC_AGENT_COMMAND" ]; then
MOSAIC_AGENT_COMMAND="mosaic yolo $MOSAIC_AGENT_RUNTIME" MOSAIC_AGENT_COMMAND="mosaic yolo $MOSAIC_AGENT_RUNTIME"
fi fi
# ── Derive a runtime-bin PATH prefix ─────────────────────────────────────────
# Precedence:
# 1. $MOSAIC_RUNTIME_BIN (explicit override)
# 2. $(npm config get prefix)/bin (if npm is on PATH)
# 3. Fallbacks: $HOME/.npm-global/bin and $HOME/.local/bin
#
# Only directories that already exist are included. The prefix is baked into
# the pane command regardless of what the LAUNCHER process's $PATH contains,
# because the tmux pane inherits the tmux SERVER environment (not this script's
# environment). A dir on the launcher's PATH may be absent from the server PATH,
# so every existing candidate must always be included. Dedup within the
# constructed prefix avoids listing the same dir twice.
_build_runtime_bin_prefix() {
local candidates=()
if [ -n "${MOSAIC_RUNTIME_BIN:-}" ]; then
candidates+=("$MOSAIC_RUNTIME_BIN")
fi
if command -v npm >/dev/null 2>&1; then
local npm_prefix
npm_prefix=$(npm config get prefix 2>/dev/null) || true
if [ -n "$npm_prefix" ]; then
candidates+=("${npm_prefix}/bin")
fi
fi
candidates+=("$HOME/.npm-global/bin")
candidates+=("$HOME/.local/bin")
local prefix=""
for dir in "${candidates[@]}"; do
[ -d "$dir" ] || continue
if [ -z "$prefix" ]; then
prefix="$dir"
else
case ":${prefix}:" in
*":${dir}:"*) ;; # already in our prefix — skip
*) prefix="${prefix}:${dir}" ;;
esac
fi
done
printf '%s' "$prefix"
}
MOSAIC_RUNTIME_BIN_PREFIX=$(_build_runtime_bin_prefix)
# ── Build the pane command ────────────────────────────────────────────────────
# The pane command must:
# - Export the augmented PATH so the runtime binary is found.
# - exec the agent command so the runtime is the pane's foreground process
# (makes `fleet ps` pane_current_command check reliable; no DRIFT false-positive).
#
# Quoting strategy: the snippet is built as a plain double-quoted string.
# MOSAIC_RUNTIME_BIN_PREFIX and the text of MOSAIC_AGENT_COMMAND are
# interpolated NOW (in this script), because the pane shell inherits the tmux
# server environment, not this script's env. The PATH reference is written as
# \${PATH} so it is NOT expanded here — it expands inside the pane shell,
# against the server's PATH. Expansion is not recursive, so any variable
# references inside MOSAIC_AGENT_COMMAND's value are likewise left for the
# pane shell to expand.
if [ -n "$MOSAIC_RUNTIME_BIN_PREFIX" ]; then
PANE_SHELL_SNIPPET="export PATH=\"${MOSAIC_RUNTIME_BIN_PREFIX}:\${PATH}\"; exec ${MOSAIC_AGENT_COMMAND}"
else
PANE_SHELL_SNIPPET="exec ${MOSAIC_AGENT_COMMAND}"
fi
mkdir -p "$MOSAIC_AGENT_WORKDIR" mkdir -p "$MOSAIC_AGENT_WORKDIR"
exec tmux -L "$MOSAIC_TMUX_SOCKET" new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" "$MOSAIC_AGENT_COMMAND"
# ── Launch the tmux session (no exec — we continue to wire the heartbeat) ────
tmux -L "$MOSAIC_TMUX_SOCKET" new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" \
bash -c "$PANE_SHELL_SNIPPET"
# ── Resolve the pane PID (retry briefly to let the session initialise) ────────
PANE_PID=""
for _retry in 1 2 3 4 5; do
PANE_PID=$(tmux -L "$MOSAIC_TMUX_SOCKET" list-panes \
-t "=${AGENT_NAME}:0.0" -F '#{pane_pid}' 2>/dev/null || true)
[ -n "$PANE_PID" ] && break
sleep 0.2
done
# ── Spawn the heartbeat sidecar (detached, best-effort) ──────────────────────
# The sidecar writes ~/.config/mosaic/fleet/run/<AGENT>.hb atomically while the
# pane process is alive, then exits so the file goes stale (fleet ps shows stale
# then PANE=dead). It is runtime-agnostic: it only cares about the pane PID.
_start_heartbeat_sidecar() {
local agent="$1"
local pane_pid="$2"
local run_dir="$3"
local interval="$4"
local hb_file="${run_dir}/${agent}.hb"
mkdir -p "$run_dir"
# Write the sidecar as a self-contained bash one-liner so it carries no
# references to any variables from this script's environment.
local sidecar_script
sidecar_script=$(printf \
'hb=%s; pid=%s; iv=%s; mkdir -p "$(dirname "$hb")"; while kill -0 "$pid" 2>/dev/null; do tmp="$hb.tmp.$$"; printf "ts=%%s\npid=%%s\nstatus=ok\n" "$(date +%%Y-%%m-%%dT%%H:%%M:%%S%%z)" "$pid" > "$tmp" && mv "$tmp" "$hb"; sleep "$iv"; done' \
"$hb_file" "$pane_pid" "$interval")
# setsid + disown ensures the sidecar survives this script exiting.
# stderr/stdout go to /dev/null; failures are non-fatal.
if command -v setsid >/dev/null 2>&1; then
setsid bash -c "$sidecar_script" </dev/null >/dev/null 2>&1 &
else
bash -c "$sidecar_script" </dev/null >/dev/null 2>&1 &
fi
disown $! 2>/dev/null || true
}
if [ -n "$PANE_PID" ]; then
# Guard: do not let sidecar startup failures abort the launcher (set -e).
_start_heartbeat_sidecar "$AGENT_NAME" "$PANE_PID" \
"$MOSAIC_HEARTBEAT_RUN_DIR" "$MOSAIC_HEARTBEAT_INTERVAL" || \
echo "WARNING: heartbeat sidecar could not be started for $AGENT_NAME" >&2
else
echo "WARNING: could not resolve pane PID for $AGENT_NAME — heartbeat sidecar not started" >&2
fi

View File

@@ -6,26 +6,13 @@ START="$SCRIPT_DIR/start-agent-session.sh"
SOCKET="mosaic-agent-test-$RANDOM-$$" SOCKET="mosaic-agent-test-$RANDOM-$$"
AGENT="agent-$RANDOM" AGENT="agent-$RANDOM"
WORKDIR=$(mktemp -d) WORKDIR=$(mktemp -d)
trap 'tmux -L "$SOCKET" kill-server >/dev/null 2>&1 || true; rm -rf "$WORKDIR"' EXIT
# Keep a single cleanup trap that accumulates resources.
CLEANUP_DIRS=("$WORKDIR")
CLEANUP_SOCKETS=("$SOCKET")
trap '_cleanup' EXIT
_cleanup() {
for s in "${CLEANUP_SOCKETS[@]:-}"; do
tmux -L "$s" kill-server >/dev/null 2>&1 || true
done
for d in "${CLEANUP_DIRS[@]:-}"; do
rm -rf "$d"
done
}
fail() { fail() {
echo "FAIL: $*" >&2 echo "FAIL: $*" >&2
exit 1 exit 1
} }
# ── Test 1: basic session creation with workdir check ─────────────────────────
MOSAIC_TMUX_SOCKET="$SOCKET" \ MOSAIC_TMUX_SOCKET="$SOCKET" \
MOSAIC_AGENT_WORKDIR="$WORKDIR" \ MOSAIC_AGENT_WORKDIR="$WORKDIR" \
MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \ MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \
@@ -35,7 +22,6 @@ tmux -L "$SOCKET" has-session -t "=$AGENT:0.0" || fail "agent session was not cr
actual_dir=$(tmux -L "$SOCKET" display-message -p -t "=$AGENT:0.0" '#{pane_current_path}') actual_dir=$(tmux -L "$SOCKET" display-message -p -t "=$AGENT:0.0" '#{pane_current_path}')
[ "$actual_dir" = "$WORKDIR" ] || fail "agent workdir mismatch: $actual_dir" [ "$actual_dir" = "$WORKDIR" ] || fail "agent workdir mismatch: $actual_dir"
# ── Test 2: idempotency (duplicate start prints 'already running') ─────────────
MOSAIC_TMUX_SOCKET="$SOCKET" \ MOSAIC_TMUX_SOCKET="$SOCKET" \
MOSAIC_AGENT_WORKDIR="$WORKDIR" \ MOSAIC_AGENT_WORKDIR="$WORKDIR" \
MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \ MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \
@@ -43,310 +29,4 @@ MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \
grep -qF 'already running' /tmp/mosaic-start-agent-idempotent.out || fail "duplicate start was not idempotent" grep -qF 'already running' /tmp/mosaic-start-agent-idempotent.out || fail "duplicate start was not idempotent"
# ── Test 3: runtime-bin PATH prefix is baked into the pane command ────────────
#
# We capture the command the script would hand to tmux by injecting a fake
# 'tmux' shim into PATH. The shim:
# - Intercepts 'new-session' calls and records its arguments to a file.
# - For 'has-session' calls, exits 1 (session does not exist) so the script
# proceeds to launch instead of printing "already running".
# - For 'list-panes' calls, returns empty so PANE_PID stays unset and the
# heartbeat sidecar is NOT spawned (heartbeat is not the focus of this test;
# test 6 and 7 cover that path). This prevents any real-filesystem side
# effects or leaked background processes.
# - For all other subcommands, exits 0.
#
# Assertions:
# a) 'export PATH=' with the synthetic MOSAIC_RUNTIME_BIN prefix appears.
# b) 'exec' appears so the runtime replaces the wrapper shell.
# c) MOSAIC_AGENT_COMMAND with flags is forwarded intact.
FAKE_BIN=$(mktemp -d)
FAKE_RUNTIME_BIN=$(mktemp -d)
TMUX_ARGS_FILE=$(mktemp)
HB_RUN_DIR3=$(mktemp -d)
CLEANUP_DIRS+=("$FAKE_BIN" "$FAKE_RUNTIME_BIN" "$HB_RUN_DIR3")
# Write the fake tmux shim (uses only positional args, no sourced vars).
cat > "$FAKE_BIN/tmux" <<SHIM
#!/usr/bin/env bash
# Fake tmux: record new-session args; report has-session as missing.
subcmd="\$3" # argv: tmux -L <socket> <subcmd> ...
if [ "\$subcmd" = "has-session" ]; then
exit 1 # session not found → script will attempt new-session
fi
if [ "\$subcmd" = "new-session" ]; then
printf '%s\n' "\$@" > "$TMUX_ARGS_FILE"
exit 0
fi
if [ "\$subcmd" = "list-panes" ]; then
# Return empty: no sidecar spawned (heartbeat is not the focus of this test).
echo ""
exit 0
fi
exit 0
SHIM
chmod +x "$FAKE_BIN/tmux"
SOCKET3="mosaic-agent-test3-$RANDOM-$$"
AGENT3="agent3-$RANDOM"
WORKDIR3=$(mktemp -d)
CLEANUP_DIRS+=("$WORKDIR3")
PATH="$FAKE_BIN:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET3" \
MOSAIC_AGENT_WORKDIR="$WORKDIR3" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi --model openai-codex/gpt-5.5:high" \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR3" \
"$START" "$AGENT3"
all_args=$(cat "$TMUX_ARGS_FILE" 2>/dev/null || true)
rm -f "$TMUX_ARGS_FILE"
echo "--- captured tmux new-session args ---"
echo "$all_args"
echo "--- end args ---"
# a) PATH prefix containing FAKE_RUNTIME_BIN must appear.
echo "$all_args" | grep -qF "export PATH=" || fail "pane command does not export PATH"
echo "$all_args" | grep -qF "$FAKE_RUNTIME_BIN" || fail "pane command does not include MOSAIC_RUNTIME_BIN in PATH prefix"
# b) exec must appear so the runtime replaces the wrapper shell.
echo "$all_args" | grep -qF "exec " || fail "pane command does not use exec"
# c) Full MOSAIC_AGENT_COMMAND (with flags) must be forwarded.
echo "$all_args" | grep -qF "mosaic yolo pi --model openai-codex/gpt-5.5:high" || \
fail "pane command does not forward MOSAIC_AGENT_COMMAND with flags intact"
# ── Test 4: when no extra runtime-bin dirs exist, exec still appears ───────────
TMUX_ARGS_FILE2=$(mktemp)
FAKE_BIN2=$(mktemp -d)
HB_RUN_DIR4=$(mktemp -d)
CLEANUP_DIRS+=("$FAKE_BIN2" "$HB_RUN_DIR4")
cat > "$FAKE_BIN2/tmux" <<SHIM2
#!/usr/bin/env bash
subcmd="\$3"
if [ "\$subcmd" = "has-session" ]; then exit 1; fi
if [ "\$subcmd" = "new-session" ]; then
printf '%s\n' "\$@" > "$TMUX_ARGS_FILE2"
exit 0
fi
if [ "\$subcmd" = "list-panes" ]; then
# Return empty: no sidecar spawned (heartbeat is not the focus of this test).
echo ""
exit 0
fi
exit 0
SHIM2
chmod +x "$FAKE_BIN2/tmux"
SOCKET4="mosaic-agent-test4-$RANDOM-$$"
AGENT4="agent4-$RANDOM"
WORKDIR4=$(mktemp -d)
CLEANUP_DIRS+=("$WORKDIR4")
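# MOSAIC_RUNTIME_BIN points to a non-existent dir, so it contributes nothing to
# the prefix. The npm prefix, .npm-global/bin and .local/bin may or may not
# exist, so the prefix may or may not be empty — either way we only assert
# that exec and the agent command appear.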
PATH="$FAKE_BIN2:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET4" \
MOSAIC_AGENT_WORKDIR="$WORKDIR4" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="/nonexistent-dir-$$" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi" \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR4" \
"$START" "$AGENT4"
all_args4=$(cat "$TMUX_ARGS_FILE2" 2>/dev/null || true)
rm -f "$TMUX_ARGS_FILE2"
rm -rf "$WORKDIR4"
echo "$all_args4" | grep -qF "exec " || fail "pane command (no prefix dirs) does not use exec"
echo "$all_args4" | grep -qF "mosaic yolo pi" || fail "pane command does not include agent command when no prefix"
# ── Test 5: candidate dir already in LAUNCHER $PATH is still baked into pane ──
#
# Regression guard for the bug where _build_runtime_bin_prefix() used to skip
# a candidate because it was already present in the launcher process's $PATH.
# That check was wrong: the pane inherits the tmux SERVER environment, not the
# launcher's env. Even if a dir is on the launcher's PATH it must always be
# baked into the pane's PATH export.
#
# We prove this by setting PATH to include FAKE_RUNTIME_BIN5 (the candidate),
# then asserting the generated new-session command still exports it.
TMUX_ARGS_FILE5=$(mktemp)
FAKE_BIN5=$(mktemp -d)
FAKE_RUNTIME_BIN5=$(mktemp -d) # this dir IS on the launcher's PATH below
HB_RUN_DIR5=$(mktemp -d)
CLEANUP_DIRS+=("$FAKE_BIN5" "$FAKE_RUNTIME_BIN5" "$HB_RUN_DIR5")
cat > "$FAKE_BIN5/tmux" <<SHIM5
#!/usr/bin/env bash
subcmd="\$3"
if [ "\$subcmd" = "has-session" ]; then exit 1; fi
if [ "\$subcmd" = "new-session" ]; then
printf '%s\n' "\$@" > "$TMUX_ARGS_FILE5"
exit 0
fi
if [ "\$subcmd" = "list-panes" ]; then
# Return empty: no sidecar spawned (heartbeat is not the focus of this test).
echo ""
exit 0
fi
exit 0
SHIM5
chmod +x "$FAKE_BIN5/tmux"
SOCKET5="mosaic-agent-test5-$RANDOM-$$"
AGENT5="agent5-$RANDOM"
WORKDIR5=$(mktemp -d)
CLEANUP_DIRS+=("$WORKDIR5")
CLEANUP_SOCKETS+=("$SOCKET5")
# FAKE_RUNTIME_BIN5 is deliberately placed on the LAUNCHER PATH so that the
# old (buggy) code would have skipped it. The correct code must still include
# it in the pane PATH export.
PATH="$FAKE_BIN5:$FAKE_RUNTIME_BIN5:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET5" \
MOSAIC_AGENT_WORKDIR="$WORKDIR5" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN5" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi" \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR5" \
"$START" "$AGENT5"
all_args5=$(cat "$TMUX_ARGS_FILE5" 2>/dev/null || true)
rm -f "$TMUX_ARGS_FILE5"
rm -rf "$WORKDIR5"
echo "--- test 5: launcher-PATH candidate must still appear in pane export ---"
echo "$all_args5"
echo "--- end test 5 args ---"
echo "$all_args5" | grep -qF "export PATH=" || \
fail "test5: pane command does not export PATH when candidate is on launcher PATH"
echo "$all_args5" | grep -qF "$FAKE_RUNTIME_BIN5" || \
fail "test5: candidate dir (already on launcher PATH) was NOT baked into pane PATH — regression"
# ── Test 6: heartbeat sidecar — pane PID resolved + .hb file written ──────────
#
# Uses a real tmux session (same socket as test 1 which already has $AGENT) so
# list-panes returns a real pane PID. We override MOSAIC_HEARTBEAT_RUN_DIR to
# a temp dir and set a 1-second interval, then poll up to 5 times at 0.5 s
# intervals (~2.5 s total) for the .hb file to appear and check its content.
HB_RUN_DIR=$(mktemp -d)
CLEANUP_DIRS+=("$HB_RUN_DIR")
# Re-use the session+agent created in Test 1 (still alive on $SOCKET / $AGENT).
# We need to invoke the script for a NEW agent on the same socket to exercise
# the heartbeat path with a real pane PID.
AGENT6="agent6-$RANDOM"
MOSAIC_TMUX_SOCKET="$SOCKET" \
MOSAIC_AGENT_WORKDIR="$WORKDIR" \
MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR" \
MOSAIC_HEARTBEAT_INTERVAL="1" \
"$START" "$AGENT6"
HB_FILE="$HB_RUN_DIR/${AGENT6}.hb"
# Poll up to 5 times, 0.5 s apart (~2.5 s total), for the heartbeat file to appear.
_waited=0
until [ -f "$HB_FILE" ] || [ "$_waited" -ge 5 ]; do
sleep 0.5
_waited=$((_waited + 1))
done
[ -f "$HB_FILE" ] || fail "test6: heartbeat file not written at $HB_FILE within 5s"
hb_content=$(cat "$HB_FILE")
echo "--- test 6: heartbeat file content ---"
echo "$hb_content"
echo "--- end test 6 ---"
# Verify required fields are present.
echo "$hb_content" | grep -qE '^ts=[0-9]{4}-[0-9]{2}-[0-9]{2}T' || \
fail "test6: heartbeat ts field missing or malformed"
echo "$hb_content" | grep -qE '^pid=[0-9]+' || \
fail "test6: heartbeat pid field missing or malformed"
echo "$hb_content" | grep -qF 'status=ok' || \
fail "test6: heartbeat status=ok missing"
# ── Test 7: heartbeat sidecar — targets correct .hb path per agent name ────────
#
# Uses the fake-tmux shim approach (like tests 3-5) to capture the sidecar
# invocation without needing a real session. A fake setsid shim records its
# arguments so we can assert the sidecar script targets the expected .hb path
# and uses the configured interval.
FAKE_BIN7=$(mktemp -d)
FAKE_RUNTIME_BIN7=$(mktemp -d)
SETSID_ARGS_FILE=$(mktemp)
HB_RUN_DIR7=$(mktemp -d)
CLEANUP_DIRS+=("$FAKE_BIN7" "$FAKE_RUNTIME_BIN7" "$HB_RUN_DIR7")
AGENT7="my-fleet-agent-$RANDOM"
INTERVAL7="42"
# Fake tmux: has-session → not found; new-session → ok; list-panes → known PID.
cat > "$FAKE_BIN7/tmux" <<SHIM7
#!/usr/bin/env bash
subcmd="\$3"
if [ "\$subcmd" = "has-session" ]; then exit 1; fi
if [ "\$subcmd" = "new-session" ]; then exit 0; fi
if [ "\$subcmd" = "list-panes" ]; then echo "88888"; exit 0; fi
exit 0
SHIM7
chmod +x "$FAKE_BIN7/tmux"
# Fake setsid: capture the full argument list (bash -c <script>) for inspection,
# then exit immediately without running the sidecar script.
cat > "$FAKE_BIN7/setsid" <<'SETSID_SHIM'
#!/usr/bin/env bash
# argv: setsid bash -c <sidecar_script>
# Record the full argument list to the capture file, then exit cleanly.
printf '%s\0' "$@" > __SETSID_ARGS_FILE__
exit 0
SETSID_SHIM
# Patch the placeholder with the real capture-file path (avoids heredoc expansion issues).
sed -i "s|__SETSID_ARGS_FILE__|${SETSID_ARGS_FILE}|g" "$FAKE_BIN7/setsid"
chmod +x "$FAKE_BIN7/setsid"
SOCKET7="mosaic-agent-test7-$RANDOM-$$"
WORKDIR7=$(mktemp -d)
CLEANUP_DIRS+=("$WORKDIR7")
PATH="$FAKE_BIN7:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET7" \
MOSAIC_AGENT_WORKDIR="$WORKDIR7" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN7" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi" \
MOSAIC_HEARTBEAT_RUN_DIR="$HB_RUN_DIR7" \
MOSAIC_HEARTBEAT_INTERVAL="$INTERVAL7" \
"$START" "$AGENT7"
# Give the background setsid shim a moment to finish writing the capture file.
sleep 0.5
setsid_args=$(cat "$SETSID_ARGS_FILE" 2>/dev/null | tr '\0' '\n' || true)
rm -f "$SETSID_ARGS_FILE"
rm -rf "$WORKDIR7"
echo "--- test 7: captured setsid args ---"
echo "$setsid_args"
echo "--- end test 7 ---"
# The sidecar script (bash -c <script>) must reference the correct .hb path.
expected_hb="${HB_RUN_DIR7}/${AGENT7}.hb"
echo "$setsid_args" | grep -qF "$expected_hb" || \
fail "test7: sidecar script does not reference correct .hb path ($expected_hb)"
# The sidecar script must use the configured interval.
echo "$setsid_args" | grep -qF "$INTERVAL7" || \
fail "test7: sidecar script does not reference configured interval ($INTERVAL7)"
echo "ok - start-agent-session" echo "ok - start-agent-session"

View File

@@ -12,7 +12,7 @@
# 2. STRUCTURAL (private $HOME default in *.sh) — scanned everywhere EXCEPT examples/, # 2. STRUCTURAL (private $HOME default in *.sh) — scanned everywhere EXCEPT examples/,
# because worked example overlays/personas legitimately show placeholder paths. # because worked example overlays/personas legitimately show placeholder paths.
# #
# File types: *.md, *.sh, *.ps1, *.json, *.yml/*.yaml, *.toml, *.env, *.service, and the CLI scripts under # File types: *.md, *.sh, *.ps1, *.json, and the extensionless CLI scripts under
# tools/_scripts/. Excludes node_modules/ and this gate file. # tools/_scripts/. Excludes node_modules/ and this gate file.
# #
# NOTE: '\bPDA\b' intentionally matches "PDA-friendly" (the contamination removed in P2); # NOTE: '\bPDA\b' intentionally matches "PDA-friendly" (the contamination removed in P2);
@@ -39,7 +39,7 @@ cd "$FRAMEWORK_ROOT" || { echo "FRAMEWORK_ROOT not found: $FRAMEWORK_ROOT" >&2;
# Identity scope = ALL shipped text files (examples/ INCLUDED). # Identity scope = ALL shipped text files (examples/ INCLUDED).
_files_identity() { _files_identity() {
find . -type f \ find . -type f \
\( -name '*.md' -o -name '*.sh' -o -name '*.ps1' -o -name '*.json' -o -name '*.yml' -o -name '*.yaml' -o -name '*.toml' -o -name '*.env' -o -name '*.service' -o -path '*/tools/_scripts/*' \) \ \( -name '*.md' -o -name '*.sh' -o -name '*.ps1' -o -name '*.json' -o -path '*/tools/_scripts/*' \) \
-not -path '*/node_modules/*' -not -path "./$SELF_REL" -print0 -not -path '*/node_modules/*' -not -path "./$SELF_REL" -print0
} }
# Structural scope = shipped scripts, examples/ EXCLUDED. # Structural scope = shipped scripts, examples/ EXCLUDED.

View File

@@ -1,6 +1,6 @@
{ {
"name": "@mosaicstack/mosaic", "name": "@mosaicstack/mosaic",
"version": "0.0.35", "version": "0.0.34",
"repository": { "repository": {
"type": "git", "type": "git",
"url": "https://git.mosaicstack.dev/mosaicstack/stack.git", "url": "https://git.mosaicstack.dev/mosaicstack/stack.git",

View File

@@ -10,14 +10,11 @@ import {
buildAgentWatchCreateViewerCommand, buildAgentWatchCreateViewerCommand,
buildAgentWatchKillViewerCommand, buildAgentWatchKillViewerCommand,
buildAgentVerifyAcceptedCommand, buildAgentVerifyAcceptedCommand,
buildEnableLingerCommand,
buildFleetServiceCommand, buildFleetServiceCommand,
buildSystemdEnableCommand,
buildSystemdShowCommand, buildSystemdShowCommand,
buildTmuxListPanesCommand, buildTmuxListPanesCommand,
classifySendResult, classifySendResult,
detectDrift, detectDrift,
enableFleetUnits,
generateAgentEnv, generateAgentEnv,
getDefaultOperatorSourceLabel, getDefaultOperatorSourceLabel,
getDefaultTenantAndHost, getDefaultTenantAndHost,
@@ -31,12 +28,10 @@ import {
parseTmuxListPanes, parseTmuxListPanes,
registerFleetCommand, registerFleetCommand,
resolveFleetPaths, resolveFleetPaths,
RUNTIME_ACCEPTABLE_COMMANDS,
VERIFY_DEFAULT_TIMEOUT_MS, VERIFY_DEFAULT_TIMEOUT_MS,
VERIFY_POLL_INTERVAL_MS, VERIFY_POLL_INTERVAL_MS,
type AgentPsRow, type AgentPsRow,
type CommandRunner, type CommandRunner,
type FleetRoster,
type InteractiveRunner, type InteractiveRunner,
type SleepFn, type SleepFn,
} from './fleet.js'; } from './fleet.js';
@@ -914,118 +909,6 @@ describe('fleet ps — drift detection', () => {
it('does NOT flag drift when pane command is null (pane dead)', () => { it('does NOT flag drift when pane command is null (pane dead)', () => {
expect(detectDrift('pi', null)).toBe(false); expect(detectDrift('pi', null)).toBe(false);
}); });
// `node` is an accepted pane command for runtimes wrapped by `mosaic yolo`.
it('does NOT flag drift when pane=node for wrapped pi agent (mosaic yolo pi)', () => {
  const drifted = detectDrift('pi', 'node');
  expect(drifted).toBe(false);
});
it('does NOT flag drift when pane=node for wrapped codex agent (mosaic yolo codex)', () => {
  const drifted = detectDrift('codex', 'node');
  expect(drifted).toBe(false);
});
// python3 belongs to the dogfood runtime only; under `pi` it must still be drift.
it('flags drift when pane=python3 for pi runtime (canary-pi dogfood regression guard)', () => {
  const drifted = detectDrift('pi', 'python3');
  expect(drifted).toBe(true);
});
it('does NOT flag drift when pane=python3 for dogfood runtime', () => {
  const drifted = detectDrift('dogfood', 'python3');
  expect(drifted).toBe(false);
});
it('flags drift for unknown pane command on known runtime', () => {
  const drifted = detectDrift('claude', 'bash');
  expect(drifted).toBe(true);
});
// Pins the exported table itself so changes to the accepted commands are deliberate.
it('RUNTIME_ACCEPTABLE_COMMANDS is exported and contains expected entries', () => {
  const { pi, dogfood, codex } = RUNTIME_ACCEPTABLE_COMMANDS;
  expect(pi).toContain('node');
  expect(pi).not.toContain('python3');
  expect(dogfood).toContain('python3');
  expect(codex).toContain('node');
});
});
describe('fleet install — auto-enable units for boot-survival', () => {
// Shared fixtures: every case below uses the same single-agent roster and a
// runner stub that records each invocation as [command, ...args].
const buildMinimalRoster = (): FleetRoster => ({
  version: 1,
  transport: 'tmux',
  tmux: { socketName: 'mosaic-factory', holderSession: '_holder' },
  defaults: { workingDirectory: '~/src' },
  runtimes: { codex: { resetCommand: '/clear' } },
  agents: [{ name: 'coder0', runtime: 'codex', className: 'worker' }],
});

// When `failOnEnable` is set, any `systemctl … enable …` call reports exit code 1;
// every other call succeeds with empty output.
const recordingRunner =
  (log: string[][], failOnEnable = false): CommandRunner =>
  async (command, args) => {
    log.push([command, ...args]);
    if (failOnEnable && command === 'systemctl' && args.includes('enable')) {
      return { stdout: '', stderr: 'Unit not found', exitCode: 1 };
    }
    return { stdout: '', stderr: '', exitCode: 0 };
  };

it('buildSystemdEnableCommand and buildEnableLingerCommand return correct command arrays', () => {
  const enableArgv = buildSystemdEnableCommand('mosaic-tmux-holder.service');
  expect(enableArgv).toEqual(['systemctl', '--user', 'enable', 'mosaic-tmux-holder.service']);

  const lingerArgv = buildEnableLingerCommand('testuser');
  expect(lingerArgv).toEqual(['loginctl', 'enable-linger', 'testuser']);
});

it('enables holder and each agent unit via injected runner after install', async () => {
  const calls: string[][] = [];

  await enableFleetUnits(recordingRunner(calls), buildMinimalRoster(), {});

  expect(calls).toContainEqual(['systemctl', '--user', 'enable', 'mosaic-tmux-holder.service']);
  expect(calls).toContainEqual(['systemctl', '--user', 'enable', 'mosaic-agent@coder0.service']);
});

it('install still succeeds when systemctl enable returns non-zero (non-fatal)', async () => {
  const calls: string[][] = [];
  // Simulate systemctl enable failure
  const failingRunner = recordingRunner(calls, true);

  // Must NOT reject/throw even when enable calls fail
  await expect(enableFleetUnits(failingRunner, buildMinimalRoster(), {})).resolves.toBeUndefined();

  // The enable attempt must have been made
  const attemptedEnable = calls.some((c) => c.includes('enable'));
  expect(attemptedEnable).toBe(true);
});

it('--no-enable skips all systemctl enable and loginctl linger calls', async () => {
  const calls: string[][] = [];

  await enableFleetUnits(recordingRunner(calls), buildMinimalRoster(), { enable: false });

  // No calls should include 'enable'
  expect(calls.some((c) => c.includes('enable'))).toBe(false);
  // No loginctl calls at all
  expect(calls.some((c) => c[0] === 'loginctl')).toBe(false);
});
}); });
describe('fleet ps — tenant and host', () => { describe('fleet ps — tenant and host', () => {

View File

@@ -210,93 +210,6 @@ export function buildFleetServiceCommand(action: FleetServiceAction, agentName?:
return ['systemctl', '--user', action, service]; return ['systemctl', '--user', action, service];
} }
/**
 * Build the argv that enables a unit in the user's systemd instance.
 * enableFleetUnits runs this for the holder unit and for every agent unit.
 *
 * @param unit - Unit name to enable, e.g. `mosaic-tmux-holder.service`.
 * @returns The command as `['systemctl', '--user', 'enable', unit]`.
 */
export function buildSystemdEnableCommand(unit: string): string[] {
  const argv: string[] = ['systemctl', '--user', 'enable'];
  argv.push(unit);
  return argv;
}
/**
 * Build the argv that turns on systemd "linger" for an account, so that
 * account's user services are allowed to survive logout.
 *
 * @param user - Login name passed to `loginctl enable-linger`.
 * @returns The command as `['loginctl', 'enable-linger', user]`.
 */
export function buildEnableLingerCommand(user: string): string[] {
  const program = 'loginctl';
  const verb = 'enable-linger';
  return [program, verb, user];
}
/**
 * Enable fleet units for boot-survival after install.
 *
 * Enables `mosaic-tmux-holder.service` first, then `mosaic-agent@<name>.service`
 * for every agent in the roster (in roster order), and finally makes a
 * best-effort attempt to enable linger for the current user.
 *
 * The whole step is non-fatal:
 * - a non-zero exit from `systemctl enable` prints a warning to stderr and the
 *   remaining units are still attempted;
 * - a non-zero exit from `loginctl enable-linger` prints a hint to stderr;
 * - anything thrown inside the step (e.g. by the runner) is caught and
 *   reported as a warning instead of propagating.
 *
 * @param runner - Command runner used for every systemctl/loginctl invocation.
 * @param roster - Fleet roster; only `agents[].name` is read here.
 * @param opts - `enable: false` (the `--no-enable` flag) skips the step
 *   entirely, so the runner is never invoked.
 */
export async function enableFleetUnits(
  runner: CommandRunner,
  roster: FleetRoster,
  opts: { enable?: boolean },
): Promise<void> {
  if (opts.enable === false) {
    return;
  }

  try {
    let succeeded = 0;
    let failed = 0;
    const tally = (enabled: boolean): void => {
      if (enabled) {
        succeeded++;
      } else {
        failed++;
      }
    };

    // The holder unit goes first, then one templated unit per roster agent.
    tally(await tryEnableFleetUnit(runner, 'mosaic-tmux-holder.service'));
    for (const agent of roster.agents) {
      tally(await tryEnableFleetUnit(runner, `mosaic-agent@${agent.name}.service`));
    }

    if (succeeded > 0) {
      console.log(`Enabled ${succeeded} unit(s) for boot-survival.`);
    }
    if (failed > 0) {
      process.stderr.write(
        `Warning: ${failed} unit(s) could not be enabled (systemctl unavailable?). Run manually if needed.\n`,
      );
    }

    // Best-effort linger
    const username = resolveLingerUsername();
    if (username === undefined) {
      // Do not run loginctl with a placeholder name: it could target an
      // unrelated account, and the hint would name the wrong user.
      process.stderr.write(
        `Hint: could not determine the current user; run 'loginctl enable-linger <user>' as root to survive logout.\n`,
      );
      return;
    }
    const lingerResult = await runner(...splitCommand(buildEnableLingerCommand(username)));
    if (lingerResult.exitCode !== 0) {
      process.stderr.write(
        `Hint: run 'loginctl enable-linger ${username}' as root to survive logout.\n`,
      );
    }
  } catch (err) {
    process.stderr.write(
      `Warning: auto-enable step failed unexpectedly: ${err instanceof Error ? err.message : String(err)}\n`,
    );
  }
}

/** Runs `systemctl --user enable <unit>`; on a non-zero exit writes a warning to stderr and returns false. */
async function tryEnableFleetUnit(runner: CommandRunner, unit: string): Promise<boolean> {
  const result = await runner(...splitCommand(buildSystemdEnableCommand(unit)));
  if (result.exitCode === 0) {
    return true;
  }
  process.stderr.write(
    `Warning: could not enable ${unit}: ${result.stderr || result.stdout || 'non-zero exit'}\n`,
  );
  return false;
}

/** Resolves the current login name from `userInfo()`, then `USER`, then `LOGNAME`; undefined when none yields a non-empty name. */
function resolveLingerUsername(): string | undefined {
  try {
    const { username } = userInfo();
    if (username) {
      return username;
    }
  } catch {
    // userInfo() threw; fall back to the environment below.
  }
  // `||` rather than `??` so an empty-string variable also falls through.
  return process.env['USER'] || process.env['LOGNAME'] || undefined;
}
export function buildAgentSendCommand( export function buildAgentSendCommand(
paths: FleetPaths, paths: FleetPaths,
agentName: string, agentName: string,
@@ -524,41 +437,32 @@ export function parseTmuxListPanes(
return { pid, command, dead, idleSeconds }; return { pid, command, dead, idleSeconds };
} }
/**
* Maps each known runtime to the set of acceptable pane commands.
* A pane running any of these commands for the given runtime is NOT considered drifted.
* Runtimes launched via `mosaic yolo` wrap in node, so 'node' is acceptable for most.
* The dogfood runtime accepts python3/python (the canary-pi dogfood stub).
*
* detectDrift looks up the roster runtime in this table and tests the pane
* command with an exact `includes` match; a runtime that has no entry here is
* never reported as drifted.
*/
export const RUNTIME_ACCEPTABLE_COMMANDS: Record<string, readonly string[]> = {
// Native binary name, plus `node` for the `mosaic yolo` wrapper.
claude: ['claude', 'node'],
codex: ['codex', 'node'],
opencode: ['opencode', 'node'],
// python3 is deliberately absent: a pi agent whose pane runs python3 is flagged as drift.
pi: ['pi', 'node'],
// Only the python interpreters are accepted for dogfood; `node` is not listed.
dogfood: ['python3', 'python'],
};
/** /**
* Determine if there is a runtime drift: roster says one runtime but the pane * Determine if there is a runtime drift: roster says one runtime but the pane
* is actually running something from a different runtime. We detect this by * is actually running something from a different runtime. We detect this by
* checking if the pane command doesn't match a known acceptable command for the * checking if the pane command doesn't match a known canonical command for the
* roster's declared runtime. * roster's declared runtime.
* *
* Known acceptable commands per runtime (see RUNTIME_ACCEPTABLE_COMMANDS): * Known canonical commands per runtime:
* claude → claude, node (node covers mosaic yolo wrapper) * claude → claude
* codex → codex, node * codex → codex
* opencode → opencode, node * opencode → opencode
* pi → pi, node (python3 still flags drift for canary-pi dogfood stub) * pi → pi
* dogfood → python3, python
* *
* If the pane is running something else (e.g., python3/dogfood-agent.py) for * If the pane is running something else (e.g., python3/dogfood-agent.py) for
* an agent whose roster runtime is "pi", that's a drift. * an agent whose roster runtime is "pi", that's a drift.
*/ */
export function detectDrift(rosterRuntime: string, paneCommand: string | null): boolean { export function detectDrift(rosterRuntime: string, paneCommand: string | null): boolean {
if (!paneCommand) return false; if (!paneCommand) return false;
const acceptable = RUNTIME_ACCEPTABLE_COMMANDS[rosterRuntime]; const knownCommands: Record<string, string[]> = {
if (!acceptable) return false; claude: ['claude'],
return !acceptable.includes(paneCommand); codex: ['codex'],
opencode: ['opencode'],
pi: ['pi'],
};
const expected = knownCommands[rosterRuntime];
if (!expected) return false;
return !expected.includes(paneCommand);
} }
/** /**
@@ -802,22 +706,12 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
cmd cmd
.command('install') .command('install')
.description('Install local fleet tools and user systemd units') .description('Install local fleet tools and user systemd units')
.option('--no-enable', 'Skip enabling units for boot-survival') .action(async () => installFleet(cmd, frameworkRoot));
.action(async (opts: { enable?: boolean }) => {
await installFleet(cmd, frameworkRoot);
const roster = await loadRosterForCommand(cmd);
await enableFleetUnits(runner, roster, opts);
});
cmd cmd
.command('install-systemd') .command('install-systemd')
.description('Install local fleet tools and user systemd units') .description('Install local fleet tools and user systemd units')
.option('--no-enable', 'Skip enabling units for boot-survival') .action(async () => installFleet(cmd, frameworkRoot));
.action(async (opts: { enable?: boolean }) => {
await installFleet(cmd, frameworkRoot);
const roster = await loadRosterForCommand(cmd);
await enableFleetUnits(runner, roster, opts);
});
for (const action of ['start', 'stop', 'restart'] as const) { for (const action of ['start', 'stop', 'restart'] as const) {
cmd cmd