diff --git a/docs/fleet/north-star.md b/docs/fleet/north-star.md index 22b6857..2e6dbba 100644 --- a/docs/fleet/north-star.md +++ b/docs/fleet/north-star.md @@ -115,6 +115,11 @@ Every artifact, starting Phase 2, MUST: - Observation: **read-only default, opt-in takeover**. - Multi-host: **designed-for from day one**; control plane **rides federation (W1)**. - Delivery: **CLI-first now**, dogfood against the live stub fleet; webUI deferred to Phase 5. +- Runtimes: fleet agents default to **Codex / pi-on-Codex**; **Claude is reserved for Claude + Code only** (avoid alternate-harness API pricing). Validated durable recipe: + `mosaic yolo pi --model openai-codex/gpt-5.5:high`. Durable detached launch requires the + runtime-bin on PATH (baked into the pane command) + boot-survival (`enable` + linger), + which `fleet init` should automate. ## Assumptions (veto-able) diff --git a/docs/scratchpads/fleet-observability-phase2.md b/docs/scratchpads/fleet-observability-phase2.md index 22f0694..e499f6d 100644 --- a/docs/scratchpads/fleet-observability-phase2.md +++ b/docs/scratchpads/fleet-observability-phase2.md @@ -73,3 +73,28 @@ with a second agent on `dragon-lin`. tmux session-name fallback; the systemd/tmux env handoff needs a real fix. - Next: rebase on merged main, open Phase-2 PR, dual-engine review, merge, close `fleet-observability-1`. Defer launch-path + env-propagation fixes to Phase 3. +- 2026-06-21 (session 3): Phase-2 PR #579 merged (3 dual-engine rounds hardened + verify+watch). Then closed the launch-path question with Jason's input — CORRECTING + earlier findings: + - The ad-hoc launch deaths were NOT a fundamental TTY blocker: (a) codex was a stale + version (Jason updated it); (b) pi was misconfigured to Claude auth (Jason removed it; + default is now Codex). 
The REAL durable-launch bug is **PATH**: the detached tmux + launch shell is login+non-interactive, so it misses `~/.npm-global/bin` (added only in + `~/.bashrc`) -> `mosaic: command not found` (127) -> pane dies. tmux panes inherit the + tmux _server_ env, so PATH must be baked into the pane command. + - **Durable real-agent recipe (validated live on gpt-5.5, Claude-free):** + `mosaic yolo pi --model openai-codex/gpt-5.5:high` — pi tolerates detached tmux; a raw + interactive TUI (codex CLI) exits without an attached client. Status line confirmed + `(openai-codex) gpt-5.5 • high`. + - PATH fix landed in `start-agent-session.sh` (commit 32efc13, branch + feat/fleet-launch-path): derive runtime-bin prefix (MOSAIC_RUNTIME_BIN | npm prefix | + ~/.npm-global/bin | ~/.local/bin), bake `export PATH=...; exec $MOSAIC_AGENT_COMMAND` into the pane; + `exec` also fixes the drift false-positive. Live-tested under stripped PATH -> durable. + - Boot-survival: Jason ran `systemctl --user enable` (+ linger). TODO: auto-enable in + **fleet init** so operators never have to remember it (agentic-enhancement cycle). + - Future custom Pi harness build: pi cannot self-report its model (track + runtime/model/effort as fleet metadata); drift detection should recognize `node` as + pi's pane command (a node-wrapped pane can currently read as drift). + - Findings recorded in AI Guide playbooks/tmux-fleet.md (aiguide PR #7, merged). + - Policy: avoid Claude outside Claude Code (API pricing for alt-harness use) — fleet + runtimes default to Codex / pi-on-Codex; Claude stays in Claude Code only. 
diff --git a/packages/mosaic/framework/tools/fleet/start-agent-session.sh b/packages/mosaic/framework/tools/fleet/start-agent-session.sh index 39f2653..dddf167 100755 --- a/packages/mosaic/framework/tools/fleet/start-agent-session.sh +++ b/packages/mosaic/framework/tools/fleet/start-agent-session.sh @@ -26,5 +26,75 @@ if [ -z "$MOSAIC_AGENT_COMMAND" ]; then MOSAIC_AGENT_COMMAND="mosaic yolo $MOSAIC_AGENT_RUNTIME" fi +# ── Derive a runtime-bin PATH prefix ───────────────────────────────────────── +# Precedence: +# 1. $MOSAIC_RUNTIME_BIN (explicit override) +# 2. $(npm config get prefix)/bin (if npm is on PATH) +# 3. Fallbacks: $HOME/.npm-global/bin and $HOME/.local/bin +# +# Only directories that already exist are included. The prefix is baked into +# the pane command regardless of what the LAUNCHER process's $PATH contains, +# because the tmux pane inherits the tmux SERVER environment (not this script's +# environment). A dir on the launcher's PATH may be absent from the server PATH, +# so every existing candidate must always be included. Dedup within the +# constructed prefix avoids listing the same dir twice. 
_build_runtime_bin_prefix() {
  # Print (without a trailing newline) a colon-joined list of the runtime-bin
  # directories that exist, in precedence order and without duplicates.
  # Prints nothing when no candidate directory exists.
  #
  # Globals:  MOSAIC_RUNTIME_BIN (read, optional), HOME (read), PWD (read)
  # Outputs:  the prefix on stdout; warnings about skipped dirs on stderr
  # Returns:  0
  local -a candidates=()
  local npm_prefix=""
  local dir # loop variable; kept local so it does not leak into the launcher
  local prefix=""

  if [ -n "${MOSAIC_RUNTIME_BIN:-}" ]; then
    candidates+=("$MOSAIC_RUNTIME_BIN")
  fi

  if command -v npm >/dev/null 2>&1; then
    # Best effort: a failing npm must not abort the launch.
    npm_prefix=$(npm config get prefix 2>/dev/null) || true
    if [ -n "$npm_prefix" ]; then
      candidates+=("${npm_prefix}/bin")
    fi
  fi

  # With an empty HOME these would degrade to /.npm-global/bin and /.local/bin.
  if [ -n "${HOME:-}" ]; then
    candidates+=("$HOME/.npm-global/bin" "$HOME/.local/bin")
  fi

  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on old bash.
  for dir in ${candidates[@]+"${candidates[@]}"}; do
    [ -n "$dir" ] || continue

    # The pane is started in the agent workdir, not in this script's cwd, so a
    # relative entry has to be anchored to the launcher's cwd now.
    case "$dir" in
      /*) ;;
      *) dir="${PWD}/${dir}" ;;
    esac

    [ -d "$dir" ] || continue

    # ':' cannot be represented inside PATH, and the prefix is later embedded
    # in a double-quoted `bash -c` snippet where '"', '$', '`' and '\' would
    # break the quoting. Refuse such directories instead of baking them in.
    case "$dir" in
      *:* | *\"* | *\$* | *\`* | *\\*)
        printf 'start-agent-session: skipping unsafe runtime-bin dir: %s\n' "$dir" >&2
        continue
        ;;
    esac

    case ":${prefix}:" in
      *":${dir}:"*) ;; # already in our prefix — skip
      *) prefix="${prefix:+${prefix}:}${dir}" ;;
    esac
  done

  printf '%s' "$prefix"
}

MOSAIC_RUNTIME_BIN_PREFIX=$(_build_runtime_bin_prefix)

# ── Build the pane command ────────────────────────────────────────────────────
# The pane command must:
#   - Export the augmented PATH so the runtime binary is found.
#   - exec the agent command so the runtime is the pane's foreground process
#     (makes `fleet ps` pane_current_command check reliable; no DRIFT false-positive).
#
# Quoting strategy: the snippet is assembled as ONE double-quoted string that
# is handed to `bash -c` inside the pane.
#   - MOSAIC_RUNTIME_BIN_PREFIX and the text of MOSAIC_AGENT_COMMAND are
#     interpolated NOW (in this script), because the pane shell inherits the
#     tmux server environment, not this script's env.
#   - `\${PATH}` is escaped so that it expands LATER, inside the pane shell,
#     against the tmux server's PATH.
#   - Variable references that appear literally inside MOSAIC_AGENT_COMMAND are
#     not re-expanded here; they expand inside the pane shell.
# The prefix sits inside double quotes in that snippet, which is why
# _build_runtime_bin_prefix refuses directories containing '"', '$', '`' or '\'.
+ +if [ -n "$MOSAIC_RUNTIME_BIN_PREFIX" ]; then + PANE_SHELL_SNIPPET="export PATH=\"${MOSAIC_RUNTIME_BIN_PREFIX}:\${PATH}\"; exec ${MOSAIC_AGENT_COMMAND}" +else + PANE_SHELL_SNIPPET="exec ${MOSAIC_AGENT_COMMAND}" +fi + mkdir -p "$MOSAIC_AGENT_WORKDIR" -exec tmux -L "$MOSAIC_TMUX_SOCKET" new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" "$MOSAIC_AGENT_COMMAND" +exec tmux -L "$MOSAIC_TMUX_SOCKET" new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" \ + bash -c "$PANE_SHELL_SNIPPET" diff --git a/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh b/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh index 47107c5..3325db0 100755 --- a/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh +++ b/packages/mosaic/framework/tools/fleet/test-start-agent-session.sh @@ -6,13 +6,26 @@ START="$SCRIPT_DIR/start-agent-session.sh" SOCKET="mosaic-agent-test-$RANDOM-$$" AGENT="agent-$RANDOM" WORKDIR=$(mktemp -d) -trap 'tmux -L "$SOCKET" kill-server >/dev/null 2>&1 || true; rm -rf "$WORKDIR"' EXIT + +# Keep a single cleanup trap that accumulates resources. 
+CLEANUP_DIRS=("$WORKDIR") +CLEANUP_SOCKETS=("$SOCKET") +trap '_cleanup' EXIT +_cleanup() { + for s in "${CLEANUP_SOCKETS[@]:-}"; do + tmux -L "$s" kill-server >/dev/null 2>&1 || true + done + for d in "${CLEANUP_DIRS[@]:-}"; do + rm -rf "$d" + done +} fail() { echo "FAIL: $*" >&2 exit 1 } +# ── Test 1: basic session creation with workdir check ───────────────────────── MOSAIC_TMUX_SOCKET="$SOCKET" \ MOSAIC_AGENT_WORKDIR="$WORKDIR" \ MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \ @@ -22,6 +35,7 @@ tmux -L "$SOCKET" has-session -t "=$AGENT:0.0" || fail "agent session was not cr actual_dir=$(tmux -L "$SOCKET" display-message -p -t "=$AGENT:0.0" '#{pane_current_path}') [ "$actual_dir" = "$WORKDIR" ] || fail "agent workdir mismatch: $actual_dir" +# ── Test 2: idempotency (duplicate start prints 'already running') ───────────── MOSAIC_TMUX_SOCKET="$SOCKET" \ MOSAIC_AGENT_WORKDIR="$WORKDIR" \ MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \ @@ -29,4 +43,166 @@ MOSAIC_AGENT_COMMAND='bash --noprofile --norc -i' \ grep -qF 'already running' /tmp/mosaic-start-agent-idempotent.out || fail "duplicate start was not idempotent" +# ── Test 3: runtime-bin PATH prefix is baked into the pane command ──────────── +# +# We capture the command the script would hand to tmux by injecting a fake +# 'tmux' shim into PATH. The shim: +# - Intercepts 'new-session' calls and records its arguments to a file. +# - For 'has-session' calls, exits 1 (session does not exist) so the script +# proceeds to launch instead of printing "already running". +# - For all other subcommands, exits 0. +# +# Assertions: +# a) 'export PATH=' with the synthetic MOSAIC_RUNTIME_BIN prefix appears. +# b) 'exec' appears so the runtime replaces the wrapper shell. +# c) MOSAIC_AGENT_COMMAND with flags is forwarded intact. 

FAKE_BIN=$(mktemp -d) || fail "mktemp failed"
FAKE_RUNTIME_BIN=$(mktemp -d) || fail "mktemp failed"
TMUX_ARGS_FILE=$(mktemp) || fail "mktemp failed"
# The args file is registered too, so it is removed even if the launcher aborts
# before the explicit rm below.
CLEANUP_DIRS+=("$FAKE_BIN" "$FAKE_RUNTIME_BIN" "$TMUX_ARGS_FILE")

# Write the fake tmux shim (uses only positional args, no sourced vars).
# The heredoc delimiter is unquoted on purpose: $TMUX_ARGS_FILE is expanded
# while the shim is written, everything escaped with a backslash is left for
# the shim itself to expand at run time.
cat > "$FAKE_BIN/tmux" <<SHIM
#!/usr/bin/env bash
# Fake tmux, invoked as: tmux -L SOCKET SUBCMD ...
# The subcommand is located by scanning the arguments, so the shim does not
# depend on how many global options precede it.
subcmd=""
for arg in "\$@"; do
  case "\$arg" in
    has-session | new-session)
      subcmd="\$arg"
      break
      ;;
  esac
done
if [ "\$subcmd" = "has-session" ]; then
  exit 1 # session not found → script will attempt new-session
fi
if [ "\$subcmd" = "new-session" ]; then
  printf '%s\n' "\$@" > "$TMUX_ARGS_FILE"
  exit 0
fi
exit 0
SHIM
chmod +x "$FAKE_BIN/tmux"

SOCKET3="mosaic-agent-test3-$RANDOM-$$"
AGENT3="agent3-$RANDOM"
WORKDIR3=$(mktemp -d) || fail "mktemp failed"
CLEANUP_DIRS+=("$WORKDIR3")

PATH="$FAKE_BIN:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET3" \
MOSAIC_AGENT_WORKDIR="$WORKDIR3" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi --model openai-codex/gpt-5.5:high" \
  "$START" "$AGENT3"

# An unreadable/missing args file simply yields empty args; the assertions
# below then report the failure.
all_args=$(cat "$TMUX_ARGS_FILE" 2>/dev/null || true)
rm -f "$TMUX_ARGS_FILE"

echo "--- captured tmux new-session args ---"
printf '%s\n' "$all_args"
echo "--- end args ---"

# a) PATH prefix containing FAKE_RUNTIME_BIN must appear.
grep -qF -- "export PATH=" <<<"$all_args" || fail "pane command does not export PATH"
grep -qF -- "$FAKE_RUNTIME_BIN" <<<"$all_args" || fail "pane command does not include MOSAIC_RUNTIME_BIN in PATH prefix"

# b) exec must appear so the runtime replaces the wrapper shell.
grep -qF -- "exec " <<<"$all_args" || fail "pane command does not use exec"

# c) Full MOSAIC_AGENT_COMMAND (with flags) must be forwarded.
grep -qF -- "mosaic yolo pi --model openai-codex/gpt-5.5:high" <<<"$all_args" ||
  fail "pane command does not forward MOSAIC_AGENT_COMMAND with flags intact"

# ── Test 4: when no extra runtime-bin dirs exist, exec still appears ───────────
TMUX_ARGS_FILE2=$(mktemp) || fail "mktemp failed"
FAKE_BIN2=$(mktemp -d) || fail "mktemp failed"
# The args file is registered too, so it is removed even if the launcher aborts.
CLEANUP_DIRS+=("$FAKE_BIN2" "$TMUX_ARGS_FILE2")

# Fake tmux: has-session reports "no such session", new-session records its
# arguments. $TMUX_ARGS_FILE2 is expanded while the shim is written; escaped
# references are expanded by the shim at run time.
cat > "$FAKE_BIN2/tmux" <<SHIM2
#!/usr/bin/env bash
subcmd=""
for arg in "\$@"; do
  case "\$arg" in
    has-session | new-session)
      subcmd="\$arg"
      break
      ;;
  esac
done
if [ "\$subcmd" = "has-session" ]; then
  exit 1
fi
if [ "\$subcmd" = "new-session" ]; then
  printf '%s\n' "\$@" > "$TMUX_ARGS_FILE2"
  exit 0
fi
exit 0
SHIM2
chmod +x "$FAKE_BIN2/tmux"

SOCKET4="mosaic-agent-test4-$RANDOM-$$"
AGENT4="agent4-$RANDOM"
WORKDIR4=$(mktemp -d) || fail "mktemp failed"
CLEANUP_DIRS+=("$WORKDIR4")

# MOSAIC_RUNTIME_BIN points to a non-existent dir, so the override contributes
# nothing. The npm prefix, ~/.npm-global/bin and ~/.local/bin may or may not
# exist on this machine, so the prefix is NOT guaranteed to be empty; the
# assertions therefore only require exec + the agent command, and that the
# missing dir was not baked in.
MISSING_RUNTIME_BIN="/nonexistent-dir-$$"
PATH="$FAKE_BIN2:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET4" \
MOSAIC_AGENT_WORKDIR="$WORKDIR4" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$MISSING_RUNTIME_BIN" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi" \
  "$START" "$AGENT4"

all_args4=$(cat "$TMUX_ARGS_FILE2" 2>/dev/null || true)
rm -f "$TMUX_ARGS_FILE2"
rm -rf "$WORKDIR4"

grep -qF -- "exec " <<<"$all_args4" || fail "pane command (no prefix dirs) does not use exec"
grep -qF -- "mosaic yolo pi" <<<"$all_args4" || fail "pane command does not include agent command when no prefix"
if grep -qF -- "$MISSING_RUNTIME_BIN" <<<"$all_args4"; then
  fail "pane command bakes in a non-existent MOSAIC_RUNTIME_BIN"
fi

# ── Test 5: candidate dir already in LAUNCHER $PATH is still baked into pane ──
#
# Regression guard for the bug where _build_runtime_bin_prefix() used to skip
# a candidate because it was already present in the launcher process's $PATH.
# That check was wrong: the pane inherits the tmux SERVER environment, not the
# launcher's env. Even if a dir is on the launcher's PATH it must always be
# baked into the pane's PATH export.
#
# We prove this by setting PATH to include FAKE_RUNTIME_BIN5 (the candidate),
# then asserting the generated new-session command still exports it.
TMUX_ARGS_FILE5=$(mktemp) || fail "mktemp failed"
FAKE_BIN5=$(mktemp -d) || fail "mktemp failed"
FAKE_RUNTIME_BIN5=$(mktemp -d) || fail "mktemp failed" # this dir IS on the launcher's PATH below
# The args file is registered too, so it is removed even if the launcher aborts.
CLEANUP_DIRS+=("$FAKE_BIN5" "$FAKE_RUNTIME_BIN5" "$TMUX_ARGS_FILE5")

# Fake tmux: has-session reports "no such session", new-session records its
# arguments. $TMUX_ARGS_FILE5 is expanded while the shim is written; escaped
# references are expanded by the shim at run time.
cat > "$FAKE_BIN5/tmux" <<SHIM5
#!/usr/bin/env bash
subcmd=""
for arg in "\$@"; do
  case "\$arg" in
    has-session | new-session)
      subcmd="\$arg"
      break
      ;;
  esac
done
if [ "\$subcmd" = "has-session" ]; then
  exit 1
fi
if [ "\$subcmd" = "new-session" ]; then
  printf '%s\n' "\$@" > "$TMUX_ARGS_FILE5"
  exit 0
fi
exit 0
SHIM5
chmod +x "$FAKE_BIN5/tmux"

SOCKET5="mosaic-agent-test5-$RANDOM-$$"
AGENT5="agent5-$RANDOM"
WORKDIR5=$(mktemp -d) || fail "mktemp failed"
CLEANUP_DIRS+=("$WORKDIR5")
CLEANUP_SOCKETS+=("$SOCKET5")

# FAKE_RUNTIME_BIN5 is deliberately placed on the LAUNCHER PATH so that the
# old (buggy) code would have skipped it. The correct code must still include
# it in the pane PATH export.
PATH="$FAKE_BIN5:$FAKE_RUNTIME_BIN5:$PATH" \
MOSAIC_TMUX_SOCKET="$SOCKET5" \
MOSAIC_AGENT_WORKDIR="$WORKDIR5" \
MOSAIC_AGENT_RUNTIME="pi" \
MOSAIC_RUNTIME_BIN="$FAKE_RUNTIME_BIN5" \
MOSAIC_AGENT_COMMAND="mosaic yolo pi" \
  "$START" "$AGENT5"

all_args5=$(cat "$TMUX_ARGS_FILE5" 2>/dev/null || true)
rm -f "$TMUX_ARGS_FILE5"
rm -rf "$WORKDIR5"

echo "--- test 5: launcher-PATH candidate must still appear in pane export ---"
printf '%s\n' "$all_args5"
echo "--- end test 5 args ---"

grep -qF -- "export PATH=" <<<"$all_args5" ||
  fail "test5: pane command does not export PATH when candidate is on launcher PATH"
grep -qF -- "$FAKE_RUNTIME_BIN5" <<<"$all_args5" ||
  fail "test5: candidate dir (already on launcher PATH) was NOT baked into pane PATH — regression"

echo "ok - start-agent-session"