Compare commits
10 Commits
chore/ci-b
...
release/mo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
96a9892717 | ||
| d887555852 | |||
| e3adc6a1bc | |||
| aa27c42129 | |||
| 16ae809442 | |||
| 6980e40e51 | |||
| e6b53ea103 | |||
| 4da87640e8 | |||
| a38a491403 | |||
| 78d67c6261 |
7
.gitignore
vendored
7
.gitignore
vendored
@@ -15,3 +15,10 @@ infra/step-ca/dev-password
|
||||
|
||||
# Scratch dirs created by the framework git-wrapper shell test harnesses
|
||||
.mosaic-test-work/
|
||||
|
||||
# Transient config files vite/vitest/esbuild write next to a *.config.ts while
|
||||
# loading it, then unlink. They are untracked but were not ignored, so turbo's
|
||||
# package traversal hashed them and intermittently failed CI with "Package
|
||||
# traversal error: ... .timestamp-*.mjs: No such file or directory" when the
|
||||
# file vanished mid-scan. Ignoring them removes the race.
|
||||
*.timestamp-*.mjs
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# &node_image is the pre-baked CI base built by .woodpecker/ci-image.yml:
|
||||
# node:22-alpine + python3/make/g++/postgresql-client + pnpm + a warm pnpm
|
||||
# node:24-alpine + python3/make/g++/postgresql-client + pnpm + a warm pnpm
|
||||
# store. The install step resolves from the baked store (--prefer-offline)
|
||||
# instead of paying a ~731s cold fetch + native compile every run.
|
||||
variables:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Runs only on main branch push/tag
|
||||
|
||||
variables:
|
||||
# Pre-baked CI base (see .woodpecker/ci-image.yml): node:22-alpine +
|
||||
# Pre-baked CI base (see .woodpecker/ci-image.yml): node:24-alpine +
|
||||
# toolchain + warm pnpm store. Kills the second cold install publish pays.
|
||||
- &node_image 'git.mosaicstack.dev/mosaicstack/stack/ci-base:latest'
|
||||
- &enable_pnpm 'corepack enable'
|
||||
|
||||
@@ -14,10 +14,12 @@
|
||||
# Rebuilt only when `pnpm-lock.yaml` or this Dockerfile change
|
||||
# (see .woodpecker/ci-image.yml).
|
||||
#
|
||||
# Node version is intentionally pinned to 22 (Active LTS at time of writing).
|
||||
# The node:22 -> node:24 bump lands as a SEPARATE follow-up PR so the cache
|
||||
# change carries zero runtime-version variables.
|
||||
FROM node:22-alpine
|
||||
# Node version is pinned to 24 (Active LTS). This is the follow-up bump from
|
||||
# node:22 — sequenced AFTER the CI cache work landed so the runtime change
|
||||
# carries zero cache variables. node:26 stays held until it reaches LTS
|
||||
# (Oct 2026); the Current line risks native-module (node-gyp) breakage on a
|
||||
# runner that compiles better-sqlite3 / canvas / sharp / node-pty from source.
|
||||
FROM node:24-alpine
|
||||
|
||||
# Native toolchain required to compile node-gyp deps on musl, plus the
|
||||
# postgresql-client used by the test step's pg_isready readiness probe. `bash`
|
||||
|
||||
66
docs/scratchpads/h1-heartbeat-readiness.md
Normal file
66
docs/scratchpads/h1-heartbeat-readiness.md
Normal file
@@ -0,0 +1,66 @@
|
||||
# H1 — heartbeat readiness detection
|
||||
|
||||
## Objective
|
||||
|
||||
Add runtime-agnostic readiness classification to `mosaic fleet ps` so an agent can be reported as working/idle/stuck/stale/dead/unknown instead of treating pane liveness as progress.
|
||||
|
||||
## Scope
|
||||
|
||||
- `packages/mosaic/src/commands/fleet.ts`
|
||||
- exported readiness state/types/default thresholds/helpers/classifier
|
||||
- `AgentPsRow.readiness` additive JSON field
|
||||
- table HB column and IDLE/STUCK flags
|
||||
- `packages/mosaic/src/commands/fleet.spec.ts`
|
||||
- pure classifier branch/boundary coverage
|
||||
- threshold helper coverage
|
||||
- legitimate render/JSON assertion updates for new HB text
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
- Branches covered: dead, unknown, stale, busy working, null-idle working, stuck boundary, idle boundary, working below idle.
|
||||
- Threshold env helpers default to 300s/900s and honor positive integer env values.
|
||||
- `fleet ps` rows populate `readiness` for roster and unmanaged socket sessions.
|
||||
- Table HB text becomes `<age>s/<readiness>` when heartbeat age exists; remains `unknown` when absent.
|
||||
- Flags include `IDLE`/`STUCK` for matching readiness.
|
||||
- Local gates green: `pnpm typecheck`, `pnpm lint`, `pnpm format:check`, fleet vitest.
|
||||
- Pre-push queue guard passes; PR opened off `origin/main`; no merge by worker.
|
||||
|
||||
## Constraints / Assumptions
|
||||
|
||||
- Source branch: `origin/main` @ `e3adc6a`.
|
||||
- No scope creep beyond readiness detection.
|
||||
- `docs/TASKS.md` and `docs/fleet/TASKS.md` are orchestrator-owned; worker will not modify them.
|
||||
- PRD alignment source: `docs/fleet/PRD.md` Phase 2 observability; this is a refinement of heartbeat observability, preserving existing unknown/stale behavior.
|
||||
|
||||
## Plan
|
||||
|
||||
1. Install dependencies with requested PNPM environment.
|
||||
2. Add readiness types/helpers/classifier near heartbeat constants.
|
||||
3. Add `readiness` to `AgentPsRow` and populate both row paths.
|
||||
4. Update table render and flags.
|
||||
5. Add unit tests and update affected ps render/JSON assertions.
|
||||
6. Run build precheck + required gates.
|
||||
7. Run automated independent review, remediate findings.
|
||||
8. Queue guard, push, open PR.
|
||||
|
||||
## Progress
|
||||
|
||||
- 2026-06-24: Branch created from `origin/main` @ `e3adc6a`.
|
||||
- 2026-06-24: Implemented readiness thresholds/classifier, JSON row field, HB column label, and IDLE/STUCK flags.
|
||||
- 2026-06-24: Added classifier branch/boundary tests, threshold helper tests, JSON shape assertions, and readiness table rendering assertions.
|
||||
|
||||
## Verification Evidence
|
||||
|
||||
- `pnpm install --store-dir "$HOME/.pnpm-store"` — pass.
|
||||
- `npx turbo build --filter=@mosaicstack/mosaic^...` — pass, 12/12 tasks successful.
|
||||
- `pnpm typecheck` — pass, 41/41 tasks successful.
|
||||
- `pnpm lint` — pass, 23/23 tasks successful.
|
||||
- `pnpm format:check` — pass, all matched files use Prettier style.
|
||||
- `pnpm --filter @mosaicstack/mosaic exec vitest run src/commands/fleet.spec.ts` — pass, 171 tests.
|
||||
- `pnpm --filter @mosaicstack/mosaic test` — pass, 39 files / 547 tests; `fleet.spec.ts` 171 tests.
|
||||
- `~/.config/mosaic/tools/codex/codex-code-review.sh --uncommitted` — approve, 0 findings (reviewed supplied diff; sandbox file-inspection limitation noted by tool).
|
||||
|
||||
## Risks / Blockers
|
||||
|
||||
- No current blocker.
|
||||
- Review tool could not inspect repo files directly due to a sandbox wrapper limitation, but it reviewed the supplied diff and approved with no findings.
|
||||
@@ -28,6 +28,7 @@ export default tseslint.config(
|
||||
'apps/web/e2e/helpers/*.ts',
|
||||
'apps/web/playwright.config.ts',
|
||||
'apps/gateway/vitest.config.ts',
|
||||
'packages/db/vitest.config.ts',
|
||||
'packages/storage/vitest.config.ts',
|
||||
'packages/mosaic/__tests__/*.ts',
|
||||
'tools/federation-harness/*.ts',
|
||||
|
||||
@@ -4,5 +4,22 @@ export default defineConfig({
|
||||
test: {
|
||||
globals: true,
|
||||
environment: 'node',
|
||||
// The migration suite spins up a real PGlite (WASM Postgres) instance per
|
||||
// test and applies the full drizzle migration set. Each case legitimately
|
||||
// takes ~5s locally and considerably longer on CI, where turbo runs many
|
||||
// packages' test suites concurrently. The 5s vitest default then expires
|
||||
// mid-migration and the run fails as a phantom "Test timed out in 5000ms"
|
||||
// (often surfacing the underlying WASM `memory access out of bounds` when
|
||||
// the heap is starved). Give migrations real headroom.
|
||||
testTimeout: 120_000,
|
||||
hookTimeout: 120_000,
|
||||
// Each PGlite instance carries a multi-hundred-MB WASM heap. Running test
|
||||
// files in parallel forks multiplies that peak and is what tips the CI
|
||||
// runner into the WASM OOM. A single fork keeps only one instance resident
|
||||
// at a time — slightly slower, but deterministic.
|
||||
pool: 'forks',
|
||||
poolOptions: {
|
||||
forks: { singleFork: true },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
@@ -122,6 +122,85 @@ fi
|
||||
|
||||
mkdir -p "$MOSAIC_AGENT_WORKDIR"
|
||||
|
||||
# ── Pre-trust the workdir for the Claude runtime ─────────────────────────────
|
||||
# Claude Code shows a one-time "Is this a project you trust?" folder-trust gate
|
||||
# the first time it opens a directory. A fleet-launched agent has no human to
|
||||
# answer it, so the pane stalls forever at the prompt while its heartbeat keeps
|
||||
# reporting "healthy" (the pane process IS alive — it's just blocked).
|
||||
#
|
||||
# IMPORTANT: --dangerously-skip-permissions does NOT bypass this gate, and
|
||||
# neither does `trustedProjectDirectories` in settings.json (verified empirically
|
||||
# 2026-06-24). The ONLY thing the gate honors is the per-project record in
|
||||
# ~/.claude.json: projects["<dir>"].hasTrustDialogAccepted == true (exactly what
|
||||
# answering the prompt writes). So we pre-seed that record here.
|
||||
#
|
||||
# Idempotent, atomic, best-effort: any failure is non-fatal (the agent still
|
||||
# launches — worst case it stalls on the gate, i.e. the pre-fix status quo).
|
||||
# Only the claude runtime needs this; codex/pi have no such gate.
|
||||
# _ensure_claude_workdir_trusted WORKDIR
#
# Records WORKDIR as trusted for Claude Code by setting
# projects["<resolved dir>"].hasTrustDialogAccepted = true in the claude JSON
# state file, so a fleet-launched agent does not block on the folder-trust
# prompt.
#
# State file resolution (first match wins):
#   $MOSAIC_CLAUDE_JSON, $CLAUDE_CONFIG_DIR/.claude.json, $HOME/.claude.json
#
# Returns:
#   0  record present (already trusted, or written now)
#   1  python3 missing, or the seeding helper crashed unexpectedly
#   2  existing state file is unreadable / not the expected shape (left untouched)
#   3  the replacement file could not be written
#
# Best-effort by design: callers are expected to treat non-zero as a warning.
_ensure_claude_workdir_trusted() {
  local workdir="$1"

  # The path claude keys on is the resolved cwd it is launched in; fall back to
  # the literal argument when the directory cannot be entered.
  local rp
  rp=$(cd "$workdir" 2>/dev/null && pwd -P) || rp="$workdir"

  # ~/.claude.json lives next to the claude config dir; honor CLAUDE_CONFIG_DIR.
  local claude_json="${MOSAIC_CLAUDE_JSON:-${CLAUDE_CONFIG_DIR:+$CLAUDE_CONFIG_DIR/.claude.json}}"
  claude_json="${claude_json:-$HOME/.claude.json}"

  if ! command -v python3 >/dev/null 2>&1; then
    echo "WARNING: python3 not found; cannot pre-trust '$rp' for claude (agent may stall on the folder-trust gate)" >&2
    return 1
  fi

  local lock="${claude_json}.mosaic-lock"

  # Read-modify-write of the state file. Never writes when the existing file
  # cannot be parsed, and replaces the file atomically via a sibling temp file.
  _seed() {
    MOSAIC_CJ="$claude_json" MOSAIC_TRUST_DIR="$rp" python3 - <<'PY'
import json, os, sys, tempfile

cj = os.environ["MOSAIC_CJ"]
d = os.environ["MOSAIC_TRUST_DIR"]

try:
    if os.path.exists(cj):
        # Context manager so the handle is closed before we replace the file.
        with open(cj) as fh:
            data = json.load(fh)
    else:
        data = {}
    if not isinstance(data, dict):
        data = {}
except Exception:
    # Never corrupt an unreadable/partial file — bail without writing.
    sys.exit(2)

projects = data.setdefault("projects", {})
if not isinstance(projects, dict):
    # Unexpected shape: refuse to guess rather than overwrite or crash.
    sys.exit(2)

entry = projects.get(d)
if not isinstance(entry, dict):
    entry = {}
    projects[d] = entry
if entry.get("hasTrustDialogAccepted") is True:
    sys.exit(0)  # already trusted — nothing to do
entry["hasTrustDialogAccepted"] = True

tmp = None
try:
    # Temp file in the same directory so os.replace stays on one filesystem.
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(cj) or ".", prefix=".claude.json.mosaic.")
    with os.fdopen(fd, "w") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp, cj)  # atomic
except Exception:
    if tmp is not None:
        try:
            os.unlink(tmp)
        except OSError:
            pass
    sys.exit(3)
PY
  }

  # Serialize concurrent agent launches that share the state file (flock if
  # available and the lock file can be opened). Exit code 97 is reserved for
  # "could not take the lock" so that a genuine seeding failure under the lock
  # is returned as-is instead of being retried WITHOUT the lock, which could
  # clobber another launcher's update.
  local rc=0
  if command -v flock >/dev/null 2>&1 && ( exec 9>"$lock" ) 2>/dev/null; then
    ( flock 9 || exit 97; _seed ) 9>"$lock" 2>/dev/null || rc=$?
    if [ "$rc" -ne 97 ]; then
      return "$rc"
    fi
    rc=0
  fi

  # No usable lock: seed unlocked (best-effort, same as having no flock at all).
  _seed || rc=$?
  return "$rc"
}
|
||||
|
||||
case "$MOSAIC_AGENT_RUNTIME" in
|
||||
claude)
|
||||
_ensure_claude_workdir_trusted "$MOSAIC_AGENT_WORKDIR" \
|
||||
|| echo "WARNING: could not pre-trust workdir for claude agent $AGENT_NAME" >&2
|
||||
;;
|
||||
esac
|
||||
|
||||
# ── Launch the tmux session (no exec — we continue to wire the heartbeat) ────
|
||||
_tmux new-session -d -s "$AGENT_NAME" -c "$MOSAIC_AGENT_WORKDIR" \
|
||||
bash -c "$PANE_SHELL_SNIPPET"
|
||||
|
||||
@@ -128,8 +128,8 @@ PY
|
||||
merge_gitea_with_api() {
|
||||
local host="$1" api_url token basic_auth body_file raw_code payload
|
||||
api_url="https://${host}/api/v1/repos/${OWNER}/${REPO}/pulls/${PR_NUMBER}/merge"
|
||||
mkdir -p "${AGENT_WORK_ROOT:-/home/hermes/agent-work}"
|
||||
body_file=$(mktemp "${AGENT_WORK_ROOT:-/home/hermes/agent-work}/pr-merge-api-response.XXXXXX")
|
||||
mkdir -p "${AGENT_WORK_ROOT:-${HOME:-/tmp}/mosaic/agent-work}"
|
||||
body_file=$(mktemp "${AGENT_WORK_ROOT:-${HOME:-/tmp}/mosaic/agent-work}/pr-merge-api-response.XXXXXX")
|
||||
payload='{"Do":"squash"}'
|
||||
|
||||
token=$(get_gitea_token "$host" || true)
|
||||
@@ -214,8 +214,8 @@ case "$PLATFORM" in
|
||||
TEA_LOGIN="$(get_gitea_login_for_host "$HOST" || true)"
|
||||
|
||||
if [[ -n "$TEA_LOGIN" ]]; then
|
||||
mkdir -p "${AGENT_WORK_ROOT:-/home/hermes/agent-work}"
|
||||
TEA_ERROR_FILE=$(mktemp "${AGENT_WORK_ROOT:-/home/hermes/agent-work}/pr-merge-tea-error.XXXXXX")
|
||||
mkdir -p "${AGENT_WORK_ROOT:-${HOME:-/tmp}/mosaic/agent-work}"
|
||||
TEA_ERROR_FILE=$(mktemp "${AGENT_WORK_ROOT:-${HOME:-/tmp}/mosaic/agent-work}/pr-merge-tea-error.XXXXXX")
|
||||
if tea pr merge "$PR_NUMBER" --style squash --repo "$OWNER/$REPO" --login "$TEA_LOGIN" 2> "$TEA_ERROR_FILE"; then
|
||||
rm -f "$TEA_ERROR_FILE"
|
||||
elif is_known_tea_empty_identity_failure "$TEA_ERROR_FILE"; then
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
WORK_ROOT="${AGENT_WORK_ROOT:-/home/hermes/agent-work}"
|
||||
WORK_ROOT="${AGENT_WORK_ROOT:-${HOME:-/tmp}/mosaic/agent-work}"
|
||||
SANDBOX="$WORK_ROOT/pr-merge-empty-uid-test-$$"
|
||||
MOCK_BIN="$SANDBOX/bin"
|
||||
REPO_DIR="$SANDBOX/repo"
|
||||
|
||||
@@ -12,6 +12,10 @@
|
||||
# ambiguity about lanes or origin. Recipients replying should FLIP the
|
||||
# preamble: [<dst> -> <src>] ... (this tool sends; it does not auto-reply).
|
||||
#
|
||||
# Optionally tags the message with a TRIAGE CLASS (see -C / --class) so a
|
||||
# comms daemon can route it (deliver-to-agent vs log-and-drop) from an exact
|
||||
# field instead of re-deriving intent from the body.
|
||||
#
|
||||
# WHY A WRAPPER
|
||||
# Reliable submission into an interactive REPL (Claude Code / Codex) is fiddly:
|
||||
# a trailing Enter is often swallowed and the message sits as an unsubmitted
|
||||
@@ -26,6 +30,7 @@
|
||||
# agent-send.sh [-L socket] -s <dst_session> -m "message" # local target
|
||||
# agent-send.sh [-L socket] -H user@host -s <dst_session> -m "message" # remote target
|
||||
# agent-send.sh [-L socket] -H user@host -n <dst_hostname> -s <sess> -f msg.txt
|
||||
# agent-send.sh -s mos-claude --class terminal-log -m "ACK — received"
|
||||
# echo "msg" | agent-send.sh [-L socket] -H user@host -s <dst_session>
|
||||
#
|
||||
# OPTIONS
|
||||
@@ -36,27 +41,61 @@
|
||||
# Default: local hostname, or (remote) resolved via one ssh.
|
||||
# -m MESSAGE message text (single- or multi-line)
|
||||
# -f FILE read message from FILE instead of -m
|
||||
# -C CLASS triage class for a comms daemon. One of:
|
||||
# terminal-log log-only; never needs the agent's attention
|
||||
# actionable carries a decision/blocker/gate — deliver
|
||||
# human from a human operator — deliver
|
||||
# reaction an emoji/ack reaction
|
||||
# Long form: --class CLASS (or --class=CLASS). When SET, the
|
||||
# preamble carries a ` class=<CLASS>` token INSIDE the bracket:
|
||||
# [<src> -> <dst> class=terminal-log] <message>
|
||||
# When OMITTED, NO token is emitted and the preamble is
|
||||
# byte-for-byte identical to the classic format. Consumers MUST
|
||||
# treat an absent class as 'actionable' (fail-safe: agent sees it).
|
||||
# -S SRC_LABEL override source label "<host>:<session>" (default: auto)
|
||||
# -r N Enter-flush attempts passed through (default 2)
|
||||
# -v verbose: print pane tail after delivery
|
||||
# -h help
|
||||
#
|
||||
# PREAMBLE GRAMMAR (for consumers / daemons mirroring this producer)
|
||||
# ^\[(\S+) -> (\S+?)(?: class=(terminal-log|actionable|human|reaction))?\] (.*)$
|
||||
# group 1 = src label group 2 = dst host:session
|
||||
# group 3 = class (absent => actionable) group 4 = message body
|
||||
#
|
||||
# EXIT CODES (passed through from send-message.sh)
|
||||
# 0 delivered/queued · 1 target not found · 2 still draft · 3 usage error
|
||||
set -uo pipefail
|
||||
|
||||
SELF_DIR=$(cd -- "$(dirname -- "$0")" && pwd)
|
||||
SENDER="$SELF_DIR/send-message.sh"
|
||||
# Sender is overridable via env purely for testing (inject a capture stub). The
|
||||
# default is the canonical send-message.sh beside this script; production callers
|
||||
# never set AGENT_SEND_SENDER, so behavior is unchanged.
|
||||
SENDER="${AGENT_SEND_SENDER:-$SELF_DIR/send-message.sh}"
|
||||
|
||||
# Translate the long option --class[=value] into "-C value" so getopts (which is
|
||||
# short-option-only) can parse it. Every other argument passes through untouched,
|
||||
# so callers that never use --class hit the exact original getopts path.
|
||||
args=()
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--class) [ $# -ge 2 ] || { echo "ERROR: --class requires a value" >&2; exit 3; }
|
||||
args+=(-C "$2"); shift 2 ;;
|
||||
--class=*) args+=(-C "${1#*=}"); shift ;;
|
||||
*) args+=("$1"); shift ;;
|
||||
esac
|
||||
done
|
||||
set -- ${args[@]+"${args[@]}"}
|
||||
|
||||
DST_SESSION=""; SSH_TARGET=""; DST_HOST=""; MSG=""; FILE=""; SOCKET_NAME=""
|
||||
SRC_LABEL=""; RETRIES=2; VERBOSE=0
|
||||
usage() { sed -n '2,44p' "$0"; exit "${1:-3}"; }
|
||||
SRC_LABEL=""; RETRIES=2; VERBOSE=0; CLASS=""
|
||||
usage() { sed -n '2,/^set -uo pipefail/{/^set -uo pipefail/d;p}' "$0"; exit "${1:-3}"; }
|
||||
|
||||
while getopts "L:s:H:n:m:f:S:r:vh" o; do
|
||||
while getopts "L:s:H:n:m:f:S:r:C:vh" o; do
|
||||
case "$o" in
|
||||
L) SOCKET_NAME=$OPTARG ;;
|
||||
s) DST_SESSION=$OPTARG ;; H) SSH_TARGET=$OPTARG ;; n) DST_HOST=$OPTARG ;;
|
||||
m) MSG=$OPTARG ;; f) FILE=$OPTARG ;; S) SRC_LABEL=$OPTARG ;;
|
||||
C) CLASS=$OPTARG ;;
|
||||
r) RETRIES=$OPTARG ;; v) VERBOSE=1 ;; h) usage 0 ;; *) usage 3 ;;
|
||||
esac
|
||||
done
|
||||
@@ -64,6 +103,17 @@ done
|
||||
[ -n "$DST_SESSION" ] || { echo "ERROR: -s DST_SESSION is required" >&2; usage 3; }
|
||||
[ -x "$SENDER" ] || { echo "ERROR: send-message.sh not found beside this script" >&2; exit 3; }
|
||||
|
||||
# Validate the triage class only when one was given. An absent class emits NO
|
||||
# token (preamble byte-identical to the classic format); the consumer defaults
|
||||
# absent => actionable.
|
||||
CLASS_TOKEN=""
|
||||
if [ -n "$CLASS" ]; then
|
||||
case "$CLASS" in
|
||||
terminal-log|actionable|human|reaction) CLASS_TOKEN=" class=${CLASS}" ;;
|
||||
*) echo "ERROR: invalid --class '$CLASS' (allowed: terminal-log, actionable, human, reaction)" >&2; exit 3 ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Message body from -f / -m / stdin.
|
||||
if [ -n "$FILE" ]; then [ -r "$FILE" ] || { echo "ERROR: cannot read $FILE" >&2; exit 3; }; MSG=$(cat -- "$FILE")
|
||||
elif [ -z "$MSG" ] && [ ! -t 0 ]; then MSG=$(cat)
|
||||
@@ -90,7 +140,7 @@ if [ -z "$DST_HOST" ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
PREAMBLE="[${SRC_LABEL} -> ${DST_HOST}:${DST_SESSION}]"
|
||||
PREAMBLE="[${SRC_LABEL} -> ${DST_HOST}:${DST_SESSION}${CLASS_TOKEN}]"
|
||||
FULL="${PREAMBLE} ${MSG}"
|
||||
B64=$(printf '%s' "$FULL" | base64 -w0)
|
||||
|
||||
|
||||
97
packages/mosaic/framework/tools/tmux/agent-send.test.sh
Executable file
97
packages/mosaic/framework/tools/tmux/agent-send.test.sh
Executable file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env bash
# agent-send.test.sh — regression + grammar lock for agent-send.sh --class.
#
# How it works: AGENT_SEND_SENDER points the tool at a capture stub that decodes
# the -b base64 argument and prints the complete message (preamble + body), so
# each case can compare the exact bytes that would be sent. Only the local path
# is exercised (no ssh); -n fixes the destination host so the preamble does not
# depend on the machine running the test.
#
# Cases:
#   1. no --class        => preamble identical to the classic format
#   2. --class <c>       => ` class=<c>` token inside the bracket
#   3. --class=<c>       => same result as the space form
#   4. -C <c>            => same result as the long form
#   5. invalid class     => exit 3, nothing printed by the stub
#   6. --class, no value => exit 3
#   7. the consumer regexes below match the output for every class and for the
#      classic (no-class) line
set -uo pipefail

HERE=$(cd -- "$(dirname -- "$0")" && pwd)
TOOL="$HERE/agent-send.sh"

# Capture stub standing in for send-message.sh; removed on exit.
STUB=$(mktemp)
trap 'rm -f "$STUB"' EXIT
cat >"$STUB" <<'STUB_EOF'
#!/usr/bin/env bash
set -uo pipefail
b64=""
while getopts "t:b:r:v" o; do case "$o" in b) b64=$OPTARG ;; *) : ;; esac; done
printf '%s' "$b64" | base64 -d
STUB_EOF
chmod +x "$STUB"

PASS=0
FAIL=0

# ok LABEL — record and report a passing case.
ok() {
  PASS=$((PASS+1))
  printf 'ok %s\n' "$1"
}

# no LABEL DETAIL — record and report a failing case.
no() {
  FAIL=$((FAIL+1))
  printf 'FAIL %s\n %s\n' "$1" "$2"
}

# run ARGS... — invoke the tool with the stub injected and a fixed src/dst host.
run() {
  AGENT_SEND_SENDER="$STUB" bash "$TOOL" -S a:src -n dsthost "$@"
}

# expect_payload LABEL WANT ARGS... — run the tool and compare stdout to WANT.
expect_payload() {
  local label=$1 expected=$2 actual
  shift 2
  actual=$(run "$@")
  if [ "$actual" = "$expected" ]; then
    ok "$label"
  else
    no "$label" "got=[$actual] want=[$expected]"
  fi
}

# Consumer-side patterns used by case 7.
GRAMMAR='^\[(\S+) -> (\S+) class=(terminal-log|actionable|human|reaction)\] (.*)$'
GRAMMAR_NOCLASS='^\[(\S+) -> (\S+)\] (.*)$'

# 1. Classic preamble when no class is given.
expect_payload "regression: no --class is byte-identical" \
  '[a:src -> dsthost:mos] hello world' \
  -s mos -m "hello world"

# 2. Long option, space-separated value.
expect_payload "--class terminal-log emits token" \
  '[a:src -> dsthost:mos class=terminal-log] ACK' \
  -s mos --class terminal-log -m "ACK"

# 3. Long option, equals form.
expect_payload "--class=actionable (equals form)" \
  '[a:src -> dsthost:mos class=actionable] decide X' \
  -s mos --class=actionable -m "decide X"

# 4. Short option.
expect_payload "-C human (short form)" \
  '[a:src -> dsthost:mos class=human] from a person' \
  -s mos -C human -m "from a person"

# 5. Unknown class must fail with exit 3 and produce no payload.
out=$(run -s mos --class bogus -m "x" 2>/dev/null)
rc=$?
if [ "$rc" -eq 0 ]; then
  no "invalid class rejected" "expected non-zero exit, got 0 (out=[$out])"
elif [ "$rc" = 3 ] && [ -z "$out" ]; then
  ok "invalid class => exit 3, nothing sent"
else
  no "invalid class => exit 3, nothing sent" "rc=$rc out=[$out]"
fi

# 6. --class as the last argument (no value) must fail with exit 3.
run -s mos -m "x" --class 2>/dev/null
rc=$?
case "$rc" in
  0) no "--class with no value rejected" "expected non-zero exit, got 0" ;;
  3) ok "--class with no value => exit 3" ;;
  *) no "--class with no value => exit 3" "wrong rc" ;;
esac

# 7. Both patterns must parse what the tool emits.
for c in terminal-log actionable human reaction; do
  line=$(run -s mos --class "$c" -m "body $c")
  if [[ "$line" =~ $GRAMMAR ]] && [ "${BASH_REMATCH[3]}" = "$c" ] && [ "${BASH_REMATCH[4]}" = "body $c" ]; then
    ok "grammar parses class=$c"
  else
    no "grammar parses class=$c" "line=[$line]"
  fi
done

classic=$(run -s mos -m "plain body")
if [[ "$classic" =~ $GRAMMAR_NOCLASS ]] && [ "${BASH_REMATCH[3]}" = "plain body" ]; then
  ok "grammar (no-class) parses classic line"
else
  no "grammar (no-class) parses classic line" "line=[$classic]"
fi

echo "---"
echo "PASS=$PASS FAIL=$FAIL"
[ "$FAIL" -eq 0 ]
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@mosaicstack/mosaic",
|
||||
"version": "0.0.40",
|
||||
"version": "0.0.43",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://git.mosaicstack.dev/mosaicstack/stack.git",
|
||||
|
||||
@@ -30,6 +30,7 @@ import {
|
||||
refreshActiveFleetUnits,
|
||||
readRosterAgentNames,
|
||||
buildRelaunchCommands,
|
||||
checkFrameworkDrift,
|
||||
FRAMEWORK_RESEED_PACKAGE,
|
||||
} from './runtime/update-checker.js';
|
||||
import { runWizard } from './wizard.js';
|
||||
@@ -418,6 +419,48 @@ program
|
||||
// checkForAllUpdates imported statically above
|
||||
const { execSync } = await import('node:child_process');
|
||||
|
||||
// Re-seed the framework from the freshly-installed package, propagate shipped
|
||||
// systemd unit fixes to the active units, and (opt-in) relaunch durable
|
||||
// agents. Shared by the "packages updated" and the "framework drift" paths.
|
||||
/**
 * Re-seed the framework files, refresh the active systemd units, and either
 * relaunch the roster agents (`opts.relaunch`) or tell the operator how to.
 *
 * @param reason Message printed before the re-seed starts, explaining why it
 *   is happening.
 */
const reseedFramework = (reason: string): void => {
  console.log(reason);

  const reseed = runFrameworkReseed();
  if (!reseed.ok) {
    console.error(
      `\n⚠ Framework re-seed skipped: ${reseed.reason ?? 'unknown'}.\n` +
        ' Activate manually: bash "$(npm root -g)/@mosaicstack/mosaic/framework/install.sh" ' +
        '(MOSAIC_SYNC_ONLY=1 MOSAIC_INSTALL_MODE=keep)',
    );
    return;
  }
  console.log('✔ Framework re-seeded.');

  // Propagate shipped systemd unit fixes to the ACTIVE units (re-seed only
  // touches ~/.config/mosaic/systemd/user; systemd runs ~/.config/systemd/user).
  const units = refreshActiveFleetUnits();
  if (units.refreshed.length > 0) {
    console.log(`✔ Refreshed ${units.refreshed.length} active systemd unit(s).`);
  }

  const agents = readRosterAgentNames();
  if (agents.length === 0) return;

  if (!opts.relaunch) {
    console.log(
      `\nℹ ${agents.length} fleet agent(s) are still running the previous runtime. ` +
        'Restart them to activate the update:\n mosaic update --relaunch ' +
        '(or: mosaic fleet restart <agent>)',
    );
    return;
  }

  console.log(`\nRelaunching ${agents.length} fleet agent(s) to pick up the new runtime…`);
  const commands = buildRelaunchCommands(agents);
  let failures = 0;
  for (const restart of commands) {
    const command = restart.join(' ');
    try {
      execSync(command, { stdio: 'inherit', timeout: 30_000 });
    } catch {
      // Keep going: one agent failing to restart must not block the others.
      failures += 1;
      console.error(` ⚠ failed to restart agent — run: ${command}`);
    }
  }

  // Only claim success when every restart command actually succeeded.
  if (failures === 0) {
    console.log('✔ Agents relaunched.');
  } else {
    console.error(`⚠ ${failures} of ${commands.length} agent restart command(s) failed.`);
  }
};
|
||||
|
||||
console.log('Checking for updates…');
|
||||
const results = checkForAllUpdates({ skipCache: true });
|
||||
|
||||
@@ -432,6 +475,18 @@ program
|
||||
process.exit(1);
|
||||
}
|
||||
console.log('\n✔ All packages up to date.');
|
||||
// #642: the CLI may have been upgraded outside `mosaic update` (e.g. a
|
||||
// direct `npm i -g`), leaving the framework files stale even though no
|
||||
// package is reported outdated. Detect that via the framework version and
|
||||
// re-seed so shipped launcher/runtime fixes still activate.
|
||||
const drift = checkFrameworkDrift();
|
||||
if (drift.drifted && opts.reseed !== false) {
|
||||
reseedFramework(
|
||||
`\nFramework drift detected (on-disk v${drift.installed} < bundled v${drift.bundled}) — ` +
|
||||
'the CLI was updated outside `mosaic update`. Re-seeding framework files into ' +
|
||||
'~/.config/mosaic (data-safe; keeps your edits)…',
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -456,52 +511,17 @@ program
|
||||
// F3-m3 / R13: the CLI is updated, but the framework files in
|
||||
// ~/.config/mosaic/ are still the previous version. Re-seed them from the
|
||||
// freshly-installed package so shipped launcher/runtime changes ACTIVATE.
|
||||
// Only when the framework-bearing package itself updated.
|
||||
// Re-seed when the framework-bearing package itself updated OR the on-disk
|
||||
// framework is older than the freshly-installed one (#642 — e.g. only
|
||||
// sibling packages were outdated but the CLI was already ahead).
|
||||
const mosaicUpdated = outdated.some(
|
||||
(r: { package: string }) => r.package === FRAMEWORK_RESEED_PACKAGE,
|
||||
);
|
||||
if (mosaicUpdated && opts.reseed !== false) {
|
||||
console.log(
|
||||
const drift = checkFrameworkDrift();
|
||||
if ((mosaicUpdated || drift.drifted) && opts.reseed !== false) {
|
||||
reseedFramework(
|
||||
'\nRe-seeding framework files into ~/.config/mosaic (data-safe; keeps your edits)…',
|
||||
);
|
||||
const reseed = runFrameworkReseed();
|
||||
if (reseed.ok) {
|
||||
console.log('✔ Framework re-seeded.');
|
||||
// Propagate shipped systemd unit fixes to the ACTIVE units (re-seed only
|
||||
// touches ~/.config/mosaic/systemd/user; systemd runs ~/.config/systemd/user).
|
||||
const units = refreshActiveFleetUnits();
|
||||
if (units.refreshed.length > 0) {
|
||||
console.log(`✔ Refreshed ${units.refreshed.length} active systemd unit(s).`);
|
||||
}
|
||||
const agents = readRosterAgentNames();
|
||||
if (agents.length > 0) {
|
||||
if (opts.relaunch) {
|
||||
console.log(
|
||||
`\nRelaunching ${agents.length} fleet agent(s) to pick up the new runtime…`,
|
||||
);
|
||||
for (const restart of buildRelaunchCommands(agents)) {
|
||||
try {
|
||||
execSync(restart.join(' '), { stdio: 'inherit', timeout: 30_000 });
|
||||
} catch {
|
||||
console.error(` ⚠ failed to restart agent — run: ${restart.join(' ')}`);
|
||||
}
|
||||
}
|
||||
console.log('✔ Agents relaunched.');
|
||||
} else {
|
||||
console.log(
|
||||
`\nℹ ${agents.length} fleet agent(s) are still running the previous runtime. ` +
|
||||
'Restart them to activate the update:\n mosaic update --relaunch ' +
|
||||
'(or: mosaic fleet restart <agent>)',
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
console.error(
|
||||
`\n⚠ Framework re-seed skipped: ${reseed.reason ?? 'unknown'}.\n` +
|
||||
' Activate manually: bash "$(npm root -g)/@mosaicstack/mosaic/framework/install.sh" ' +
|
||||
'(MOSAIC_SYNC_ONLY=1 MOSAIC_INSTALL_MODE=keep)',
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
@@ -19,17 +19,21 @@ import {
|
||||
buildSystemdShowCommand,
|
||||
buildTmuxListPanesCommand,
|
||||
buildTmuxListSessionsCommand,
|
||||
classifyReadiness,
|
||||
classifySendResult,
|
||||
countOrchestrators,
|
||||
countEnhancers,
|
||||
detectDrift,
|
||||
enableFleetUnits,
|
||||
FLEET_PROFILES,
|
||||
HEARTBEAT_IDLE_THRESHOLD_SECONDS,
|
||||
HEARTBEAT_STUCK_THRESHOLD_SECONDS,
|
||||
generateAgentEnv,
|
||||
getDefaultOperatorSourceLabel,
|
||||
getDefaultTenantAndHost,
|
||||
getRosterAgent,
|
||||
heartbeatPath,
|
||||
idleThresholdSeconds,
|
||||
isSendAccepted,
|
||||
loadFleetRoster,
|
||||
mergeAgentEnv,
|
||||
@@ -44,6 +48,7 @@ import {
|
||||
resolvePresetFilename,
|
||||
RUNTIME_ACCEPTABLE_COMMANDS,
|
||||
serializeRosterToYaml,
|
||||
stuckThresholdSeconds,
|
||||
VERIFY_DEFAULT_TIMEOUT_MS,
|
||||
VERIFY_POLL_INTERVAL_MS,
|
||||
type AgentPsRow,
|
||||
@@ -933,6 +938,127 @@ describe('fleet ps — heartbeat parsing', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('fleet ps — readiness thresholds', () => {
  // Values present when the suite is defined; put back after every test so
  // overrides made by one case never reach another.
  const originalEnv = {
    MOSAIC_HEARTBEAT_IDLE_THRESHOLD: process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD,
    MOSAIC_HEARTBEAT_STUCK_THRESHOLD: process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD,
  };

  afterEach(() => {
    for (const [key, value] of Object.entries(originalEnv)) {
      if (value === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = value;
      }
    }
  });

  it('uses default readiness thresholds when env is unset', () => {
    delete process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD;
    delete process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD;

    const idle = idleThresholdSeconds();
    const stuck = stuckThresholdSeconds();

    expect(idle).toBe(HEARTBEAT_IDLE_THRESHOLD_SECONDS);
    expect(stuck).toBe(HEARTBEAT_STUCK_THRESHOLD_SECONDS);
  });

  it('honors positive integer readiness thresholds from env', () => {
    process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = '120';
    process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = '480';

    const idle = idleThresholdSeconds();
    const stuck = stuckThresholdSeconds();

    expect(idle).toBe(120);
    expect(stuck).toBe(480);
  });

  it('falls back to defaults for invalid readiness thresholds', () => {
    // Zero and non-numeric text are both rejected in favour of the defaults.
    process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = '0';
    process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = 'not-a-number';

    const idle = idleThresholdSeconds();
    const stuck = stuckThresholdSeconds();

    expect(idle).toBe(HEARTBEAT_IDLE_THRESHOLD_SECONDS);
    expect(stuck).toBe(HEARTBEAT_STUCK_THRESHOLD_SECONDS);
  });
});
|
||||
|
||||
describe('fleet ps — readiness classification', () => {
  type ClassifyArgs = Parameters<typeof classifyReadiness>;

  /** One classification scenario: the signals fed in and the state expected out. */
  interface Scenario {
    title: string;
    signals: ClassifyArgs[0];
    /** Overrides the suite-wide thresholds when present. */
    limits?: ClassifyArgs[1];
    expected: ReturnType<typeof classifyReadiness>;
  }

  const thresholds = { idleThresholdSeconds: 300, stuckThresholdSeconds: 900 };

  const scenarios: Scenario[] = [
    {
      title: 'reports dead when the pane is not alive',
      signals: { paneAlive: false, hbHealth: 'healthy', hbStatus: 'busy', idleSeconds: 0 },
      expected: 'dead',
    },
    {
      title: 'reports unknown when heartbeat health is unknown',
      signals: { paneAlive: true, hbHealth: 'unknown', hbStatus: null, idleSeconds: 0 },
      expected: 'unknown',
    },
    {
      title: 'reports stale when heartbeat health is stale',
      signals: { paneAlive: true, hbHealth: 'stale', hbStatus: 'busy', idleSeconds: 1_000 },
      expected: 'stale',
    },
    {
      title: 'reports working when heartbeat status is busy, even past stuck threshold',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'busy', idleSeconds: 2_000 },
      expected: 'working',
    },
    {
      title: 'reports working when pane idle seconds are unavailable',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: null },
      expected: 'working',
    },
    {
      title: 'reports stuck at the stuck threshold boundary',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 900 },
      expected: 'stuck',
    },
    {
      title: 'reports idle at the idle threshold boundary',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 300 },
      expected: 'idle',
    },
    {
      title: 'reports working below the idle threshold',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 299 },
      expected: 'working',
    },
    {
      // 350s is past the (smaller) stuck limit but below the idle limit.
      title: 'checks stuck before idle when thresholds are inverted',
      signals: { paneAlive: true, hbHealth: 'healthy', hbStatus: 'ok', idleSeconds: 350 },
      limits: { idleThresholdSeconds: 900, stuckThresholdSeconds: 300 },
      expected: 'stuck',
    },
  ];

  for (const { title, signals, limits, expected } of scenarios) {
    it(title, () => {
      expect(classifyReadiness(signals, limits ?? thresholds)).toBe(expected);
    });
  }
});
|
||||
|
||||
describe('fleet ps — systemd show parsing', () => {
|
||||
it('parses ActiveState, SubState, UnitFileState from systemctl show output', () => {
|
||||
const output = 'ActiveState=active\nSubState=running\nUnitFileState=enabled\n';
|
||||
@@ -1324,8 +1450,9 @@ describe('fleet ps — JSON output shape (FR-6)', () => {
|
||||
// boot-enable warning: active + disabled
|
||||
expect(row.bootEnableWarning).toBe(true);
|
||||
|
||||
// heartbeat missing → unknown
|
||||
// heartbeat missing → unknown readiness preserves existing display semantics
|
||||
expect(row.heartbeat.health).toBe('unknown');
|
||||
expect(row.readiness).toBe('unknown');
|
||||
|
||||
expect(row.name).toBe('canary-pi');
|
||||
expect(row.runtime).toBe('pi');
|
||||
@@ -1387,6 +1514,92 @@ describe('fleet ps — command sequences issued', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('fleet ps — readiness table output', () => {
  it('renders readiness in HB column and flags idle/stuck rows', async () => {
    const home = await mkdtemp(join(tmpdir(), 'mosaic-fleet-'));
    const rosterPath = join(home, 'fleet', 'roster.yaml');
    const runDir = join(home, 'fleet', 'run');

    // Fixed clock: pane activity 10s / 40s in the past, heartbeats 1s old.
    // With thresholds of 5s (idle) and 30s (stuck) that yields one idle and
    // one stuck agent.
    const nowMs = 1_700_000_000_000;
    const idleActivityEpoch = Math.floor((nowMs - 10_000) / 1000);
    const stuckActivityEpoch = Math.floor((nowMs - 40_000) / 1000);
    const hbTs = new Date(nowMs - 1_000).toISOString();

    // Fake command runner: answers tmux/systemctl queries from canned output.
    const runner: CommandRunner = async (command, args) => {
      const full = [command, ...args].join(' ');
      if (full.includes('list-sessions')) {
        return { stdout: 'idle-agent\nstuck-agent\n', stderr: '', exitCode: 0 };
      }
      if (full.includes('=idle-agent:0.0')) {
        return { stdout: `111 pi 0 ${idleActivityEpoch}\n`, stderr: '', exitCode: 0 };
      }
      if (full.includes('=stuck-agent:0.0')) {
        return { stdout: `222 pi 0 ${stuckActivityEpoch}\n`, stderr: '', exitCode: 0 };
      }
      if (full.includes('systemctl') && full.includes('show')) {
        return {
          stdout: 'ActiveState=active\nSubState=running\nUnitFileState=enabled\n',
          stderr: '',
          exitCode: 0,
        };
      }
      return { stdout: '', stderr: '', exitCode: 0 };
    };

    const savedIdle = process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD;
    const savedStuck = process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD;
    const origLog = console.log;
    const lines: string[] = [];
    const dateNow = vi.spyOn(Date, 'now').mockReturnValue(nowMs);

    // Everything that touches global state (env, console.log) or the temp dir
    // happens inside the try so the finally block always undoes it, even when
    // fixture setup or command registration throws.
    try {
      await mkdir(runDir, { recursive: true });
      await writeFile(
        rosterPath,
        [
          'version: 1',
          'transport: tmux',
          'agents:',
          // `runtime` must be indented under its list item to stay part of
          // the same agent mapping.
          '  - name: idle-agent',
          '    runtime: pi',
          '  - name: stuck-agent',
          '    runtime: pi',
        ].join('\n'),
      );
      await writeFile(join(runDir, 'idle-agent.hb'), `ts=${hbTs}\npid=111\nstatus=ok\n`);
      await writeFile(join(runDir, 'stuck-agent.hb'), `ts=${hbTs}\npid=222\nstatus=ok\n`);

      process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = '5';
      process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = '30';

      console.log = (msg: string) => {
        lines.push(msg);
      };

      const program = new Command();
      program.exitOverride();
      registerFleetCommand(program, { runner, mosaicHome: home });

      await program.parseAsync(['node', 'mosaic', 'fleet', 'ps']);
    } finally {
      console.log = origLog;
      dateNow.mockRestore();
      if (savedIdle === undefined) delete process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD;
      else process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD = savedIdle;
      if (savedStuck === undefined) delete process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD;
      else process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD = savedStuck;
      await rm(home, { recursive: true, force: true });
    }

    const idleLine = lines.find((line) => line.includes('idle-agent'));
    const stuckLine = lines.find((line) => line.includes('stuck-agent'));

    // The HB column shows "<heartbeat age>/<readiness>"; idle/stuck also get a flag.
    expect(idleLine).toBeDefined();
    expect(idleLine).toContain('1s/idle');
    expect(idleLine).toMatch(/\bIDLE\b/);
    expect(stuckLine).toBeDefined();
    expect(stuckLine).toContain('1s/stuck');
    expect(stuckLine).toMatch(/\bSTUCK\b/);
  });
});
|
||||
|
||||
describe('buildTmuxListSessionsCommand', () => {
|
||||
it('builds exact list-sessions command with session_name format', () => {
|
||||
expect(buildTmuxListSessionsCommand('mosaic-fleet')).toEqual([
|
||||
@@ -1514,6 +1727,7 @@ describe('fleet ps — unmanaged socket sessions', () => {
|
||||
|
||||
// driftFlag must be false for unmanaged (no roster runtime to compare)
|
||||
expect(unmanagedRow.driftFlag).toBe(false);
|
||||
expect(unmanagedRow.readiness).toBe('unknown');
|
||||
});
|
||||
|
||||
it('shows UNMANAGED flag in table output for unmanaged sessions', async () => {
|
||||
|
||||
@@ -394,6 +394,8 @@ export function buildAgentTailCommand(agentName: string, lines: number, socketNa
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const HEARTBEAT_INTERVAL_MS = 15_000;
|
||||
export const HEARTBEAT_IDLE_THRESHOLD_SECONDS = 300;
|
||||
export const HEARTBEAT_STUCK_THRESHOLD_SECONDS = 900;
|
||||
|
||||
/**
|
||||
* Heartbeat interval in ms, honoring MOSAIC_HEARTBEAT_INTERVAL (seconds) so the
|
||||
@@ -404,8 +406,68 @@ export function heartbeatIntervalMs(): number {
|
||||
const sec = Number.parseInt(process.env.MOSAIC_HEARTBEAT_INTERVAL ?? '', 10);
|
||||
return Number.isFinite(sec) && sec > 0 ? sec * 1000 : HEARTBEAT_INTERVAL_MS;
|
||||
}
|
||||
|
||||
/**
 * Resolve the pane-idle threshold, in seconds.
 *
 * Reads MOSAIC_HEARTBEAT_IDLE_THRESHOLD and parses it as a base-10 integer.
 * When the variable is unset, empty, not numeric, zero or negative, the
 * built-in HEARTBEAT_IDLE_THRESHOLD_SECONDS is used instead.
 *
 * @returns The idle threshold in whole seconds (always > 0 when taken from env).
 */
export function idleThresholdSeconds(): number {
  const raw = process.env.MOSAIC_HEARTBEAT_IDLE_THRESHOLD ?? '';
  const parsed = Number.parseInt(raw, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return HEARTBEAT_IDLE_THRESHOLD_SECONDS;
  }
  return parsed;
}
|
||||
|
||||
/**
 * Resolve the stuck threshold, in seconds.
 *
 * MOSAIC_HEARTBEAT_STUCK_THRESHOLD wins when it parses (base 10) to a finite
 * integer greater than zero; otherwise HEARTBEAT_STUCK_THRESHOLD_SECONDS is
 * returned.
 *
 * @returns The stuck threshold in whole seconds.
 */
export function stuckThresholdSeconds(): number {
  const fromEnv = Number.parseInt(process.env.MOSAIC_HEARTBEAT_STUCK_THRESHOLD ?? '', 10);
  const usable = Number.isFinite(fromEnv) && fromEnv > 0;
  if (usable) {
    return fromEnv;
  }
  return HEARTBEAT_STUCK_THRESHOLD_SECONDS;
}
|
||||
export const HEARTBEAT_HEALTHY_MULTIPLIER = 3;
|
||||
|
||||
/** Progress verdict produced by {@link classifyReadiness}. */
export type ReadinessState = 'working' | 'idle' | 'stuck' | 'stale' | 'dead' | 'unknown';

/** Already-parsed inputs that {@link classifyReadiness} bases its verdict on. */
export interface ReadinessSignals {
  /** Anything other than `true` classifies the agent as `dead`. */
  paneAlive: boolean;
  /** Heartbeat freshness; `unknown` and `stale` are passed straight through. */
  hbHealth: 'healthy' | 'stale' | 'unknown';
  /** Status reported by the heartbeat; `busy` always means `working`. */
  hbStatus: 'ok' | 'busy' | null;
  /** Seconds since the last pane activity, or `null` when not available. */
  idleSeconds: number | null;
}

/** Idle/stuck cut-offs (in seconds) compared against `idleSeconds`. */
export interface ReadinessThresholds {
  idleThresholdSeconds: number;
  stuckThresholdSeconds: number;
}

/**
 * Decide whether an agent is making progress, using only signals the caller
 * has already collected — nothing is probed here and no error escapes.
 *
 * Precedence, first match wins:
 *   1. pane not alive (or no signals at all)      → `dead`
 *   2. heartbeat health missing / `unknown`        → `unknown`
 *   3. heartbeat health `stale`                    → `stale`
 *   4. heartbeat status `busy`                     → `working`
 *   5. idle time missing or not a finite number    → `working`
 *   6. idle time >= stuck threshold                → `stuck`
 *   7. idle time >= idle threshold                 → `idle`
 *   8. otherwise                                   → `working`
 *
 * @param signals    Parsed heartbeat/pane signals; may be partial or absent.
 * @param thresholds Cut-offs in seconds. Any value that is not a finite number
 *                   is replaced by `idleThresholdSeconds()` /
 *                   `stuckThresholdSeconds()` respectively.
 * @returns The readiness state; `unknown` if anything throws unexpectedly.
 */
export function classifyReadiness(
  signals: Partial<ReadinessSignals> | null | undefined,
  thresholds: Partial<ReadinessThresholds> | null | undefined = {},
): ReadinessState {
  try {
    if (signals?.paneAlive !== true) return 'dead';

    switch (signals.hbHealth) {
      case undefined:
      case 'unknown':
        return 'unknown';
      case 'stale':
        return 'stale';
      default:
        break;
    }

    if (signals.hbStatus === 'busy') return 'working';

    // Without a usable idle measurement there is no evidence of a stall.
    const paneIdle = signals.idleSeconds;
    if (typeof paneIdle !== 'number' || !Number.isFinite(paneIdle)) return 'working';

    const requestedIdle = thresholds?.idleThresholdSeconds;
    const idleLimit =
      typeof requestedIdle === 'number' && Number.isFinite(requestedIdle)
        ? requestedIdle
        : idleThresholdSeconds();

    const requestedStuck = thresholds?.stuckThresholdSeconds;
    const stuckLimit =
      typeof requestedStuck === 'number' && Number.isFinite(requestedStuck)
        ? requestedStuck
        : stuckThresholdSeconds();

    // Stuck is tested first so it still wins when the limits are inverted.
    if (paneIdle >= stuckLimit) {
      return 'stuck';
    }
    return paneIdle >= idleLimit ? 'idle' : 'working';
  } catch {
    return 'unknown';
  }
}
|
||||
|
||||
export interface HeartbeatInfo {
|
||||
ts: Date | null;
|
||||
pid: number | null;
|
||||
@@ -429,6 +491,7 @@ export interface AgentPsRow {
|
||||
paneCommand: string | null;
|
||||
idleSeconds: number | null;
|
||||
heartbeat: HeartbeatInfo;
|
||||
readiness: ReadinessState;
|
||||
/** roster runtime !== actual pane command */
|
||||
driftFlag: boolean;
|
||||
/** active but UnitFileState=disabled */
|
||||
@@ -1022,6 +1085,10 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
|
||||
const nowMs = Date.now();
|
||||
|
||||
const rows: AgentPsRow[] = [];
|
||||
const readinessThresholds = {
|
||||
idleThresholdSeconds: idleThresholdSeconds(),
|
||||
stuckThresholdSeconds: stuckThresholdSeconds(),
|
||||
};
|
||||
|
||||
// Build the set of roster agent names for quick lookup when filtering socket sessions.
|
||||
const rosterAgentNames = new Set(roster.agents.map((a) => a.name));
|
||||
@@ -1052,6 +1119,17 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
|
||||
const bootEnableWarning =
|
||||
sysInfo.ActiveState === 'active' && sysInfo.UnitFileState === 'disabled';
|
||||
|
||||
const paneAlive = !paneInfo.dead;
|
||||
const readiness = classifyReadiness(
|
||||
{
|
||||
paneAlive,
|
||||
hbHealth: hb.health,
|
||||
hbStatus: hb.status,
|
||||
idleSeconds: paneInfo.idleSeconds,
|
||||
},
|
||||
readinessThresholds,
|
||||
);
|
||||
|
||||
rows.push({
|
||||
name: agent.name,
|
||||
tenant_id,
|
||||
@@ -1059,11 +1137,12 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
|
||||
runtime: agent.runtime,
|
||||
systemdActive: sysInfo.ActiveState,
|
||||
systemdEnabled: sysInfo.UnitFileState,
|
||||
paneAlive: !paneInfo.dead,
|
||||
paneAlive,
|
||||
panePid: paneInfo.pid,
|
||||
paneCommand: paneInfo.command,
|
||||
idleSeconds: paneInfo.idleSeconds,
|
||||
heartbeat: hb,
|
||||
readiness,
|
||||
driftFlag,
|
||||
bootEnableWarning,
|
||||
managed: true,
|
||||
@@ -1110,6 +1189,17 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
|
||||
const bootEnableWarning =
|
||||
sysInfo.ActiveState === 'active' && sysInfo.UnitFileState === 'disabled';
|
||||
|
||||
const paneAlive = !paneInfo.dead;
|
||||
const readiness = classifyReadiness(
|
||||
{
|
||||
paneAlive,
|
||||
hbHealth: hb.health,
|
||||
hbStatus: hb.status,
|
||||
idleSeconds: paneInfo.idleSeconds,
|
||||
},
|
||||
readinessThresholds,
|
||||
);
|
||||
|
||||
rows.push({
|
||||
name: sessionName,
|
||||
tenant_id,
|
||||
@@ -1118,11 +1208,12 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
|
||||
runtime: 'unknown',
|
||||
systemdActive: sysInfo.ActiveState,
|
||||
systemdEnabled: sysInfo.UnitFileState,
|
||||
paneAlive: !paneInfo.dead,
|
||||
paneAlive,
|
||||
panePid: paneInfo.pid,
|
||||
paneCommand: paneInfo.command,
|
||||
idleSeconds: paneInfo.idleSeconds,
|
||||
heartbeat: hb,
|
||||
readiness,
|
||||
// No roster runtime to compare — drift is not meaningful for unmanaged sessions
|
||||
driftFlag: false,
|
||||
bootEnableWarning,
|
||||
@@ -1164,13 +1255,15 @@ export function registerFleetCommand(program: Command, deps: FleetCommandDeps =
|
||||
const idle = row.idleSeconds !== null ? `${row.idleSeconds}s` : '-';
|
||||
const hbAge =
|
||||
row.heartbeat.ageMs !== null
|
||||
? `${Math.round(row.heartbeat.ageMs / 1000)}s/${row.heartbeat.health}`
|
||||
? `${Math.round(row.heartbeat.ageMs / 1000)}s/${row.readiness}`
|
||||
: `unknown`;
|
||||
const model = row.heartbeat.model ?? '-';
|
||||
const flags: string[] = [];
|
||||
if (!row.managed) flags.push('UNMANAGED');
|
||||
if (row.driftFlag) flags.push('DRIFT');
|
||||
if (row.bootEnableWarning) flags.push('BOOT-ENABLE');
|
||||
if (row.readiness === 'idle') flags.push('IDLE');
|
||||
if (row.readiness === 'stuck') flags.push('STUCK');
|
||||
|
||||
console.log(
|
||||
[
|
||||
|
||||
@@ -8,6 +8,9 @@ import {
|
||||
readRosterAgentNames,
|
||||
runFrameworkReseed,
|
||||
refreshActiveFleetUnits,
|
||||
readInstalledFrameworkVersion,
|
||||
readBundledFrameworkVersion,
|
||||
checkFrameworkDrift,
|
||||
} from './update-checker.js';
|
||||
import { existsSync, readFileSync } from 'node:fs';
|
||||
|
||||
@@ -123,3 +126,73 @@ describe('refreshActiveFleetUnits', () => {
|
||||
expect(existsSync(join(configHome, 'systemd', 'user', 'mosaic-agent@.service'))).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * #642: the framework must be re-seeded when the copy on disk is older than
 * the one bundled with the CLI, even if no package is reported outdated
 * (i.e. the CLI was upgraded outside `mosaic update`).
 */
describe('framework drift detection', () => {
  let root: string; // scratch directory holding both stand-ins
  let home: string; // stand-in for ~/.config/mosaic
  let fw: string; // stand-in for the bundled framework root

  beforeEach(() => {
    root = mkdtempSync(join(tmpdir(), 'mosaic-drift-'));
    home = join(root, 'mosaic');
    fw = join(root, 'framework');
    for (const dir of [home, fw]) {
      mkdirSync(dir, { recursive: true });
    }
  });

  afterEach(() => {
    rmSync(root, { recursive: true, force: true });
  });

  /** Record `v` as the on-disk framework version. */
  function writeInstalled(v: string): void {
    writeFileSync(join(home, '.framework-version'), v);
  }

  /** Write a minimal bundled installer declaring FRAMEWORK_VERSION=`v`. */
  function writeBundled(v: string): void {
    writeFileSync(join(fw, 'install.sh'), `#!/usr/bin/env bash\nFRAMEWORK_VERSION=${v}\n`);
  }

  describe('readInstalledFrameworkVersion', () => {
    it('returns undefined when the version file is absent', () => {
      expect(readInstalledFrameworkVersion(home)).toBeUndefined();
    });

    it('parses the integer (tolerating surrounding whitespace)', () => {
      writeInstalled(' 3\n');
      expect(readInstalledFrameworkVersion(home)).toBe(3);
    });

    it('returns undefined for non-numeric content', () => {
      writeInstalled('not-a-number\n');
      expect(readInstalledFrameworkVersion(home)).toBeUndefined();
    });
  });

  describe('readBundledFrameworkVersion', () => {
    it('returns undefined when install.sh is absent', () => {
      expect(readBundledFrameworkVersion(fw)).toBeUndefined();
    });

    it('parses FRAMEWORK_VERSION=<n> from install.sh', () => {
      writeBundled('4');
      expect(readBundledFrameworkVersion(fw)).toBe(4);
    });
  });

  describe('checkFrameworkDrift', () => {
    it('reports drift when on-disk is older than bundled', () => {
      writeInstalled('3');
      writeBundled('4');
      const result = checkFrameworkDrift(home, fw);
      expect(result).toEqual({ drifted: true, installed: 3, bundled: 4 });
    });

    it('no drift when versions match', () => {
      writeInstalled('4');
      writeBundled('4');
      const result = checkFrameworkDrift(home, fw);
      expect(result).toMatchObject({ drifted: false });
    });

    it('no drift when on-disk is newer than bundled', () => {
      writeInstalled('5');
      writeBundled('4');
      const result = checkFrameworkDrift(home, fw);
      expect(result).toMatchObject({ drifted: false });
    });

    it('no drift (conservative) when a version cannot be read', () => {
      // Only the bundled side exists; the installed version file is missing.
      writeBundled('4');
      const result = checkFrameworkDrift(home, fw);
      expect(result).toMatchObject({ drifted: false, bundled: 4 });
    });
  });
});
|
||||
|
||||
@@ -521,6 +521,75 @@ export function runFrameworkReseed(
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Framework drift detection (#642) ────────────────────────────────────────
|
||||
//
|
||||
// `mosaic update` only re-seeds the framework when the @mosaicstack/mosaic
|
||||
// package itself is upgraded *within that command*. When the CLI is upgraded
|
||||
// some OTHER way — a direct `npm i -g @mosaicstack/mosaic`, or an upgrade run
|
||||
// where only sibling packages were outdated — the framework files in
|
||||
// ~/.config/mosaic stay stale and shipped launcher/runtime fixes never
|
||||
// activate. Comparing the on-disk framework schema version against the version
|
||||
// bundled in the installed package detects exactly that situation.
|
||||
|
||||
/**
 * Read the framework schema version recorded on disk in
 * `<mosaicHome>/.framework-version` (by default ~/.config/mosaic/.framework-version).
 *
 * The file must hold a single non-negative base-10 integer, optionally
 * surrounded by whitespace. Anything else — a missing or unreadable file,
 * empty content, or digits followed by other characters such as `3junk` or
 * `4.9` — is treated as "version unknown" rather than being partially parsed,
 * so a damaged file can never be mistaken for an old version.
 *
 * @param mosaicHome Directory that contains `.framework-version`.
 * @returns The recorded version, or `undefined` when it cannot be determined.
 */
export function readInstalledFrameworkVersion(
  mosaicHome = join(homedir(), '.config', 'mosaic'),
): number | undefined {
  const versionFile = join(mosaicHome, '.framework-version');
  if (!existsSync(versionFile)) return undefined;
  try {
    const text = readFileSync(versionFile, 'utf-8').trim();
    // Whole-string match: parseInt alone would silently accept a leading
    // numeric prefix and ignore whatever follows it.
    if (!/^\d+$/.test(text)) return undefined;
    const version = Number.parseInt(text, 10);
    return Number.isFinite(version) ? version : undefined;
  } catch {
    // The file vanished or is unreadable: report "unknown", never throw.
    return undefined;
  }
}
|
||||
|
||||
/**
 * Determine which framework schema version ships with the installed package.
 *
 * The value is taken from the first line of the bundled `install.sh` that
 * reads `FRAMEWORK_VERSION=<digits>` (leading whitespace allowed) — the same
 * script the installer uses to write `.framework-version`.
 *
 * @param frameworkRoot Directory that contains the bundled `install.sh`.
 * @returns The bundled version, or `undefined` when the script is missing,
 *          unreadable, or declares no numeric FRAMEWORK_VERSION.
 */
export function readBundledFrameworkVersion(
  frameworkRoot = resolveBundledFrameworkRoot(),
): number | undefined {
  const scriptPath = join(frameworkRoot, 'install.sh');
  if (!existsSync(scriptPath)) return undefined;

  let script: string;
  try {
    script = readFileSync(scriptPath, 'utf-8');
  } catch {
    // Unreadable installer: the bundled version is simply unknown.
    return undefined;
  }

  const digits = /^\s*FRAMEWORK_VERSION=(\d+)/m.exec(script)?.[1];
  if (!digits) return undefined;

  const version = parseInt(digits, 10);
  return Number.isFinite(version) ? version : undefined;
}
|
||||
|
||||
/** Outcome of comparing the on-disk framework version with the bundled one. */
export interface FrameworkDrift {
  /** True only when both versions are known AND the on-disk one is older. */
  drifted: boolean;
  /** Version read from `<mosaicHome>/.framework-version`, if readable. */
  installed?: number;
  /** Version read from the bundled `install.sh`, if readable. */
  bundled?: number;
}

/**
 * Report whether the framework on disk lags behind the framework bundled in
 * the installed CLI (#642).
 *
 * The check is deliberately conservative: when either version is unknown the
 * answer is "no drift", so a missing or unreadable version file never causes
 * an unexpected re-seed.
 *
 * @param mosaicHome    Directory holding `.framework-version`.
 * @param frameworkRoot Directory holding the bundled `install.sh`.
 * @returns The drift verdict together with both versions as read.
 */
export function checkFrameworkDrift(
  mosaicHome = join(homedir(), '.config', 'mosaic'),
  frameworkRoot = resolveBundledFrameworkRoot(),
): FrameworkDrift {
  const onDisk = readInstalledFrameworkVersion(mosaicHome);
  const shipped = readBundledFrameworkVersion(frameworkRoot);

  let drifted = false;
  if (typeof onDisk === 'number' && typeof shipped === 'number') {
    drifted = onDisk < shipped;
  }

  return { drifted, installed: onDisk, bundled: shipped };
}
|
||||
|
||||
/**
|
||||
* Best-effort parse of the fleet roster for agent names (used to relaunch
|
||||
* durable agents after a re-seed). Returns [] when no roster exists.
|
||||
|
||||
Reference in New Issue
Block a user