#!/usr/bin/env bash
# verify-sanitized.sh — blocking CI gate: the public framework package must
# contain no operator-specific personal data or private executable defaults.
#
# Two rule classes, with DELIBERATELY DIFFERENT scopes:
#   1. DENYLIST (identity) — a LABELED, one-time regression guard for the CURRENT
#      operator's identity tokens. Scanned EVERYWHERE including examples/, because a
#      jarvis/jason/private-home regression in a SHIPPED example would break the
#      open-source guarantee just as badly as one in a default. NOT a general PII
#      detector (a future operator's name can't be enumerated) — the durable control
#      is the L0 framework-PR firewall + human review; this just stops re-contamination.
#   2. STRUCTURAL (private $HOME default in *.sh) — scanned everywhere EXCEPT examples/,
#      because worked example overlays/personas legitimately show placeholder paths.
#
# File types: *.md, *.sh, *.ps1, *.json, *.yml/*.yaml, *.toml, *.env, *.service, and
# the CLI scripts under tools/_scripts/. Excludes node_modules/ and this gate file.
#
# NOTE: '\bPDA\b' intentionally matches "PDA-friendly" (the contamination removed in P2);
# a hyphen is a non-word character, so the position between "A" and "-" IS a \b word
# boundary and "PDA-foo" matches. If a future legitimate doc needs the literal token
# "PDA" in a non-personal sense, reword it or narrow this rule — do not weaken the
# gate silently.
#
# NOTE: private THIRD-PARTY host refs (e.g. a maintainer's employer Gitea) are NOT in
# this denylist — they are functionally entangled in host-routing + test fixtures and
# tracked as a separate follow-up.
#
# Usage: verify-sanitized.sh [FRAMEWORK_ROOT]
#
# Exit status:
#   0  gate passed
#   1  denylist and/or structural hits found
#   2  self-test failed (regex or scan pipeline is not working)
#   3  FRAMEWORK_ROOT missing or not enterable

# No `-e` on purpose: grep's "no match" status (1) is a normal outcome here.
set -uo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
FRAMEWORK_ROOT="${1:-$(cd "$SCRIPT_DIR/../../.." && pwd)}"
SELF_REL="tools/quality/scripts/verify-sanitized.sh"

DENYLIST='jarvis|jason|woltje|brain\.woltje\.com|/home/jwoltje|\bPDA\b'
STRUCTURAL_SH=':[-=]\$\{?HOME\}?/src/'

# An empty root (e.g. the default `cd … && pwd` failed) must not fall through to
# `cd ""`, which could leave us scanning whatever directory the caller was in.
if [[ -z "$FRAMEWORK_ROOT" ]] || ! cd -- "$FRAMEWORK_ROOT"; then
  echo "FRAMEWORK_ROOT not found: $FRAMEWORK_ROOT" >&2
  exit 3
fi

# Identity scope = ALL shipped text files (examples/ INCLUDED).
# Emits NUL-delimited paths relative to the current directory.
_files_identity() {
  find . -type f \
    \( -name '*.md' -o -name '*.sh' -o -name '*.ps1' -o -name '*.json' \
       -o -name '*.yml' -o -name '*.yaml' -o -name '*.toml' -o -name '*.env' \
       -o -name '*.service' -o -path '*/tools/_scripts/*' \) \
    -not -path '*/node_modules/*' -not -path "./$SELF_REL" -print0
}

# Structural scope = shipped scripts, examples/ EXCLUDED.
# Emits NUL-delimited paths relative to the current directory.
_files_structural() {
  find . -type f \( -name '*.sh' -o -path '*/tools/_scripts/*' \) \
    -not -path '*/examples/*' -not -path '*/node_modules/*' \
    -not -path "./$SELF_REL" -print0
}

# Print every identity-denylist hit under the current directory as
# "file:line:text" (case-insensitive match). Prints nothing when clean.
#   -H  always print the filename, even when xargs hands grep a single file
#   -I  skip binary files
# Status is forced to 0: grep/xargs report "no match" as non-zero, which is the
# normal passing case. The self-test below proves the pipeline itself works.
_scan_identity() {
  _files_identity | xargs -0 -r grep -HnIEi -e "$DENYLIST" 2>/dev/null || true
}

# Print every structural ($HOME/src default) hit under the current directory as
# "file:line:text" (case-sensitive match). Prints nothing when clean.
_scan_structural() {
  _files_structural | xargs -0 -r grep -HnIE -e "$STRUCTURAL_SH" 2>/dev/null || true
}

# ---- self-test FIRST: a broken regex must never silently no-op the gate ----
# Plants known-bad content in a scratch directory and verifies that:
#   * both regexes match their planted file (original checks),
#   * each independently testable denylist alternative matches on its own, so one
#     working alternative cannot mask a broken one (notably '\bPDA\b'),
#   * the real find|xargs|grep pipelines report the planted files with filename
#     and line number.
# Returns 0 when everything works, 1 otherwise.
_selftest() {
  local tmp rc=0 sample out
  tmp="$(mktemp -d)" || return 1

  printf 'contact jason.woltje at jarvis-brain (PDA-friendly)\n' > "$tmp/planted.md"
  printf 'X="${VAR:-$HOME/src/whatever/x.json}"\n' > "$tmp/planted.sh"

  grep -qIEi "$DENYLIST" "$tmp/planted.md" \
    || { echo "✗ SELF-TEST: identity denylist regex broken" >&2; rc=1; }
  grep -qIE "$STRUCTURAL_SH" "$tmp/planted.sh" \
    || { echo "✗ SELF-TEST: structural regex broken" >&2; rc=1; }

  # Each sample contains exactly one denylist token.
  for sample in 'jarvis' 'jason' 'woltje' 'PDA-friendly'; do
    grep -qIEi -e "$DENYLIST" <<<"$sample" \
      || { echo "✗ SELF-TEST: identity denylist alternative not matching: $sample" >&2; rc=1; }
  done

  # Exercise the exact pipelines the gate uses (the command substitution is a
  # subshell, so the cd does not leak into the caller).
  out="$(cd "$tmp" && _scan_identity)"
  [[ "$out" == *planted.md:1:* ]] \
    || { echo "✗ SELF-TEST: identity scan pipeline (find|xargs|grep) broken" >&2; rc=1; }
  out="$(cd "$tmp" && _scan_structural)"
  [[ "$out" == *planted.sh:1:* ]] \
    || { echo "✗ SELF-TEST: structural scan pipeline (find|xargs|grep) broken" >&2; rc=1; }

  rm -rf -- "$tmp"
  return "$rc"
}

_selftest || exit 2

fail=0

deny_hits="$(_scan_identity)"
if [[ -n "$deny_hits" ]]; then
  echo "✗ [denylist] operator-identity tokens in shipped files (examples/ included):"
  # Strip find's leading "./" and indent each hit.
  printf '%s\n' "$deny_hits" | sed "s#^\./##; s/^/ /"
  fail=1
fi

struct_hits="$(_scan_structural)"
if [[ -n "$struct_hits" ]]; then
  echo "✗ [structural] private \$HOME/src default in a shipped script:"
  printf '%s\n' "$struct_hits" | sed "s#^\./##; s/^/ /"
  fail=1
fi

if [[ "$fail" -ne 0 ]]; then
  echo
  echo "Sanitization gate FAILED. Public framework files must not contain operator identity" >&2
  echo "or private \$HOME defaults. Move personal content to init-generated files or genericize." >&2
  exit 1
fi

echo "✓ sanitization gate passed (identity scan incl. examples/; structural scan excl. examples/)"