add drain-mode orchestration and docs/tasks sync for codex-opencode

2026-02-17 14:41:47 -06:00
parent c7f363b2d2
commit 0ff39bcee4
15 changed files with 560 additions and 46 deletions
--- a/rails/orchestrator-matrix/README.md
+++ b/rails/orchestrator-matrix/README.md
@@ -37,6 +37,7 @@ From a bootstrapped repo:
 ```bash
 ~/.config/mosaic/bin/mosaic-orchestrator-matrix-cycle
 ~/.config/mosaic/bin/mosaic-orchestrator-run --once
+~/.config/mosaic/bin/mosaic-orchestrator-drain
 ```

 Continuous loop:
@@ -45,6 +46,20 @@ Continuous loop:
 ~/.config/mosaic/bin/mosaic-orchestrator-run --poll-sec 10
 ```

+Sync tasks from `docs/tasks.md` into the orchestrator queue (omit `--apply` for a dry run):
+
+```bash
+~/.config/mosaic/bin/mosaic-orchestrator-sync-tasks --apply
+```
+
+Set worker command when needed:
+
+```bash
+export MOSAIC_WORKER_EXEC="codex -p"
+# or
+export MOSAIC_WORKER_EXEC="opencode -p"
+```
+
 Publish new orchestrator events to Matrix:

 ```bash
--- a/rails/orchestrator-matrix/controller/mosaic_orchestrator.py
+++ b/rails/orchestrator-matrix/controller/mosaic_orchestrator.py
@@ -65,7 +65,7 @@ def emit_event(
    )


-def run_shell(command: str, cwd: pathlib.Path, log_path: pathlib.Path) -> tuple[int, str]:
+def run_shell(command: str, cwd: pathlib.Path, log_path: pathlib.Path, timeout_sec: int) -> tuple[int, str, bool]:
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as log:
        log.write(f"\n[{now_iso()}] COMMAND: {command}\n")
@@ -78,14 +78,21 @@ def run_shell(command: str, cwd: pathlib.Path, log_path: pathlib.Path) -> tuple[
            text=True,
            encoding="utf-8",
        )
-        output_chunks: list[str] = []
-        assert proc.stdout is not None
-        for line in proc.stdout:
-            output_chunks.append(line)
-            log.write(line)
-        code = proc.wait()
+        timed_out = False
+        try:
+            output, _ = proc.communicate(timeout=max(1, timeout_sec))
+            code = proc.returncode
+        except subprocess.TimeoutExpired:
+            timed_out = True
+            proc.kill()
+            output, _ = proc.communicate()
+            code = 124
+            log.write(f"[{now_iso()}] TIMEOUT: exceeded {timeout_sec}s\n")
+
+        if output:
+            log.write(output)
        log.write(f"[{now_iso()}] EXIT: {code}\n")
-    return code, "".join(output_chunks)
+    return code, output or "", timed_out


 def render_command_template(template: str, task: dict[str, Any], task_file: pathlib.Path) -> str:
@@ -96,9 +103,26 @@ def render_command_template(template: str, task: dict[str, Any], task_file: path
    )


+def parse_dep_list(raw: Any) -> list[str]:  # normalise a depends_on value to a list of ids
+    if isinstance(raw, list):
+        return [str(x).strip() for x in raw if str(x).strip()]
+    if isinstance(raw, str):
+        return [x.strip() for x in raw.split(",") if x.strip()]  # comma-separated string form
+    return []  # None or any other type: no dependencies
+
+
+def is_completed_status(status: str) -> bool:  # statuses that satisfy a dependency check
+    return status in {"completed", "done"}
+
+
 def pick_next_task(tasks: list[dict[str, Any]]) -> dict[str, Any] | None:
+    status_by_id = {str(t.get("id", "")): str(t.get("status", "")) for t in tasks}
    for task in tasks:
-        if task.get("status", "pending") == "pending":
+        if task.get("status", "pending") != "pending":
+            continue
+        deps = parse_dep_list(task.get("depends_on"))
+        deps_ready = all(is_completed_status(status_by_id.get(dep, "")) for dep in deps)
+        if deps_ready:
            return task
    return None

@@ -120,6 +144,10 @@ def run_single_task(repo_root: pathlib.Path, orch_dir: pathlib.Path, config: dic
        return False

    task_id = str(task.get("id", "unknown-task"))
+    max_attempts = int(task.get("max_attempts") or config.get("worker", {}).get("max_attempts") or 1)
+    attempt = int(task.get("attempts", 0)) + 1
+    task["attempts"] = attempt
+    task["max_attempts"] = max_attempts
    task["status"] = "running"
    task["started_at"] = now_iso()
    save_json(tasks_path, {"tasks": task_items})
@@ -153,17 +181,33 @@ def run_single_task(repo_root: pathlib.Path, orch_dir: pathlib.Path, config: dic
        save_json(state_path, state)
        return True

-    rc, _ = run_shell(cmd, repo_root, log_path)
+    timeout_sec = int(task.get("timeout_seconds") or config.get("worker", {}).get("timeout_seconds") or 7200)
+    rc, _, timed_out = run_shell(cmd, repo_root, log_path, timeout_sec)
    if rc != 0:
-        task["status"] = "failed"
-        task["failed_at"] = now_iso()
-        task["error"] = f"Worker command failed with exit code {rc}"
+        task["error"] = f"Worker command timed out after {timeout_sec}s" if timed_out else f"Worker command failed with exit code {rc}"
+        if attempt < max_attempts:
+            task["status"] = "pending"
+            task["last_failed_at"] = now_iso()
+            emit_event(
+                events_path,
+                "task.retry.scheduled",
+                task_id,
+                "pending",
+                "worker",
+                f"{task['error']}; retry {attempt + 1}/{max_attempts}",
+            )
+        else:
+            task["status"] = "failed"
+            task["failed_at"] = now_iso()
+            emit_event(events_path, "task.failed", task_id, "failed", "worker", task["error"])
        save_json(tasks_path, {"tasks": task_items})
-        emit_event(events_path, "task.failed", task_id, "failed", "worker", task["error"])
        state["running_task_id"] = None
        state["updated_at"] = now_iso()
        save_json(state_path, state)
-        save_json(results_dir / f"{task_id}.json", {"task_id": task_id, "status": "failed", "exit_code": rc})
+        save_json(
+            results_dir / f"{task_id}.json",
+            {"task_id": task_id, "status": task["status"], "exit_code": rc, "attempt": attempt, "max_attempts": max_attempts},
+        )
        return True

    gates = task.get("quality_gates") or config.get("quality_gates") or []
@@ -174,7 +218,7 @@ def run_single_task(repo_root: pathlib.Path, orch_dir: pathlib.Path, config: dic
        if not gate_cmd:
            continue
        emit_event(events_path, "rail.check.started", task_id, "running", "quality-gate", f"Running gate: {gate_cmd}")
-        gate_rc, _ = run_shell(gate_cmd, repo_root, log_path)
+        gate_rc, _, gate_timed_out = run_shell(gate_cmd, repo_root, log_path, timeout_sec)
        if gate_rc == 0:
            emit_event(events_path, "rail.check.passed", task_id, "running", "quality-gate", f"Gate passed: {gate_cmd}")
        else:
@@ -185,7 +229,7 @@ def run_single_task(repo_root: pathlib.Path, orch_dir: pathlib.Path, config: dic
                task_id,
                "failed",
                "quality-gate",
-                f"Gate failed ({gate_rc}): {gate_cmd}",
+                f"Gate timed out after {timeout_sec}s: {gate_cmd}" if gate_timed_out else f"Gate failed ({gate_rc}): {gate_cmd}",
            )
        gate_results.append({"command": gate_cmd, "exit_code": gate_rc})

@@ -194,10 +238,22 @@ def run_single_task(repo_root: pathlib.Path, orch_dir: pathlib.Path, config: dic
        task["completed_at"] = now_iso()
        emit_event(events_path, "task.completed", task_id, "completed", "controller", "Task completed")
    else:
-        task["status"] = "failed"
-        task["failed_at"] = now_iso()
        task["error"] = "One or more quality gates failed"
-        emit_event(events_path, "task.failed", task_id, "failed", "controller", task["error"])
+        if attempt < max_attempts:
+            task["status"] = "pending"
+            task["last_failed_at"] = now_iso()
+            emit_event(
+                events_path,
+                "task.retry.scheduled",
+                task_id,
+                "pending",
+                "controller",
+                f"{task['error']}; retry {attempt + 1}/{max_attempts}",
+            )
+        else:
+            task["status"] = "failed"
+            task["failed_at"] = now_iso()
+            emit_event(events_path, "task.failed", task_id, "failed", "controller", task["error"])

    save_json(tasks_path, {"tasks": task_items})
    state["running_task_id"] = None
@@ -216,10 +272,33 @@ def run_single_task(repo_root: pathlib.Path, orch_dir: pathlib.Path, config: dic
    return True


+def queue_state(orch_dir: pathlib.Path) -> dict[str, int]:  # count pending / running / runnable tasks in tasks.json
+    tasks = load_json(orch_dir / "tasks.json", {"tasks": []})
+    task_items = tasks.get("tasks", [])
+    if not isinstance(task_items, list):
+        return {"pending": 0, "running": 0, "runnable": 0}  # non-list "tasks" value is treated as an empty queue
+
+    pending = 0
+    running = 0
+    runnable = 0
+    status_by_id = {str(t.get("id", "")): str(t.get("status", "")) for t in task_items}
+    for task in task_items:
+        status = str(task.get("status", "pending"))  # a task without a status counts as pending
+        if status == "pending":
+            pending += 1
+            deps = parse_dep_list(task.get("depends_on"))
+            if all(is_completed_status(status_by_id.get(dep, "")) for dep in deps):  # unknown dep ids never count as ready
+                runnable += 1
+        if status == "running":
+            running += 1
+    return {"pending": pending, "running": running, "runnable": runnable}
+
+
 def main() -> int:
    parser = argparse.ArgumentParser(description="Mosaic deterministic orchestrator controller")
    parser.add_argument("--repo", default=os.getcwd(), help="Repository root (default: cwd)")
    parser.add_argument("--once", action="store_true", help="Process at most one pending task and exit")
+    parser.add_argument("--until-drained", action="store_true", help="Run until no pending tasks remain (or blocked)")
    parser.add_argument("--poll-sec", type=int, default=10, help="Polling interval for continuous mode")
    args = parser.parse_args()

@@ -246,6 +325,14 @@ def main() -> int:
        try:
            processed = run_single_task(repo_root, orch_dir, config)
            if not processed:
+                qs = queue_state(orch_dir)
+                if args.until_drained:
+                    if qs["pending"] == 0 and qs["running"] == 0:
+                        print("[mosaic-orchestrator] drained: no pending tasks")
+                        return 0
+                    if qs["pending"] > 0 and qs["runnable"] == 0 and qs["running"] == 0:
+                        print("[mosaic-orchestrator] blocked: pending tasks remain but dependencies are unmet", file=sys.stderr)
+                        return 2
                time.sleep(max(1, args.poll_sec))
        except KeyboardInterrupt:
            print("\n[mosaic-orchestrator] stopping")
--- a/rails/orchestrator-matrix/controller/tasks_md_sync.py
+++ b/rails/orchestrator-matrix/controller/tasks_md_sync.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""Sync docs/tasks.md rows into .mosaic/orchestrator/tasks.json."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import pathlib
+from typing import Any
+
+
+def load_json(path: pathlib.Path, default: Any) -> Any:  # parsed JSON, or `default` when the file is absent
+    if not path.exists():
+        return default
+    with path.open("r", encoding="utf-8") as f:
+        return json.load(f)  # malformed JSON is not caught here; the decode error propagates
+
+
+def save_json(path: pathlib.Path, data: Any) -> None:  # write JSON via a sibling .tmp file, then rename
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_suffix(path.suffix + ".tmp")  # e.g. tasks.json -> tasks.json.tmp
+    with tmp.open("w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2)
+        f.write("\n")
+    tmp.replace(path)  # rename over the target so a partial write never lands at `path`
+
+
+def split_pipe_row(line: str) -> list[str]:  # "| a | b |" -> ["a", "b"]
+    row = line.strip()
+    if row.startswith("|"):
+        row = row[1:]
+    if row.endswith("|"):
+        row = row[:-1]
+    return [c.strip() for c in row.split("|")]  # NOTE: an escaped pipe (\|) inside a cell is split as well
+
+
+def parse_tasks_markdown(path: pathlib.Path) -> list[dict[str, str]]:
+    """Return the rows of the first id/status/description pipe table in *path*.
+
+    Each row is a dict keyed by lower-cased header; a missing file or table yields [].
+    """
+    if not path.exists():
+        return []
+    lines = path.read_text(encoding="utf-8").splitlines()
+
+    header_idx = -1
+    headers: list[str] = []
+    for i, line in enumerate(lines):
+        cells = [x.lower() for x in split_pipe_row(line)] if "|" in line else []
+        if {"id", "status", "description"} <= set(cells):
+            header_idx, headers = i, cells
+            break
+    if header_idx < 0:
+        return []
+
+    rows: list[dict[str, str]] = []
+    for line in lines[header_idx + 1 :]:
+        if not line.strip().startswith("|"):
+            if rows:
+                break
+            continue
+        cells = split_pipe_row(line)
+        cells += [""] * (len(headers) - len(cells))  # pad short rows; no-op otherwise
+        row = dict(zip(headers, cells))
+        task_id = row.get("id", "").strip()
+        # Skip repeated headers and `|---|` delimiter rows wherever they appear.
+        if task_id and task_id.lower() != "id" and task_id.strip("-: "):
+            rows.append(row)
+    return rows
+
+
+def map_status(raw: str) -> str:  # docs/tasks.md status cell -> orchestrator queue status
+    value = raw.strip().lower()
+    mapping = {
+        "not-started": "pending",
+        "todo": "pending",
+        "pending": "pending",
+        "in-progress": "pending",  # docs-side in-flight states are queued as pending
+        "needs-qa": "pending",
+        "done": "completed",
+        "completed": "completed",
+        "failed": "failed",
+    }
+    return mapping.get(value, "pending")  # unrecognised statuses also fall back to pending
+
+
+def parse_depends(raw: str) -> list[str]:  # "A, B," -> ["A", "B"]; blank entries are dropped
+    return [x.strip() for x in raw.split(",") if x.strip()]
+
+
+def build_task(row: dict[str, str], existing: dict[str, Any], runtime_default: str) -> dict[str, Any]:  # merge one docs row over its existing JSON entry
+    task_id = row.get("id", "").strip()
+    description = row.get("description", "").strip()
+    issue = row.get("issue", "").strip()
+    repo = row.get("repo", "").strip()
+    branch = row.get("branch", "").strip()
+    depends_on = parse_depends(row.get("depends_on", ""))
+
+    task = dict(existing)  # shallow copy: keys not assigned below are carried over unchanged
+    task["id"] = task_id
+    task["title"] = description or task_id
+    task["description"] = description
+    task["status"] = map_status(row.get("status", "pending"))  # always taken from the docs row, replacing any stored status
+    task["depends_on"] = depends_on
+    task["runtime"] = str(task.get("runtime") or runtime_default or "codex")  # a stored runtime wins over the default
+    task["command"] = str(task.get("command") or "")
+    task["quality_gates"] = task.get("quality_gates") or []
+    metadata = dict(task.get("metadata") or {})
+    metadata.update(  # these four keys are overwritten; other stored metadata keys survive
+        {
+            "source": "docs/tasks.md",
+            "issue": issue,
+            "repo": repo,
+            "branch": branch,
+        }
+    )
+    task["metadata"] = metadata
+    return task
+
+
+def main() -> int:
+    """Rebuild the orchestrator task list from docs/tasks.md; write it only with --apply."""
+    parser = argparse.ArgumentParser(description="Sync docs/tasks.md into .mosaic/orchestrator/tasks.json")
+    parser.add_argument("--repo", default=os.getcwd(), help="Repository root (default: cwd)")
+    parser.add_argument("--docs", default="docs/tasks.md", help="Path to tasks markdown (repo-relative)")
+    parser.add_argument("--tasks-json", default=".mosaic/orchestrator/tasks.json", help="Path to orchestrator tasks JSON (repo-relative)")
+    parser.add_argument("--keep-unlisted", action="store_true", help="Retain tasks already in JSON but missing from docs/tasks.md")
+    parser.add_argument("--apply", action="store_true", help="Write changes (default is dry-run)")
+    args = parser.parse_args()
+
+    repo = pathlib.Path(args.repo).resolve()
+    docs_path = (repo / args.docs).resolve()
+    tasks_path = (repo / args.tasks_json).resolve()
+    config_path = repo / ".mosaic" / "orchestrator" / "config.json"
+    config = load_json(config_path, {})
+    runtime_default = str(config.get("worker", {}).get("runtime") or "codex")
+
+    rows = parse_tasks_markdown(docs_path)
+    existing_payload = load_json(tasks_path, {"tasks": []})
+    existing_tasks = existing_payload.get("tasks", [])
+    if not isinstance(existing_tasks, list):
+        existing_tasks = []
+    existing_by_id = {str(t.get("id", "")): t for t in existing_tasks}
+
+    # Zero parsed rows (missing file or table) would silently wipe the queue on --apply.
+    if args.apply and not rows and existing_tasks and not args.keep_unlisted:
+        raise SystemExit(f"[mosaic-orchestrator-sync] no task rows found in {docs_path}; refusing to empty {tasks_path}")
+
+    out_tasks: list[dict[str, Any]] = []
+    seen: set[str] = set()
+    for row in rows:
+        task_id = row.get("id", "").strip()
+        if not task_id:
+            continue
+        seen.add(task_id)
+        out_tasks.append(build_task(row, existing_by_id.get(task_id, {}), runtime_default))
+
+    if args.keep_unlisted:
+        for task in existing_tasks:
+            task_id = str(task.get("id", ""))
+            if task_id and task_id not in seen:
+                out_tasks.append(task)
+
+    if args.apply:
+        save_json(tasks_path, {"tasks": out_tasks})
+        print(f"[mosaic-orchestrator-sync] wrote {len(out_tasks)} tasks -> {tasks_path}")
+    else:
+        print(f"[mosaic-orchestrator-sync] dry-run: {len(out_tasks)} tasks would be written -> {tasks_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())