feat(#160): Implement basic orchestration loop

Implements the Coordinator class with main orchestration loop: - Async loop architecture with configurable poll interval - process_queue() method gets next ready issue and spawns agent (stub) - Graceful shutdown handling with stop() method - Error handling that allows loop to continue after failures - Logging for all actions (start, stop, processing, errors) - Integration with QueueManager from #159 - Active agent tracking for future agent management Configuration settings added: - COORDINATOR_POLL_INTERVAL (default: 5.0s) - COORDINATOR_MAX_CONCURRENT_AGENTS (default: 10) - COORDINATOR_ENABLED (default: true) Tests: 27 new tests covering all acceptance criteria Coverage: 92% overall (100% for coordinator.py) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-01 18:03:12 -06:00
parent f0fd0bed41
commit 88953fc998
9 changed files with 1043 additions and 24 deletions
--- a/apps/coordinator/src/config.py
+++ b/apps/coordinator/src/config.py
@@ -27,6 +27,11 @@ class Settings(BaseSettings):
    # Logging
    log_level: str = "info"

+    # Coordinator Configuration
+    coordinator_poll_interval: float = 5.0
+    coordinator_max_concurrent_agents: int = 10
+    coordinator_enabled: bool = True
+

 def get_settings() -> Settings:
    """Get settings instance (lazy loaded)."""
--- a/apps/coordinator/src/coordinator.py
+++ b/apps/coordinator/src/coordinator.py
@@ -0,0 +1,181 @@
+"""Coordinator orchestration loop for processing issue queue."""
+
+import asyncio
+import logging
+from typing import Any
+
+from src.queue import QueueItem, QueueManager
+
+logger = logging.getLogger(__name__)
+
+
+class Coordinator:
+    """Main orchestration loop for processing the issue queue.
+
+    The Coordinator is responsible for:
+    - Monitoring the queue for ready items
+    - Spawning agents to process issues (stub implementation for Phase 0)
+    - Marking items as complete when processing finishes
+    - Handling errors gracefully
+    - Supporting graceful shutdown
+    """
+
+    def __init__(
+        self,
+        queue_manager: QueueManager,
+        poll_interval: float = 5.0,
+    ) -> None:
+        """Initialize the Coordinator.
+
+        Args:
+            queue_manager: QueueManager instance for queue operations
+            poll_interval: Seconds between queue polls (default: 5.0)
+        """
+        self.queue_manager = queue_manager
+        self.poll_interval = poll_interval
+        self._running = False
+        self._stop_event: asyncio.Event | None = None
+        self._active_agents: dict[int, dict[str, Any]] = {}
+
+    @property
+    def is_running(self) -> bool:
+        """Check if the coordinator is currently running.
+
+        Returns:
+            True if the orchestration loop is running
+        """
+        return self._running
+
+    @property
+    def active_agents(self) -> dict[int, dict[str, Any]]:
+        """Get the dictionary of active agents.
+
+        Returns:
+            Dictionary mapping issue numbers to agent info
+        """
+        return self._active_agents
+
+    def get_active_agent_count(self) -> int:
+        """Get the count of currently active agents.
+
+        Returns:
+            Number of active agents
+        """
+        return len(self._active_agents)
+
+    async def start(self) -> None:
+        """Start the orchestration loop.
+
+        Continuously processes the queue until stop() is called.
+        """
+        self._running = True
+        self._stop_event = asyncio.Event()
+        logger.info("Coordinator started - beginning orchestration loop")
+
+        try:
+            while self._running:
+                try:
+                    await self.process_queue()
+                except Exception as e:
+                    logger.error(f"Error in process_queue: {e}")
+                    # Continue running despite errors
+
+                # Wait for poll interval or stop signal
+                try:
+                    await asyncio.wait_for(
+                        self._stop_event.wait(),
+                        timeout=self.poll_interval,
+                    )
+                    # If we reach here, stop was requested
+                    break
+                except TimeoutError:
+                    # Normal timeout, continue polling
+                    pass
+
+        finally:
+            self._running = False
+            logger.info("Coordinator stopped")
+
+    async def stop(self) -> None:
+        """Stop the orchestration loop gracefully.
+
+        Signals the loop to stop and waits for current processing to complete.
+        This method is idempotent - can be called multiple times safely.
+        """
+        logger.info("Coordinator stop requested")
+        self._running = False
+        if self._stop_event is not None:
+            self._stop_event.set()
+
+    async def process_queue(self) -> QueueItem | None:
+        """Process the next ready item from the queue.
+
+        Gets the next ready item, spawns an agent to process it,
+        and marks it complete on success.
+
+        Returns:
+            The QueueItem that was processed, or None if queue is empty
+        """
+        # Get next ready item
+        item = self.queue_manager.get_next_ready()
+
+        if item is None:
+            logger.debug("No items in queue to process")
+            return None
+
+        logger.info(
+            f"Processing issue #{item.issue_number} "
+            f"(agent: {item.metadata.assigned_agent}, "
+            f"difficulty: {item.metadata.difficulty})"
+        )
+
+        # Mark as in progress
+        self.queue_manager.mark_in_progress(item.issue_number)
+
+        # Spawn agent (stub implementation)
+        try:
+            success = await self.spawn_agent(item)
+
+            if success:
+                # Mark as complete
+                self.queue_manager.mark_complete(item.issue_number)
+                logger.info(f"Issue #{item.issue_number} completed successfully")
+            else:
+                logger.warning(f"Issue #{item.issue_number} agent failed - remains in progress")
+
+        except Exception as e:
+            logger.error(f"Error spawning agent for issue #{item.issue_number}: {e}")
+            # Item remains in progress on error
+
+        return item
+
+    async def spawn_agent(self, item: QueueItem) -> bool:
+        """Spawn an agent to process the given item.
+
+        This is a stub implementation for Phase 0 that always succeeds.
+        Future phases will implement actual agent spawning.
+
+        Args:
+            item: QueueItem containing issue details
+
+        Returns:
+            True if agent completed successfully, False otherwise
+        """
+        logger.info(
+            f"[STUB] Spawning {item.metadata.assigned_agent} agent "
+            f"for issue #{item.issue_number} "
+            f"(estimated context: {item.metadata.estimated_context} tokens)"
+        )
+
+        # Track the agent
+        self._active_agents[item.issue_number] = {
+            "agent_type": item.metadata.assigned_agent,
+            "issue_number": item.issue_number,
+            "status": "running",
+        }
+
+        # Stub implementation: always succeed
+        # In future phases, this will actually spawn a Claude agent process
+        logger.info(f"[STUB] Agent completed for issue #{item.issue_number}")
+
+        return True
--- a/apps/coordinator/src/main.py
+++ b/apps/coordinator/src/main.py
@@ -1,13 +1,18 @@
 """FastAPI application for mosaic-coordinator webhook receiver."""

+import asyncio
 import logging
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
+from pathlib import Path
+from typing import Any

 from fastapi import FastAPI
 from pydantic import BaseModel

 from .config import settings
+from .coordinator import Coordinator
+from .queue import QueueManager
 from .webhook import router as webhook_router


@@ -26,24 +31,77 @@ def setup_logging() -> None:
 setup_logging()
 logger = logging.getLogger(__name__)

+# Global instances for application state
+_coordinator: Coordinator | None = None
+_coordinator_task: asyncio.Task[None] | None = None
+
+
+def get_coordinator() -> Coordinator | None:
+    """Get the global coordinator instance.
+
+    Returns:
+        The Coordinator instance if initialized, None otherwise
+    """
+    return _coordinator
+

@asynccontextmanager
-async def lifespan(app: FastAPI) -> AsyncIterator[None]:
-    """
-    Application lifespan manager.
+async def lifespan(app: FastAPI) -> AsyncIterator[dict[str, Any]]:
+    """Application lifespan manager.

-    Handles startup and shutdown logic.
+    Handles startup and shutdown logic including coordinator lifecycle.
+
+    Yields:
+        State dict with shared resources
    """
+    global _coordinator, _coordinator_task
+
    # Startup
    logger.info("Starting mosaic-coordinator webhook receiver")
    logger.info(f"Gitea URL: {settings.gitea_url}")
    logger.info(f"Log level: {settings.log_level}")
    logger.info(f"Server: {settings.host}:{settings.port}")

-    yield
+    # Initialize queue manager
+    queue_file = Path("queue.json")
+    queue_manager = QueueManager(queue_file=queue_file)
+    logger.info(f"Queue manager initialized (file: {queue_file})")
+
+    # Initialize and start coordinator if enabled
+    if settings.coordinator_enabled:
+        _coordinator = Coordinator(
+            queue_manager=queue_manager,
+            poll_interval=settings.coordinator_poll_interval,
+        )
+        logger.info(
+            f"Coordinator initialized (poll interval: {settings.coordinator_poll_interval}s, "
+            f"max agents: {settings.coordinator_max_concurrent_agents})"
+        )
+
+        # Start coordinator in background
+        _coordinator_task = asyncio.create_task(_coordinator.start())
+        logger.info("Coordinator orchestration loop started")
+    else:
+        logger.info("Coordinator disabled via configuration")
+
+    yield {"queue_manager": queue_manager, "coordinator": _coordinator}

    # Shutdown
-    logger.info("Shutting down mosaic-coordinator webhook receiver")
+    logger.info("Shutting down mosaic-coordinator")
+
+    # Stop coordinator gracefully
+    if _coordinator is not None:
+        logger.info("Stopping coordinator...")
+        await _coordinator.stop()
+        if _coordinator_task is not None:
+            _coordinator_task.cancel()
+            try:
+                await _coordinator_task
+            except asyncio.CancelledError:
+                pass
+        logger.info("Coordinator stopped")
+
+    logger.info("Mosaic-coordinator shutdown complete")


 # Create FastAPI application
@@ -60,17 +118,30 @@ class HealthResponse(BaseModel):

    status: str
    service: str
+    coordinator_running: bool = False
+    active_agents: int = 0


@app.get("/health", response_model=HealthResponse)
 async def health_check() -> HealthResponse:
-    """
-    Health check endpoint.
+    """Health check endpoint.

    Returns:
-        HealthResponse indicating service is healthy
+        HealthResponse indicating service is healthy with coordinator status
    """
-    return HealthResponse(status="healthy", service="mosaic-coordinator")
+    coordinator_running = False
+    active_agents = 0
+
+    if _coordinator is not None:
+        coordinator_running = _coordinator.is_running
+        active_agents = _coordinator.get_active_agent_count()
+
+    return HealthResponse(
+        status="healthy",
+        service="mosaic-coordinator",
+        coordinator_running=coordinator_running,
+        active_agents=active_agents,
+    )


 # Include webhook router