feat(#160): Implement basic orchestration loop
Implements the Coordinator class with main orchestration loop: - Async loop architecture with configurable poll interval - process_queue() method gets next ready issue and spawns agent (stub) - Graceful shutdown handling with stop() method - Error handling that allows loop to continue after failures - Logging for all actions (start, stop, processing, errors) - Integration with QueueManager from #159 - Active agent tracking for future agent management Configuration settings added: - COORDINATOR_POLL_INTERVAL (default: 5.0s) - COORDINATOR_MAX_CONCURRENT_AGENTS (default: 10) - COORDINATOR_ENABLED (default: true) Tests: 27 new tests covering all acceptance criteria Coverage: 92% overall (100% for coordinator.py) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -27,6 +27,11 @@ class Settings(BaseSettings):
|
||||
# Logging
|
||||
log_level: str = "info"
|
||||
|
||||
# Coordinator Configuration
|
||||
coordinator_poll_interval: float = 5.0
|
||||
coordinator_max_concurrent_agents: int = 10
|
||||
coordinator_enabled: bool = True
|
||||
|
||||
|
||||
def get_settings() -> Settings:
|
||||
"""Get settings instance (lazy loaded)."""
|
||||
|
||||
181
apps/coordinator/src/coordinator.py
Normal file
181
apps/coordinator/src/coordinator.py
Normal file
@@ -0,0 +1,181 @@
|
||||
"""Coordinator orchestration loop for processing issue queue."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from src.queue import QueueItem, QueueManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Coordinator:
|
||||
"""Main orchestration loop for processing the issue queue.
|
||||
|
||||
The Coordinator is responsible for:
|
||||
- Monitoring the queue for ready items
|
||||
- Spawning agents to process issues (stub implementation for Phase 0)
|
||||
- Marking items as complete when processing finishes
|
||||
- Handling errors gracefully
|
||||
- Supporting graceful shutdown
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
queue_manager: QueueManager,
|
||||
poll_interval: float = 5.0,
|
||||
) -> None:
|
||||
"""Initialize the Coordinator.
|
||||
|
||||
Args:
|
||||
queue_manager: QueueManager instance for queue operations
|
||||
poll_interval: Seconds between queue polls (default: 5.0)
|
||||
"""
|
||||
self.queue_manager = queue_manager
|
||||
self.poll_interval = poll_interval
|
||||
self._running = False
|
||||
self._stop_event: asyncio.Event | None = None
|
||||
self._active_agents: dict[int, dict[str, Any]] = {}
|
||||
|
||||
@property
|
||||
def is_running(self) -> bool:
|
||||
"""Check if the coordinator is currently running.
|
||||
|
||||
Returns:
|
||||
True if the orchestration loop is running
|
||||
"""
|
||||
return self._running
|
||||
|
||||
@property
|
||||
def active_agents(self) -> dict[int, dict[str, Any]]:
|
||||
"""Get the dictionary of active agents.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping issue numbers to agent info
|
||||
"""
|
||||
return self._active_agents
|
||||
|
||||
def get_active_agent_count(self) -> int:
|
||||
"""Get the count of currently active agents.
|
||||
|
||||
Returns:
|
||||
Number of active agents
|
||||
"""
|
||||
return len(self._active_agents)
|
||||
|
||||
async def start(self) -> None:
|
||||
"""Start the orchestration loop.
|
||||
|
||||
Continuously processes the queue until stop() is called.
|
||||
"""
|
||||
self._running = True
|
||||
self._stop_event = asyncio.Event()
|
||||
logger.info("Coordinator started - beginning orchestration loop")
|
||||
|
||||
try:
|
||||
while self._running:
|
||||
try:
|
||||
await self.process_queue()
|
||||
except Exception as e:
|
||||
logger.error(f"Error in process_queue: {e}")
|
||||
# Continue running despite errors
|
||||
|
||||
# Wait for poll interval or stop signal
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._stop_event.wait(),
|
||||
timeout=self.poll_interval,
|
||||
)
|
||||
# If we reach here, stop was requested
|
||||
break
|
||||
except TimeoutError:
|
||||
# Normal timeout, continue polling
|
||||
pass
|
||||
|
||||
finally:
|
||||
self._running = False
|
||||
logger.info("Coordinator stopped")
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stop the orchestration loop gracefully.
|
||||
|
||||
Signals the loop to stop and waits for current processing to complete.
|
||||
This method is idempotent - can be called multiple times safely.
|
||||
"""
|
||||
logger.info("Coordinator stop requested")
|
||||
self._running = False
|
||||
if self._stop_event is not None:
|
||||
self._stop_event.set()
|
||||
|
||||
async def process_queue(self) -> QueueItem | None:
|
||||
"""Process the next ready item from the queue.
|
||||
|
||||
Gets the next ready item, spawns an agent to process it,
|
||||
and marks it complete on success.
|
||||
|
||||
Returns:
|
||||
The QueueItem that was processed, or None if queue is empty
|
||||
"""
|
||||
# Get next ready item
|
||||
item = self.queue_manager.get_next_ready()
|
||||
|
||||
if item is None:
|
||||
logger.debug("No items in queue to process")
|
||||
return None
|
||||
|
||||
logger.info(
|
||||
f"Processing issue #{item.issue_number} "
|
||||
f"(agent: {item.metadata.assigned_agent}, "
|
||||
f"difficulty: {item.metadata.difficulty})"
|
||||
)
|
||||
|
||||
# Mark as in progress
|
||||
self.queue_manager.mark_in_progress(item.issue_number)
|
||||
|
||||
# Spawn agent (stub implementation)
|
||||
try:
|
||||
success = await self.spawn_agent(item)
|
||||
|
||||
if success:
|
||||
# Mark as complete
|
||||
self.queue_manager.mark_complete(item.issue_number)
|
||||
logger.info(f"Issue #{item.issue_number} completed successfully")
|
||||
else:
|
||||
logger.warning(f"Issue #{item.issue_number} agent failed - remains in progress")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error spawning agent for issue #{item.issue_number}: {e}")
|
||||
# Item remains in progress on error
|
||||
|
||||
return item
|
||||
|
||||
async def spawn_agent(self, item: QueueItem) -> bool:
|
||||
"""Spawn an agent to process the given item.
|
||||
|
||||
This is a stub implementation for Phase 0 that always succeeds.
|
||||
Future phases will implement actual agent spawning.
|
||||
|
||||
Args:
|
||||
item: QueueItem containing issue details
|
||||
|
||||
Returns:
|
||||
True if agent completed successfully, False otherwise
|
||||
"""
|
||||
logger.info(
|
||||
f"[STUB] Spawning {item.metadata.assigned_agent} agent "
|
||||
f"for issue #{item.issue_number} "
|
||||
f"(estimated context: {item.metadata.estimated_context} tokens)"
|
||||
)
|
||||
|
||||
# Track the agent
|
||||
self._active_agents[item.issue_number] = {
|
||||
"agent_type": item.metadata.assigned_agent,
|
||||
"issue_number": item.issue_number,
|
||||
"status": "running",
|
||||
}
|
||||
|
||||
# Stub implementation: always succeed
|
||||
# In future phases, this will actually spawn a Claude agent process
|
||||
logger.info(f"[STUB] Agent completed for issue #{item.issue_number}")
|
||||
|
||||
return True
|
||||
@@ -1,13 +1,18 @@
|
||||
"""FastAPI application for mosaic-coordinator webhook receiver."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from collections.abc import AsyncIterator
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
|
||||
from .config import settings
|
||||
from .coordinator import Coordinator
|
||||
from .queue import QueueManager
|
||||
from .webhook import router as webhook_router
|
||||
|
||||
|
||||
@@ -26,24 +31,77 @@ def setup_logging() -> None:
|
||||
setup_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Global instances for application state
|
||||
_coordinator: Coordinator | None = None
|
||||
_coordinator_task: asyncio.Task[None] | None = None
|
||||
|
||||
|
||||
def get_coordinator() -> Coordinator | None:
|
||||
"""Get the global coordinator instance.
|
||||
|
||||
Returns:
|
||||
The Coordinator instance if initialized, None otherwise
|
||||
"""
|
||||
return _coordinator
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
||||
"""
|
||||
Application lifespan manager.
|
||||
async def lifespan(app: FastAPI) -> AsyncIterator[dict[str, Any]]:
|
||||
"""Application lifespan manager.
|
||||
|
||||
Handles startup and shutdown logic.
|
||||
Handles startup and shutdown logic including coordinator lifecycle.
|
||||
|
||||
Yields:
|
||||
State dict with shared resources
|
||||
"""
|
||||
global _coordinator, _coordinator_task
|
||||
|
||||
# Startup
|
||||
logger.info("Starting mosaic-coordinator webhook receiver")
|
||||
logger.info(f"Gitea URL: {settings.gitea_url}")
|
||||
logger.info(f"Log level: {settings.log_level}")
|
||||
logger.info(f"Server: {settings.host}:{settings.port}")
|
||||
|
||||
yield
|
||||
# Initialize queue manager
|
||||
queue_file = Path("queue.json")
|
||||
queue_manager = QueueManager(queue_file=queue_file)
|
||||
logger.info(f"Queue manager initialized (file: {queue_file})")
|
||||
|
||||
# Initialize and start coordinator if enabled
|
||||
if settings.coordinator_enabled:
|
||||
_coordinator = Coordinator(
|
||||
queue_manager=queue_manager,
|
||||
poll_interval=settings.coordinator_poll_interval,
|
||||
)
|
||||
logger.info(
|
||||
f"Coordinator initialized (poll interval: {settings.coordinator_poll_interval}s, "
|
||||
f"max agents: {settings.coordinator_max_concurrent_agents})"
|
||||
)
|
||||
|
||||
# Start coordinator in background
|
||||
_coordinator_task = asyncio.create_task(_coordinator.start())
|
||||
logger.info("Coordinator orchestration loop started")
|
||||
else:
|
||||
logger.info("Coordinator disabled via configuration")
|
||||
|
||||
yield {"queue_manager": queue_manager, "coordinator": _coordinator}
|
||||
|
||||
# Shutdown
|
||||
logger.info("Shutting down mosaic-coordinator webhook receiver")
|
||||
logger.info("Shutting down mosaic-coordinator")
|
||||
|
||||
# Stop coordinator gracefully
|
||||
if _coordinator is not None:
|
||||
logger.info("Stopping coordinator...")
|
||||
await _coordinator.stop()
|
||||
if _coordinator_task is not None:
|
||||
_coordinator_task.cancel()
|
||||
try:
|
||||
await _coordinator_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
logger.info("Coordinator stopped")
|
||||
|
||||
logger.info("Mosaic-coordinator shutdown complete")
|
||||
|
||||
|
||||
# Create FastAPI application
|
||||
@@ -60,17 +118,30 @@ class HealthResponse(BaseModel):
|
||||
|
||||
status: str
|
||||
service: str
|
||||
coordinator_running: bool = False
|
||||
active_agents: int = 0
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse)
|
||||
async def health_check() -> HealthResponse:
|
||||
"""
|
||||
Health check endpoint.
|
||||
"""Health check endpoint.
|
||||
|
||||
Returns:
|
||||
HealthResponse indicating service is healthy
|
||||
HealthResponse indicating service is healthy with coordinator status
|
||||
"""
|
||||
return HealthResponse(status="healthy", service="mosaic-coordinator")
|
||||
coordinator_running = False
|
||||
active_agents = 0
|
||||
|
||||
if _coordinator is not None:
|
||||
coordinator_running = _coordinator.is_running
|
||||
active_agents = _coordinator.get_active_agent_count()
|
||||
|
||||
return HealthResponse(
|
||||
status="healthy",
|
||||
service="mosaic-coordinator",
|
||||
coordinator_running=coordinator_running,
|
||||
active_agents=active_agents,
|
||||
)
|
||||
|
||||
|
||||
# Include webhook router
|
||||
|
||||
Reference in New Issue
Block a user