fix(#338): Add circuit breaker to coordinator loops

Implement circuit breaker pattern to prevent infinite retry loops on repeated failures (SEC-ORCH-7). The circuit breaker tracks consecutive failures and opens after a threshold is reached, blocking further requests until a cooldown period elapses. Circuit breaker states: - CLOSED: Normal operation, requests pass through - OPEN: After N consecutive failures, all requests blocked - HALF_OPEN: After cooldown, allow one test request Changes: - Add circuit_breaker.py with CircuitBreaker class - Integrate circuit breaker into Coordinator.start() loop - Integrate circuit breaker into OrchestrationLoop.start() loop - Integrate per-agent circuit breakers into ContextMonitor - Add comprehensive tests for circuit breaker behavior - Log state transitions and circuit breaker stats on shutdown Configuration (defaults): - failure_threshold: 5 consecutive failures - cooldown_seconds: 30 seconds Refs #338 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 18:10:38 -06:00
parent 203bd1e7f2
commit 1852fe2812
5 changed files with 1219 additions and 11 deletions
--- a/apps/coordinator/tests/test_circuit_breaker.py
+++ b/apps/coordinator/tests/test_circuit_breaker.py
@@ -0,0 +1,495 @@
+"""Tests for circuit breaker pattern implementation.
+
+These tests verify the circuit breaker behavior:
+- State transitions (closed -> open -> half_open -> closed)
+- Failure counting and threshold detection
+- Cooldown timing
+- Success/failure recording
+- Execute wrapper method
+"""
+
+import asyncio
+import time
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from src.circuit_breaker import CircuitBreaker, CircuitBreakerError, CircuitState
+
+
+class TestCircuitBreakerInitialization:
+    """Tests for CircuitBreaker initialization."""
+
+    def test_default_initialization(self) -> None:
+        """Test circuit breaker initializes with default values."""
+        cb = CircuitBreaker("test")
+
+        assert cb.name == "test"
+        assert cb.failure_threshold == 5
+        assert cb.cooldown_seconds == 30.0
+        assert cb.state == CircuitState.CLOSED
+        assert cb.failure_count == 0
+
+    def test_custom_initialization(self) -> None:
+        """Test circuit breaker with custom values."""
+        cb = CircuitBreaker(
+            name="custom",
+            failure_threshold=3,
+            cooldown_seconds=10.0,
+        )
+
+        assert cb.name == "custom"
+        assert cb.failure_threshold == 3
+        assert cb.cooldown_seconds == 10.0
+
+    def test_initial_state_is_closed(self) -> None:
+        """Test circuit starts in closed state."""
+        cb = CircuitBreaker("test")
+        assert cb.state == CircuitState.CLOSED
+
+    def test_initial_can_execute_is_true(self) -> None:
+        """Test can_execute returns True initially."""
+        cb = CircuitBreaker("test")
+        assert cb.can_execute() is True
+
+
+class TestCircuitBreakerFailureTracking:
+    """Tests for failure tracking behavior."""
+
+    def test_failure_increments_count(self) -> None:
+        """Test that recording failure increments failure count."""
+        cb = CircuitBreaker("test", failure_threshold=5)
+
+        cb.record_failure()
+        assert cb.failure_count == 1
+
+        cb.record_failure()
+        assert cb.failure_count == 2
+
+    def test_success_resets_failure_count(self) -> None:
+        """Test that recording success resets failure count."""
+        cb = CircuitBreaker("test", failure_threshold=5)
+
+        cb.record_failure()
+        cb.record_failure()
+        assert cb.failure_count == 2
+
+        cb.record_success()
+        assert cb.failure_count == 0
+
+    def test_total_failures_tracked(self) -> None:
+        """Test that total failures are tracked separately."""
+        cb = CircuitBreaker("test", failure_threshold=5)
+
+        cb.record_failure()
+        cb.record_failure()
+        cb.record_success()  # Resets consecutive count
+        cb.record_failure()
+
+        assert cb.failure_count == 1  # Consecutive
+        assert cb.total_failures == 3  # Total
+
+    def test_total_successes_tracked(self) -> None:
+        """Test that total successes are tracked."""
+        cb = CircuitBreaker("test")
+
+        cb.record_success()
+        cb.record_success()
+        cb.record_failure()
+        cb.record_success()
+
+        assert cb.total_successes == 3
+
+
+class TestCircuitBreakerStateTransitions:
+    """Tests for state transition behavior."""
+
+    def test_reaches_threshold_opens_circuit(self) -> None:
+        """Test circuit opens when failure threshold is reached."""
+        cb = CircuitBreaker("test", failure_threshold=3)
+
+        cb.record_failure()
+        assert cb.state == CircuitState.CLOSED
+
+        cb.record_failure()
+        assert cb.state == CircuitState.CLOSED
+
+        cb.record_failure()
+        assert cb.state == CircuitState.OPEN
+
+    def test_open_circuit_blocks_execution(self) -> None:
+        """Test that open circuit blocks can_execute."""
+        cb = CircuitBreaker("test", failure_threshold=2)
+
+        cb.record_failure()
+        cb.record_failure()
+
+        assert cb.state == CircuitState.OPEN
+        assert cb.can_execute() is False
+
+    def test_cooldown_transitions_to_half_open(self) -> None:
+        """Test that cooldown period transitions circuit to half-open."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        cb.record_failure()
+        cb.record_failure()
+        assert cb.state == CircuitState.OPEN
+
+        # Wait for cooldown
+        time.sleep(0.15)
+
+        # Accessing state triggers transition
+        assert cb.state == CircuitState.HALF_OPEN
+
+    def test_half_open_allows_one_request(self) -> None:
+        """Test that half-open state allows test request."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        cb.record_failure()
+        cb.record_failure()
+
+        time.sleep(0.15)
+
+        assert cb.state == CircuitState.HALF_OPEN
+        assert cb.can_execute() is True
+
+    def test_half_open_success_closes_circuit(self) -> None:
+        """Test that success in half-open state closes circuit."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        cb.record_failure()
+        cb.record_failure()
+
+        time.sleep(0.15)
+        assert cb.state == CircuitState.HALF_OPEN
+
+        cb.record_success()
+        assert cb.state == CircuitState.CLOSED
+
+    def test_half_open_failure_reopens_circuit(self) -> None:
+        """Test that failure in half-open state reopens circuit."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        cb.record_failure()
+        cb.record_failure()
+
+        time.sleep(0.15)
+        assert cb.state == CircuitState.HALF_OPEN
+
+        cb.record_failure()
+        assert cb.state == CircuitState.OPEN
+
+    def test_state_transitions_counted(self) -> None:
+        """Test that state transitions are counted."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        assert cb.state_transitions == 0
+
+        cb.record_failure()
+        cb.record_failure()  # -> OPEN
+        assert cb.state_transitions == 1
+
+        time.sleep(0.15)
+        _ = cb.state  # -> HALF_OPEN
+        assert cb.state_transitions == 2
+
+        cb.record_success()  # -> CLOSED
+        assert cb.state_transitions == 3
+
+
+class TestCircuitBreakerCooldown:
+    """Tests for cooldown timing behavior."""
+
+    def test_time_until_retry_when_open(self) -> None:
+        """Test time_until_retry reports correct value when open."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=1.0)
+
+        cb.record_failure()
+        cb.record_failure()
+
+        # Should be approximately 1 second
+        assert 0.9 <= cb.time_until_retry <= 1.0
+
+    def test_time_until_retry_decreases(self) -> None:
+        """Test time_until_retry decreases over time."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=1.0)
+
+        cb.record_failure()
+        cb.record_failure()
+
+        initial = cb.time_until_retry
+        time.sleep(0.2)
+        after = cb.time_until_retry
+
+        assert after < initial
+
+    def test_time_until_retry_zero_when_closed(self) -> None:
+        """Test time_until_retry is 0 when circuit is closed."""
+        cb = CircuitBreaker("test")
+        assert cb.time_until_retry == 0.0
+
+    def test_time_until_retry_zero_when_half_open(self) -> None:
+        """Test time_until_retry is 0 when circuit is half-open."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        cb.record_failure()
+        cb.record_failure()
+        time.sleep(0.15)
+
+        assert cb.state == CircuitState.HALF_OPEN
+        assert cb.time_until_retry == 0.0
+
+
+class TestCircuitBreakerReset:
+    """Tests for manual reset behavior."""
+
+    def test_reset_closes_circuit(self) -> None:
+        """Test that reset closes an open circuit."""
+        cb = CircuitBreaker("test", failure_threshold=2)
+
+        cb.record_failure()
+        cb.record_failure()
+        assert cb.state == CircuitState.OPEN
+
+        cb.reset()
+        assert cb.state == CircuitState.CLOSED
+
+    def test_reset_clears_failure_count(self) -> None:
+        """Test that reset clears failure count."""
+        cb = CircuitBreaker("test", failure_threshold=5)
+
+        cb.record_failure()
+        cb.record_failure()
+        assert cb.failure_count == 2
+
+        cb.reset()
+        assert cb.failure_count == 0
+
+    def test_reset_from_half_open(self) -> None:
+        """Test reset from half-open state."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        cb.record_failure()
+        cb.record_failure()
+        time.sleep(0.15)
+        assert cb.state == CircuitState.HALF_OPEN
+
+        cb.reset()
+        assert cb.state == CircuitState.CLOSED
+
+
+class TestCircuitBreakerStats:
+    """Tests for statistics reporting."""
+
+    def test_get_stats_returns_all_fields(self) -> None:
+        """Test get_stats returns complete statistics."""
+        cb = CircuitBreaker("test", failure_threshold=3, cooldown_seconds=15.0)
+
+        stats = cb.get_stats()
+
+        assert stats["name"] == "test"
+        assert stats["state"] == "closed"
+        assert stats["failure_count"] == 0
+        assert stats["failure_threshold"] == 3
+        assert stats["cooldown_seconds"] == 15.0
+        assert stats["time_until_retry"] == 0.0
+        assert stats["total_failures"] == 0
+        assert stats["total_successes"] == 0
+        assert stats["state_transitions"] == 0
+
+    def test_stats_update_after_operations(self) -> None:
+        """Test stats update correctly after operations."""
+        cb = CircuitBreaker("test", failure_threshold=3)
+
+        cb.record_failure()
+        cb.record_success()
+        cb.record_failure()
+        cb.record_failure()
+        cb.record_failure()  # Opens circuit
+
+        stats = cb.get_stats()
+
+        assert stats["state"] == "open"
+        assert stats["failure_count"] == 3
+        assert stats["total_failures"] == 4
+        assert stats["total_successes"] == 1
+        assert stats["state_transitions"] == 1
+
+
+class TestCircuitBreakerError:
+    """Tests for CircuitBreakerError exception."""
+
+    def test_error_contains_state(self) -> None:
+        """Test error contains circuit state."""
+        error = CircuitBreakerError(CircuitState.OPEN, 10.0)
+        assert error.state == CircuitState.OPEN
+
+    def test_error_contains_retry_time(self) -> None:
+        """Test error contains time until retry."""
+        error = CircuitBreakerError(CircuitState.OPEN, 10.5)
+        assert error.time_until_retry == 10.5
+
+    def test_error_message_formatting(self) -> None:
+        """Test error message is properly formatted."""
+        error = CircuitBreakerError(CircuitState.OPEN, 15.3)
+        assert "open" in str(error)
+        assert "15.3" in str(error)
+
+
+class TestCircuitBreakerExecute:
+    """Tests for the execute wrapper method."""
+
+    @pytest.mark.asyncio
+    async def test_execute_calls_function(self) -> None:
+        """Test execute calls the provided function."""
+        cb = CircuitBreaker("test")
+        mock_func = AsyncMock(return_value="success")
+
+        result = await cb.execute(mock_func, "arg1", kwarg="value")
+
+        mock_func.assert_called_once_with("arg1", kwarg="value")
+        assert result == "success"
+
+    @pytest.mark.asyncio
+    async def test_execute_records_success(self) -> None:
+        """Test execute records success on successful call."""
+        cb = CircuitBreaker("test")
+        mock_func = AsyncMock(return_value="ok")
+
+        await cb.execute(mock_func)
+
+        assert cb.total_successes == 1
+
+    @pytest.mark.asyncio
+    async def test_execute_records_failure(self) -> None:
+        """Test execute records failure when function raises."""
+        cb = CircuitBreaker("test")
+        mock_func = AsyncMock(side_effect=RuntimeError("test error"))
+
+        with pytest.raises(RuntimeError):
+            await cb.execute(mock_func)
+
+        assert cb.failure_count == 1
+
+    @pytest.mark.asyncio
+    async def test_execute_raises_when_open(self) -> None:
+        """Test execute raises CircuitBreakerError when circuit is open."""
+        cb = CircuitBreaker("test", failure_threshold=2)
+
+        mock_func = AsyncMock(side_effect=RuntimeError("fail"))
+
+        with pytest.raises(RuntimeError):
+            await cb.execute(mock_func)
+        with pytest.raises(RuntimeError):
+            await cb.execute(mock_func)
+
+        # Circuit should now be open
+        assert cb.state == CircuitState.OPEN
+
+        # Next call should raise CircuitBreakerError
+        with pytest.raises(CircuitBreakerError) as exc_info:
+            await cb.execute(mock_func)
+
+        assert exc_info.value.state == CircuitState.OPEN
+
+    @pytest.mark.asyncio
+    async def test_execute_allows_half_open_test(self) -> None:
+        """Test execute allows test request in half-open state."""
+        cb = CircuitBreaker("test", failure_threshold=2, cooldown_seconds=0.1)
+
+        mock_func = AsyncMock(side_effect=RuntimeError("fail"))
+
+        with pytest.raises(RuntimeError):
+            await cb.execute(mock_func)
+        with pytest.raises(RuntimeError):
+            await cb.execute(mock_func)
+
+        # Wait for cooldown
+        await asyncio.sleep(0.15)
+        assert cb.state == CircuitState.HALF_OPEN
+
+        # Should allow test request
+        mock_func.side_effect = None
+        mock_func.return_value = "recovered"
+
+        result = await cb.execute(mock_func)
+        assert result == "recovered"
+        assert cb.state == CircuitState.CLOSED
+
+
+class TestCircuitBreakerConcurrency:
+    """Tests for thread safety and concurrent access."""
+
+    @pytest.mark.asyncio
+    async def test_concurrent_failures(self) -> None:
+        """Test concurrent failures are handled correctly."""
+        cb = CircuitBreaker("test", failure_threshold=10)
+
+        async def record_failure() -> None:
+            cb.record_failure()
+
+        # Record 10 concurrent failures
+        await asyncio.gather(*[record_failure() for _ in range(10)])
+
+        assert cb.failure_count >= 10
+        assert cb.state == CircuitState.OPEN
+
+    @pytest.mark.asyncio
+    async def test_concurrent_mixed_operations(self) -> None:
+        """Test concurrent mixed success/failure operations."""
+        cb = CircuitBreaker("test", failure_threshold=100)
+
+        async def record_success() -> None:
+            cb.record_success()
+
+        async def record_failure() -> None:
+            cb.record_failure()
+
+        # Mix of operations
+        tasks = [record_failure() for _ in range(5)]
+        tasks.extend([record_success() for _ in range(3)])
+        tasks.extend([record_failure() for _ in range(5)])
+
+        await asyncio.gather(*tasks)
+
+        # At least some of each should have been recorded
+        assert cb.total_failures >= 5
+        assert cb.total_successes >= 1
+
+
+class TestCircuitBreakerLogging:
+    """Tests for logging behavior."""
+
+    def test_logs_state_transitions(self) -> None:
+        """Test that state transitions are logged."""
+        cb = CircuitBreaker("test", failure_threshold=2)
+
+        with patch("src.circuit_breaker.logger") as mock_logger:
+            cb.record_failure()
+            cb.record_failure()
+
+            # Should have logged the transition to OPEN
+            mock_logger.info.assert_called()
+            calls = [str(c) for c in mock_logger.info.call_args_list]
+            assert any("closed -> open" in c for c in calls)
+
+    def test_logs_failure_warnings(self) -> None:
+        """Test that failures are logged as warnings."""
+        cb = CircuitBreaker("test", failure_threshold=5)
+
+        with patch("src.circuit_breaker.logger") as mock_logger:
+            cb.record_failure()
+
+            mock_logger.warning.assert_called()
+
+    def test_logs_threshold_reached_as_error(self) -> None:
+        """Test that reaching threshold is logged as error."""
+        cb = CircuitBreaker("test", failure_threshold=2)
+
+        with patch("src.circuit_breaker.logger") as mock_logger:
+            cb.record_failure()
+            cb.record_failure()
+
+            mock_logger.error.assert_called()
+            calls = [str(c) for c in mock_logger.error.call_args_list]
+            assert any("threshold reached" in c for c in calls)
--- a/apps/coordinator/tests/test_coordinator.py
+++ b/apps/coordinator/tests/test_coordinator.py
@@ -744,3 +744,186 @@ class TestCoordinatorActiveAgents:
        await coordinator.process_queue()

        assert coordinator.get_active_agent_count() == 3
+
+
+class TestCoordinatorCircuitBreaker:
+    """Tests for Coordinator circuit breaker integration (SEC-ORCH-7)."""
+
+    @pytest.fixture
+    def temp_queue_file(self) -> Generator[Path, None, None]:
+        """Create a temporary file for queue persistence."""
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            temp_path = Path(f.name)
+        yield temp_path
+        if temp_path.exists():
+            temp_path.unlink()
+
+    @pytest.fixture
+    def queue_manager(self, temp_queue_file: Path) -> QueueManager:
+        """Create a queue manager with temporary storage."""
+        return QueueManager(queue_file=temp_queue_file)
+
+    def test_circuit_breaker_initialized(self, queue_manager: QueueManager) -> None:
+        """Test that circuit breaker is initialized with Coordinator."""
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(queue_manager=queue_manager)
+
+        assert coordinator.circuit_breaker is not None
+        assert coordinator.circuit_breaker.name == "coordinator_loop"
+
+    def test_circuit_breaker_custom_settings(self, queue_manager: QueueManager) -> None:
+        """Test circuit breaker with custom threshold and cooldown."""
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(
+            queue_manager=queue_manager,
+            circuit_breaker_threshold=3,
+            circuit_breaker_cooldown=15.0,
+        )
+
+        assert coordinator.circuit_breaker.failure_threshold == 3
+        assert coordinator.circuit_breaker.cooldown_seconds == 15.0
+
+    def test_get_circuit_breaker_stats(self, queue_manager: QueueManager) -> None:
+        """Test getting circuit breaker statistics."""
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(queue_manager=queue_manager)
+
+        stats = coordinator.get_circuit_breaker_stats()
+
+        assert "name" in stats
+        assert "state" in stats
+        assert "failure_count" in stats
+        assert "total_failures" in stats
+        assert stats["name"] == "coordinator_loop"
+        assert stats["state"] == "closed"
+
+    @pytest.mark.asyncio
+    async def test_circuit_breaker_opens_on_repeated_failures(
+        self, queue_manager: QueueManager
+    ) -> None:
+        """Test that circuit breaker opens after repeated failures."""
+        from src.circuit_breaker import CircuitState
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(
+            queue_manager=queue_manager,
+            poll_interval=0.02,
+            circuit_breaker_threshold=3,
+            circuit_breaker_cooldown=0.2,
+        )
+
+        failure_count = 0
+
+        async def failing_process_queue() -> None:
+            nonlocal failure_count
+            failure_count += 1
+            raise RuntimeError("Simulated failure")
+
+        coordinator.process_queue = failing_process_queue  # type: ignore[method-assign]
+
+        task = asyncio.create_task(coordinator.start())
+        await asyncio.sleep(0.15)  # Allow time for failures
+
+        # Circuit should be open after 3 failures
+        assert coordinator.circuit_breaker.state == CircuitState.OPEN
+        assert coordinator.circuit_breaker.failure_count >= 3
+
+        await coordinator.stop()
+        task.cancel()
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+
+    @pytest.mark.asyncio
+    async def test_circuit_breaker_backs_off_when_open(
+        self, queue_manager: QueueManager
+    ) -> None:
+        """Test that coordinator backs off when circuit breaker is open."""
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(
+            queue_manager=queue_manager,
+            poll_interval=0.02,
+            circuit_breaker_threshold=2,
+            circuit_breaker_cooldown=0.3,
+        )
+
+        call_timestamps: list[float] = []
+
+        async def failing_process_queue() -> None:
+            call_timestamps.append(asyncio.get_event_loop().time())
+            raise RuntimeError("Simulated failure")
+
+        coordinator.process_queue = failing_process_queue  # type: ignore[method-assign]
+
+        task = asyncio.create_task(coordinator.start())
+        await asyncio.sleep(0.5)  # Allow time for failures and backoff
+        await coordinator.stop()
+
+        task.cancel()
+        try:
+            await task
+        except asyncio.CancelledError:
+            pass
+
+        # Should have at least 2 calls (to trigger open), then back off
+        assert len(call_timestamps) >= 2
+
+        # After circuit opens, there should be a gap (cooldown)
+        if len(call_timestamps) >= 3:
+            # Check there's a larger gap after the first 2 calls
+            first_gap = call_timestamps[1] - call_timestamps[0]
+            later_gap = call_timestamps[2] - call_timestamps[1]
+            # Later gap should be larger due to cooldown
+            assert later_gap > first_gap * 2
+
+    @pytest.mark.asyncio
+    async def test_circuit_breaker_resets_on_success(
+        self, queue_manager: QueueManager
+    ) -> None:
+        """Test that circuit breaker resets after successful operation."""
+        from src.circuit_breaker import CircuitState
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(
+            queue_manager=queue_manager,
+            poll_interval=0.02,
+            circuit_breaker_threshold=3,
+        )
+
+        # Record failures then success
+        coordinator.circuit_breaker.record_failure()
+        coordinator.circuit_breaker.record_failure()
+        assert coordinator.circuit_breaker.failure_count == 2
+
+        coordinator.circuit_breaker.record_success()
+        assert coordinator.circuit_breaker.failure_count == 0
+        assert coordinator.circuit_breaker.state == CircuitState.CLOSED
+
+    @pytest.mark.asyncio
+    async def test_circuit_breaker_stats_logged_on_stop(
+        self, queue_manager: QueueManager
+    ) -> None:
+        """Test that circuit breaker stats are logged when coordinator stops."""
+        from src.coordinator import Coordinator
+
+        coordinator = Coordinator(queue_manager=queue_manager, poll_interval=0.05)
+
+        with patch("src.coordinator.logger") as mock_logger:
+            task = asyncio.create_task(coordinator.start())
+            await asyncio.sleep(0.1)
+            await coordinator.stop()
+
+            task.cancel()
+            try:
+                await task
+            except asyncio.CancelledError:
+                pass
+
+            # Should log circuit breaker stats on stop
+            info_calls = [str(call) for call in mock_logger.info.call_args_list]
+            assert any("circuit breaker" in call.lower() for call in info_calls)