fix(#338): Log queue corruption and backup corrupted file

- Log ERROR when queue corruption detected with error details - Create timestamped backup before discarding corrupted data - Add comprehensive tests for corruption handling Refs #338 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 18:13:15 -06:00
parent 1852fe2812
commit 67c72a2d82
2 changed files with 181 additions and 3 deletions
--- a/apps/coordinator/src/queue.py
+++ b/apps/coordinator/src/queue.py
@@ -1,13 +1,18 @@
 """Queue manager for issue coordination."""

 import json
+import logging
+import shutil
 from dataclasses import dataclass, field
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any

 from src.models import IssueMetadata

+logger = logging.getLogger(__name__)
+

 class QueueItemStatus(str, Enum):
    """Status of a queue item."""
@@ -229,6 +234,40 @@ class QueueManager:

            # Update ready status after loading
            self._update_ready_status()
-        except (json.JSONDecodeError, KeyError, ValueError):
-            # If file is corrupted, start with empty queue
-            self._items = {}
+        except (json.JSONDecodeError, KeyError, ValueError) as e:
+            # Log corruption details and create backup before discarding
+            self._handle_corrupted_queue(e)
+
+    def _handle_corrupted_queue(self, error: Exception) -> None:
+        """Handle corrupted queue file by logging, backing up, and resetting.
+
+        Args:
+            error: The exception that was raised during loading
+        """
+        # Log error with details
+        logger.error(
+            "Queue file corruption detected in '%s': %s - %s",
+            self.queue_file,
+            type(error).__name__,
+            str(error),
+        )
+
+        # Create backup of corrupted file with timestamp
+        if self.queue_file.exists():
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            backup_path = self.queue_file.with_suffix(f".corrupted.{timestamp}.json")
+            try:
+                shutil.copy2(self.queue_file, backup_path)
+                logger.error(
+                    "Corrupted queue file backed up to '%s'",
+                    backup_path,
+                )
+            except OSError as backup_error:
+                logger.error(
+                    "Failed to create backup of corrupted queue file: %s",
+                    backup_error,
+                )
+
+        # Reset to empty queue
+        self._items = {}
+        logger.error("Queue reset to empty state after corruption")
--- a/apps/coordinator/tests/test_queue.py
+++ b/apps/coordinator/tests/test_queue.py
@@ -474,3 +474,142 @@ class TestQueueManager:
        item = queue_manager.get_item(159)
        assert item is not None
        assert item.status == QueueItemStatus.COMPLETED
+
+
+class TestQueueCorruptionHandling:
+    """Tests for queue file corruption handling."""
+
+    @pytest.fixture
+    def temp_queue_file(self) -> Generator[Path, None, None]:
+        """Create a temporary file for queue persistence."""
+        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
+            temp_path = Path(f.name)
+        yield temp_path
+        # Cleanup - remove main file and any backup files
+        if temp_path.exists():
+            temp_path.unlink()
+        # Clean up backup files
+        for backup in temp_path.parent.glob(f"{temp_path.stem}.corrupted.*.json"):
+            backup.unlink()
+
+    def test_corrupted_json_logs_error_and_creates_backup(
+        self, temp_queue_file: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Test that corrupted JSON file triggers logging and backup creation."""
+        # Write invalid JSON to the file
+        with open(temp_queue_file, "w") as f:
+            f.write("{ invalid json content }")
+
+        import logging
+
+        with caplog.at_level(logging.ERROR):
+            queue_manager = QueueManager(queue_file=temp_queue_file)
+
+        # Verify queue is empty after corruption
+        assert queue_manager.size() == 0
+
+        # Verify error was logged
+        assert "Queue file corruption detected" in caplog.text
+        assert "JSONDecodeError" in caplog.text
+
+        # Verify backup file was created
+        backup_files = list(temp_queue_file.parent.glob(f"{temp_queue_file.stem}.corrupted.*.json"))
+        assert len(backup_files) == 1
+        assert "Corrupted queue file backed up" in caplog.text
+
+        # Verify backup contains original corrupted content
+        with open(backup_files[0]) as f:
+            backup_content = f.read()
+        assert "invalid json content" in backup_content
+
+    def test_corrupted_structure_logs_error_and_creates_backup(
+        self, temp_queue_file: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Test that valid JSON with invalid structure triggers logging and backup."""
+        # Write valid JSON but with missing required fields
+        with open(temp_queue_file, "w") as f:
+            json.dump(
+                {
+                    "items": [
+                        {
+                            "issue_number": 159,
+                            # Missing "status", "ready", "metadata" fields
+                        }
+                    ]
+                },
+                f,
+            )
+
+        import logging
+
+        with caplog.at_level(logging.ERROR):
+            queue_manager = QueueManager(queue_file=temp_queue_file)
+
+        # Verify queue is empty after corruption
+        assert queue_manager.size() == 0
+
+        # Verify error was logged (KeyError for missing fields)
+        assert "Queue file corruption detected" in caplog.text
+        assert "KeyError" in caplog.text
+
+        # Verify backup file was created
+        backup_files = list(temp_queue_file.parent.glob(f"{temp_queue_file.stem}.corrupted.*.json"))
+        assert len(backup_files) == 1
+
+    def test_invalid_status_value_logs_error_and_creates_backup(
+        self, temp_queue_file: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Test that invalid enum value triggers logging and backup."""
+        # Write valid JSON but with invalid status enum value
+        with open(temp_queue_file, "w") as f:
+            json.dump(
+                {
+                    "items": [
+                        {
+                            "issue_number": 159,
+                            "status": "invalid_status",
+                            "ready": True,
+                            "metadata": {
+                                "estimated_context": 50000,
+                                "difficulty": "medium",
+                                "assigned_agent": "sonnet",
+                                "blocks": [],
+                                "blocked_by": [],
+                            },
+                        }
+                    ]
+                },
+                f,
+            )
+
+        import logging
+
+        with caplog.at_level(logging.ERROR):
+            queue_manager = QueueManager(queue_file=temp_queue_file)
+
+        # Verify queue is empty after corruption
+        assert queue_manager.size() == 0
+
+        # Verify error was logged (ValueError for invalid enum)
+        assert "Queue file corruption detected" in caplog.text
+        assert "ValueError" in caplog.text
+
+        # Verify backup file was created
+        backup_files = list(temp_queue_file.parent.glob(f"{temp_queue_file.stem}.corrupted.*.json"))
+        assert len(backup_files) == 1
+
+    def test_queue_reset_logged_after_corruption(
+        self, temp_queue_file: Path, caplog: pytest.LogCaptureFixture
+    ) -> None:
+        """Test that queue reset is logged after handling corruption."""
+        # Write invalid JSON
+        with open(temp_queue_file, "w") as f:
+            f.write("not valid json")
+
+        import logging
+
+        with caplog.at_level(logging.ERROR):
+            QueueManager(queue_file=temp_queue_file)
+
+        # Verify the reset was logged
+        assert "Queue reset to empty state after corruption" in caplog.text