Files
stack/apps/coordinator/src/parser.py
Jason Woltje 442f8e0971 fix(#338): Sanitize issue body for prompt injection
- Add sanitize_for_prompt() function to security module
- Remove suspicious control characters (except whitespace)
- Detect and log common prompt injection patterns
- Escape dangerous XML-like tags used for prompt manipulation
- Truncate user content to max length (default 50000 chars)
- Integrate sanitization in parser before building LLM prompts
- Add comprehensive test suite (12 new tests)

Refs #338

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 18:36:16 -06:00

160 lines
4.6 KiB
Python

"""Issue parser agent using Anthropic API."""
import json
import logging
from typing import Any
from anthropic import Anthropic
from anthropic.types import TextBlock
from .models import IssueMetadata
from .security import sanitize_for_prompt
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# In-memory cache: issue_number -> IssueMetadata
# Written by parse_issue_metadata() after a successful parse and emptied by
# clear_cache(). Entries are keyed by issue number only, so a cached result is
# returned even if the issue body passed on a later call differs.
_parse_cache: dict[int, IssueMetadata] = {}
def clear_cache() -> None:
    """Drop every entry from the module-level parse cache (mainly a test hook)."""
    _parse_cache.clear()
def _strip_code_fence(text: str) -> str:
    """
    Remove a surrounding Markdown code fence from a model reply, if present.

    Args:
        text: Raw text returned by the model

    Returns:
        The whitespace-stripped text, without the opening fence line
        (which may carry a language tag such as "json") and without the
        closing fence when the reply was fenced
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Everything up to the first newline is the opening fence line.
    _, _, remainder = stripped.partition("\n")
    remainder = remainder.rstrip()
    if remainder.endswith("```"):
        remainder = remainder[:-3]
    return remainder.strip()


def parse_issue_metadata(issue_body: str, issue_number: int) -> IssueMetadata:
    """
    Parse issue markdown body to extract structured metadata using Anthropic API.

    Successful results are cached in memory by issue number; a cache hit
    returns the stored metadata without calling the API. Failures are not
    cached, so a later call for the same issue retries the API.

    Args:
        issue_body: Markdown content of the issue
        issue_number: Issue number for caching

    Returns:
        IssueMetadata with extracted fields or defaults on failure

    Example:
        >>> metadata = parse_issue_metadata(issue_body, 158)
        >>> print(metadata.difficulty)
        medium
    """
    # Check cache first
    if issue_number in _parse_cache:
        logger.debug(f"Cache hit for issue #{issue_number}")
        return _parse_cache[issue_number]

    # Parse using Anthropic API
    try:
        # Imported lazily so a configuration problem is handled by the
        # fallback below instead of breaking module import.
        from .config import settings

        client = Anthropic(api_key=settings.anthropic_api_key)
        prompt = _build_parse_prompt(issue_body)
        response = client.messages.create(
            # Anthropic model IDs use hyphens throughout ("4-5", not "4.5").
            model="claude-sonnet-4-5-20250929",
            max_tokens=1024,
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )

        # Extract JSON from response
        if not response.content:
            raise ValueError("Empty response content")
        first_block = response.content[0]
        if not isinstance(first_block, TextBlock):
            raise ValueError("Expected TextBlock in response")

        # The model may wrap its JSON in a Markdown code fence despite the
        # "Return ONLY a JSON object" instruction; tolerate that.
        response_text = _strip_code_fence(first_block.text)
        parsed_data = json.loads(response_text)
        if not isinstance(parsed_data, dict):
            raise ValueError("Expected JSON object in response")

        # Log token usage
        logger.info(
            f"Parsed issue #{issue_number}",
            extra={
                "issue_number": issue_number,
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens,
            },
        )

        # Create metadata with validation
        metadata = _create_metadata_from_parsed(parsed_data)

        # Cache the result
        _parse_cache[issue_number] = metadata
        return metadata
    except Exception as e:
        # Deliberate best-effort boundary: any failure (configuration, API,
        # malformed reply, validation) is logged and replaced by defaults.
        logger.error(
            f"Failed to parse issue #{issue_number}: {e}",
            extra={"issue_number": issue_number, "error": str(e)},
            exc_info=True,
        )
        # Return defaults on failure
        return IssueMetadata()
def _build_parse_prompt(issue_body: str) -> str:
    """
    Compose the metadata-extraction prompt sent to the Anthropic API.

    Args:
        issue_body: Issue markdown content; it is passed through
            sanitize_for_prompt() before being embedded in the prompt

    Returns:
        Prompt text containing the sanitized issue body, the list of
        fields to extract, the fallback defaults, and an example reply
    """
    # Issue text is untrusted input: sanitize it before it reaches the prompt
    # to guard against prompt injection.
    sanitized_body = sanitize_for_prompt(issue_body)
    prompt = f"""Extract structured metadata from this GitHub/Gitea issue markdown.
Issue Body:
{sanitized_body}
Extract the following fields:
1. estimated_context: Total estimated tokens from "Context Estimate" section
(look for "Total estimated: X tokens")
2. difficulty: From "Difficulty" section (easy/medium/hard)
3. assigned_agent: From "Recommended agent" in Context Estimate section
(sonnet/haiku/opus/glm)
4. blocks: Issue numbers from "Dependencies" section after "Blocks:"
(extract #XXX numbers)
5. blocked_by: Issue numbers from "Dependencies" section after "Blocked by:"
(extract #XXX numbers)
Return ONLY a JSON object with these exact fields.
Use these defaults if fields are missing:
- estimated_context: 50000
- difficulty: "medium"
- assigned_agent: "sonnet"
- blocks: []
- blocked_by: []
Example output:
{{"estimated_context": 46800, "difficulty": "medium", "assigned_agent": "sonnet",
"blocks": [159], "blocked_by": [157]}}
"""
    return prompt
def _create_metadata_from_parsed(parsed_data: dict[str, Any]) -> IssueMetadata:
    """
    Create IssueMetadata from parsed JSON data, filling in defaults.

    A field falls back to its default both when the key is absent and when
    its value is None (JSON null), so a reply such as {"blocks": null} does
    not push None into IssueMetadata.

    Args:
        parsed_data: Dictionary from parsed JSON

    Returns:
        IssueMetadata built from the parsed values and defaults
    """
    # Built per call so the list defaults are never shared between instances.
    defaults: dict[str, Any] = {
        "estimated_context": 50000,
        "difficulty": "medium",
        "assigned_agent": "sonnet",
        "blocks": [],
        "blocked_by": [],
    }
    fields: dict[str, Any] = {}
    for name, default in defaults.items():
        value = parsed_data.get(name)
        # dict.get(name, default) would return an explicit None unchanged;
        # treat it the same as a missing key.
        fields[name] = default if value is None else value
    return IssueMetadata(**fields)