- Add sanitize_for_prompt() function to security module
- Remove suspicious control characters (except whitespace)
- Detect and log common prompt injection patterns
- Escape dangerous XML-like tags used for prompt manipulation
- Truncate user content to max length (default 50000 chars)
- Integrate sanitization in parser before building LLM prompts
- Add comprehensive test suite (12 new tests)

Refs #338

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
160 lines
4.6 KiB
Python
160 lines
4.6 KiB
Python
"""Issue parser agent using Anthropic API."""
|
|
|
|
import json
|
|
import logging
|
|
from typing import Any
|
|
|
|
from anthropic import Anthropic
|
|
from anthropic.types import TextBlock
|
|
|
|
from .models import IssueMetadata
|
|
from .security import sanitize_for_prompt
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# In-memory cache: issue_number -> IssueMetadata.
# parse_issue_metadata() stores a result here only after a successful parse
# (fallback defaults returned on failure are not cached); clear_cache()
# empties it. Being a plain dict, it lives only as long as the process.
_parse_cache: dict[int, IssueMetadata] = {}
|
|
|
|
|
|
def clear_cache() -> None:
    """Empty the module-level parse cache in place (mainly useful in tests)."""
    _parse_cache.clear()
|
|
|
|
|
|
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present.

    Models sometimes wrap a JSON answer in a fenced block (three backticks,
    optionally followed by a language tag such as ``json``) even when asked
    for raw JSON. Text that does not start with a fence is returned with only
    its surrounding whitespace removed.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped

    # Drop the opening fence line (the backticks plus any language tag).
    _, _, remainder = stripped.partition("\n")
    remainder = remainder.rstrip()
    if remainder.endswith("```"):
        remainder = remainder[:-3]
    return remainder.strip()


def parse_issue_metadata(issue_body: str, issue_number: int) -> IssueMetadata:
    """
    Parse issue markdown body to extract structured metadata using Anthropic API.

    Successful results are stored in the module-level cache keyed by
    ``issue_number``. Failures are logged and answered with a default
    ``IssueMetadata()`` that is NOT cached, so a later call retries the API.

    Args:
        issue_body: Markdown content of the issue
        issue_number: Issue number for caching

    Returns:
        IssueMetadata with extracted fields or defaults on failure

    Example:
        >>> metadata = parse_issue_metadata(issue_body, 158)
        >>> metadata.difficulty
        'medium'
    """
    # Check cache first
    if issue_number in _parse_cache:
        logger.debug(f"Cache hit for issue #{issue_number}")
        return _parse_cache[issue_number]

    # Parse using Anthropic API
    try:
        # Imported inside the try block, so a failure to load settings is
        # handled like any other parse failure (logged, defaults returned).
        from .config import settings

        client = Anthropic(api_key=settings.anthropic_api_key)

        prompt = _build_parse_prompt(issue_body)

        response = client.messages.create(
            # Anthropic API model IDs are hyphen-separated; the dotted form
            # "claude-sonnet-4.5-20250929" is not a valid model ID and makes
            # every request fail.
            model="claude-sonnet-4-5-20250929",
            max_tokens=1024,
            temperature=0,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )

        # Extract JSON from response
        if not response.content:
            raise ValueError("Empty content in response")
        first_block = response.content[0]
        if not isinstance(first_block, TextBlock):
            raise ValueError("Expected TextBlock in response")
        # Tolerate a JSON answer wrapped in a Markdown code fence.
        response_text = _strip_code_fence(first_block.text)
        parsed_data = json.loads(response_text)

        # Log token usage
        logger.info(
            f"Parsed issue #{issue_number}",
            extra={
                "issue_number": issue_number,
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens,
            },
        )

        # The metadata builder reads fields with dict.get(); reject any other
        # JSON value (list, string, number) with a clear message.
        if not isinstance(parsed_data, dict):
            raise ValueError("Expected JSON object in response")

        # Create metadata with validation
        metadata = _create_metadata_from_parsed(parsed_data)

        # Cache the result
        _parse_cache[issue_number] = metadata

        return metadata

    except Exception as e:
        # Deliberate best-effort boundary: any failure (import, network, API,
        # JSON, validation) is logged with its traceback and mapped to defaults.
        logger.error(
            f"Failed to parse issue #{issue_number}: {e}",
            extra={"issue_number": issue_number, "error": str(e)},
            exc_info=True,
        )
        # Return defaults on failure
        return IssueMetadata()
|
|
|
|
|
|
def _build_parse_prompt(issue_body: str) -> str:
    """
    Assemble the metadata-extraction prompt sent to the Anthropic API.

    Args:
        issue_body: Raw issue markdown; it is passed through
            sanitize_for_prompt() before being embedded in the prompt

    Returns:
        The complete prompt text
    """
    # Issue text is untrusted: sanitize it before embedding to guard against
    # prompt injection.
    safe_body = sanitize_for_prompt(issue_body)

    # Doubled braces in the example line are f-string escapes for literal
    # "{" and "}".
    prompt = f"""Extract structured metadata from this GitHub/Gitea issue markdown.

Issue Body:
{safe_body}

Extract the following fields:
1. estimated_context: Total estimated tokens from "Context Estimate" section
(look for "Total estimated: X tokens")
2. difficulty: From "Difficulty" section (easy/medium/hard)
3. assigned_agent: From "Recommended agent" in Context Estimate section
(sonnet/haiku/opus/glm)
4. blocks: Issue numbers from "Dependencies" section after "Blocks:"
(extract #XXX numbers)
5. blocked_by: Issue numbers from "Dependencies" section after "Blocked by:"
(extract #XXX numbers)

Return ONLY a JSON object with these exact fields.
Use these defaults if fields are missing:
- estimated_context: 50000
- difficulty: "medium"
- assigned_agent: "sonnet"
- blocks: []
- blocked_by: []

Example output:
{{"estimated_context": 46800, "difficulty": "medium", "assigned_agent": "sonnet",
"blocks": [159], "blocked_by": [157]}}
"""
    return prompt
|
|
|
|
|
|
def _create_metadata_from_parsed(parsed_data: dict[str, Any]) -> IssueMetadata:
    """
    Build an IssueMetadata from the decoded JSON reply, filling in defaults.

    Args:
        parsed_data: Dictionary decoded from the model's JSON output

    Returns:
        IssueMetadata constructed from the supplied values, with the
        fallback used for every key that is absent
    """
    # Built per call so the list fallbacks are never shared between results.
    fallbacks: dict[str, Any] = {
        "estimated_context": 50000,
        "difficulty": "medium",
        "assigned_agent": "sonnet",
        "blocks": [],
        "blocked_by": [],
    }
    field_values = {
        key: parsed_data.get(key, fallback) for key, fallback in fallbacks.items()
    }
    return IssueMetadata(**field_values)
|