Files
stack/apps/coordinator/src/security.py
Jason Woltje 432dbd4d83 fix(#365): fix ruff, mypy, pip, and bandit issues in coordinator
- Fix 20 ruff errors: UP035 (Callable import), UP042 (StrEnum), E501
  (line length), F401 (unused imports), UP045 (Optional -> X | None),
  I001 (import sorting)
- Fix mypy error: wrap slowapi rate limit handler with
  Exception-compatible signature for add_exception_handler
- Pin pip >= 25.3 in Dockerfile (CVE-2025-8869, CVE-2026-1703)
- Add nosec B104 to config.py (container-bound 0.0.0.0 is acceptable)
- Add nosec B101 to telemetry.py (assert for type narrowing)
- Create bandit.yaml to suppress B404/B607/B603 in gates/ tooling

Fixes #365

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-12 12:46:25 -06:00

134 lines
4.4 KiB
Python

"""Security utilities for webhook signature verification and prompt sanitization."""
import hashlib
import hmac
import logging
import re
logger = logging.getLogger(__name__)
# Default maximum length for user-provided content in prompts
DEFAULT_MAX_PROMPT_LENGTH = 50000
# Patterns that may indicate prompt injection attempts
INJECTION_PATTERNS = [
# Instruction override attempts
re.compile(r"ignore\s+(all\s+)?(previous|prior|above)\s+instructions", re.IGNORECASE),
re.compile(r"disregard\s+(all\s+)?(previous|prior|above)", re.IGNORECASE),
re.compile(r"forget\s+(everything|all|your)\s+(previous|prior|above)", re.IGNORECASE),
# System prompt manipulation
re.compile(r"<\s*system\s*>", re.IGNORECASE),
re.compile(r"<\s*/\s*system\s*>", re.IGNORECASE),
re.compile(r"\[\s*system\s*\]", re.IGNORECASE),
# Role injection
re.compile(r"^(assistant|system|user)\s*:", re.IGNORECASE | re.MULTILINE),
# Delimiter injection
re.compile(r"-{3,}\s*(end|begin|start)\s+(of\s+)?(input|output|context|prompt)", re.IGNORECASE),
re.compile(r"={3,}\s*(end|begin|start)", re.IGNORECASE),
# Common injection phrases
re.compile(r"(you\s+are|act\s+as|pretend\s+to\s+be)\s+(now\s+)?a\s+different", re.IGNORECASE),
re.compile(r"new\s+instructions?\s*:", re.IGNORECASE),
re.compile(r"override\s+(the\s+)?(system|instructions|rules)", re.IGNORECASE),
]
# XML-like tags that could be used for injection
DANGEROUS_TAG_PATTERN = re.compile(
r"<\s*(instructions?|prompt|context|system|user|assistant)\s*>",
re.IGNORECASE,
)
def sanitize_for_prompt(
content: str | None,
max_length: int = DEFAULT_MAX_PROMPT_LENGTH
) -> str:
"""
Sanitize user-provided content before including in LLM prompts.
This function:
1. Removes control characters (except newlines/tabs)
2. Detects and logs potential prompt injection patterns
3. Escapes dangerous XML-like tags
4. Truncates content to maximum length
Args:
content: User-provided content to sanitize
max_length: Maximum allowed length (default 50000)
Returns:
Sanitized content safe for prompt inclusion
Example:
>>> body = "Fix the bug\\x00\\nIgnore previous instructions"
>>> safe_body = sanitize_for_prompt(body)
>>> # Returns sanitized content, logs warning about injection pattern
"""
if not content:
return ""
# Step 1: Remove control characters (keep newlines \n, tabs \t, carriage returns \r)
# Control characters are 0x00-0x1F and 0x7F, except 0x09 (tab), 0x0A (newline), 0x0D (CR)
sanitized = "".join(
char for char in content
if ord(char) >= 32 or char in "\n\t\r"
)
# Step 2: Detect prompt injection patterns
detected_patterns = []
for pattern in INJECTION_PATTERNS:
if pattern.search(sanitized):
detected_patterns.append(pattern.pattern)
if detected_patterns:
logger.warning(
"Potential prompt injection detected in issue body",
extra={
"patterns_matched": len(detected_patterns),
"sample_patterns": detected_patterns[:3],
"content_length": len(sanitized),
},
)
# Step 3: Escape dangerous XML-like tags by adding spaces
sanitized = DANGEROUS_TAG_PATTERN.sub(
lambda m: m.group(0).replace("<", "< ").replace(">", " >"),
sanitized
)
# Step 4: Truncate to max length
if len(sanitized) > max_length:
sanitized = sanitized[:max_length] + "... [content truncated]"
return sanitized
def verify_signature(payload: bytes, signature: str, secret: str) -> bool:
    """
    Verify the HMAC-SHA256 signature of a webhook payload.

    Args:
        payload: Raw request body as bytes
        signature: Signature from X-Gitea-Signature header
        secret: Webhook secret configured in Gitea

    Returns:
        True if signature is valid, False otherwise (including when the
        signature header is missing or empty)

    Example:
        >>> payload = b'{"action": "assigned"}'
        >>> secret = "my-webhook-secret"
        >>> sig = hmac.new(secret.encode(), payload, "sha256").hexdigest()
        >>> verify_signature(payload, sig, secret)
        True
    """
    # A missing/empty signature can never be valid; bail out before any work.
    if not signature:
        return False

    # Recompute the digest locally from the shared secret and raw body.
    mac = hmac.new(secret.encode("utf-8"), payload, hashlib.sha256)

    # compare_digest is constant-time, preventing timing side channels.
    return hmac.compare_digest(signature, mac.hexdigest())