All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
- Add input sanitization to prevent LLM prompt injection (escapes quotes, backslashes, replaces newlines)
- Add MaxLength(500) validation to DTO to prevent DoS
- Add entity validation to filter malicious LLM responses
- Add confidence validation to clamp values to 0.0-1.0
- Make LLM model configurable via INTENT_CLASSIFICATION_MODEL env var
- Add 12 new security tests (total: 72 tests, from 60)

Security fixes identified by code review:
- CVE-mitigated: Prompt injection via unescaped user input
- CVE-mitigated: Unvalidated entity data from LLM response
- CVE-mitigated: Missing input length validation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
589 lines
18 KiB
TypeScript
589 lines
18 KiB
TypeScript
import { Injectable, Optional, Logger } from "@nestjs/common";
|
|
import { LlmService } from "../llm/llm.service";
|
|
import type {
|
|
IntentType,
|
|
IntentClassification,
|
|
IntentPattern,
|
|
ExtractedEntity,
|
|
} from "./interfaces";
|
|
|
|
/**
 * Allow-list of entity `type` values accepted from an LLM response.
 * Frozen so the validation list cannot be widened at runtime by accident.
 */
const VALID_ENTITY_TYPES: readonly string[] = Object.freeze([
  "date",
  "time",
  "person",
  "project",
  "priority",
  "status",
  "text",
]);
|
|
|
|
/**
|
|
* Intent Classification Service
|
|
*
|
|
* Classifies natural language queries into structured intents using a hybrid approach:
|
|
* 1. Rule-based classification (fast, <100ms) - regex patterns for common phrases
|
|
* 2. LLM fallback (optional) - for ambiguous queries or when explicitly requested
|
|
*
|
|
* @example
|
|
* ```typescript
|
|
* // Rule-based classification (default)
|
|
* const result = await service.classify("show my tasks");
|
|
* // { intent: "query_tasks", confidence: 0.9, method: "rule", ... }
|
|
*
|
|
* // Force LLM classification
|
|
* const result = await service.classify("show my tasks", true);
|
|
* // { intent: "query_tasks", confidence: 0.95, method: "llm", ... }
|
|
* ```
|
|
*/
|
|
@Injectable()
|
|
export class IntentClassificationService {
|
|
/** Logger tagged with this service's class name. */
private readonly logger = new Logger(IntentClassificationService.name);
/** Intent pattern table; assigned once in the constructor from buildPatterns(). */
private readonly patterns: IntentPattern[];
/** Rule results with a confidence below this value are handed to the LLM when one is available. */
private readonly RULE_CONFIDENCE_THRESHOLD = 0.7;

/** Configurable LLM model for intent classification */
private readonly intentModel =
  // eslint-disable-next-line @typescript-eslint/dot-notation -- env vars use bracket notation
  process.env["INTENT_CLASSIFICATION_MODEL"] ?? "llama3.2";
/**
 * Configurable temperature (low for consistent results).
 *
 * Read from INTENT_CLASSIFICATION_TEMPERATURE. When the variable is unset, or is set
 * to something that does not parse to a finite number (e.g. "" or "abc"), the default
 * of 0.1 is used so that NaN is never forwarded to the LLM.
 */
private readonly intentTemperature = ((): number => {
  const fallback = 0.1;
  // eslint-disable-next-line @typescript-eslint/dot-notation -- env vars use bracket notation
  const raw = process.env["INTENT_CLASSIFICATION_TEMPERATURE"];
  if (raw === undefined) {
    return fallback;
  }

  const parsed = parseFloat(raw);
  if (Number.isFinite(parsed)) {
    return parsed;
  }

  // `logger` is declared above, so it is already initialised when this runs.
  this.logger.warn(
    `Invalid INTENT_CLASSIFICATION_TEMPERATURE "${raw}", using default ${String(fallback)}`
  );
  return fallback;
})();
|
|
|
|
/**
 * Builds the intent pattern table and logs that the service is ready.
 *
 * @param llmService - Optional LLM backend. When it is absent, classifyWithLlm()
 *   falls back to rule-based classification.
 */
constructor(@Optional() private readonly llmService?: LlmService) {
  const compiledPatterns = this.buildPatterns();
  this.patterns = compiledPatterns;

  this.logger.log("Intent classification service initialized");
}
|
|
|
|
/**
 * Entry point: turn a natural language query into an intent.
 *
 * The rule engine always runs first. Its answer is returned unless the caller
 * forces the LLM, or the rule confidence is below RULE_CONFIDENCE_THRESHOLD
 * and an LLM service was injected.
 *
 * @param query - Natural language query to classify
 * @param useLlm - Force LLM classification (default: false)
 * @returns Intent classification result; a blank query yields "unknown" with confidence 0
 */
async classify(query: string, useLlm = false): Promise<IntentClassification> {
  const isBlank = !query || query.trim().length === 0;
  if (isBlank) {
    return { intent: "unknown", confidence: 0, entities: [], method: "rule", query };
  }

  // Rules are cheap, so they are evaluated even when the LLM is forced.
  const ruleBased = this.classifyWithRules(query);

  // 1. The caller explicitly asked for the LLM.
  if (useLlm) {
    return this.classifyWithLlm(query);
  }

  // 2. The rules were not confident enough and an LLM is available.
  const ruleIsWeak = ruleBased.confidence < this.RULE_CONFIDENCE_THRESHOLD;
  if (ruleIsWeak && this.llmService) {
    return this.classifyWithLlm(query);
  }

  return ruleBased;
}
|
|
|
|
/**
 * Rule-based classification via regex pattern matching.
 *
 * Pattern groups are tried from highest to lowest priority; the first group
 * with any matching regex decides the intent (confidence 0.9). Matching is
 * done against the lower-cased, trimmed query, while entities are extracted
 * from the query as given.
 *
 * @param query - Natural language query to classify
 * @returns Intent classification result; "unknown" with confidence 0 for a blank
 *   query and 0.2 when no pattern matches
 */
classifyWithRules(query: string): IntentClassification {
  const unknownWith = (confidence: number): IntentClassification => ({
    intent: "unknown",
    confidence,
    entities: [],
    method: "rule",
    query,
  });

  if (!query || query.trim().length === 0) {
    return unknownWith(0);
  }

  const needle = query.toLowerCase().trim();

  // Copy before sorting so the shared pattern table is never reordered in place.
  const byPriorityDesc = [...this.patterns].sort((a, b) => b.priority - a.priority);

  const winner = byPriorityDesc.find((group) =>
    group.patterns.some((regex) => regex.test(needle))
  );

  if (winner === undefined) {
    // No pattern matched
    return unknownWith(0.2);
  }

  return {
    intent: winner.intent,
    confidence: 0.9, // High confidence for direct pattern match
    entities: this.extractEntities(query),
    method: "rule",
    query,
  };
}
|
|
|
|
/**
 * LLM-backed classification.
 *
 * Sends a system instruction plus the generated prompt to the LLM service using
 * the configured model and temperature, then parses the reply. Without an LLM
 * service this delegates to the rule engine; if the call throws, the error is
 * logged and an "unknown" result with confidence 0 is returned.
 *
 * @param query - Natural language query to classify
 * @returns Intent classification result
 */
async classifyWithLlm(query: string): Promise<IntentClassification> {
  const llm = this.llmService;
  if (!llm) {
    this.logger.warn("LLM service not available, falling back to rule-based classification");
    return this.classifyWithRules(query);
  }

  try {
    const userPrompt = this.buildLlmPrompt(query);

    const reply = await llm.chat({
      messages: [
        {
          role: "system",
          content: "You are an intent classification assistant. Respond only with valid JSON.",
        },
        { role: "user", content: userPrompt },
      ],
      model: this.intentModel,
      temperature: this.intentTemperature,
    });

    return this.parseLlmResponse(reply.message.content, query);
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    this.logger.error(`LLM classification failed: ${reason}`);

    return { intent: "unknown", confidence: 0, entities: [], method: "llm", query };
  }
}
|
|
|
|
/**
 * Extract entities from a query.
 *
 * Recognises, in this output order: dates (today/tomorrow/yesterday, weekday
 * names, "next|this week|month|year", numeric d/m[/y]), times ("3:30 pm",
 * "3pm", "at 3"), priorities (HIGH/MEDIUM/LOW), statuses
 * (DONE/IN_PROGRESS/PENDING/BLOCKED/CANCELLED) and @mentions.
 *
 * Time patterns are ordered from most to least specific, and a time match that
 * overlaps an already accepted time entity is dropped. Without this, "3:30 pm"
 * would also produce a bogus "30 pm", and "at 3 pm" would produce both "3 pm"
 * and "at 3".
 *
 * @param query - Query to extract entities from
 * @returns Array of extracted entities; `start`/`end` are offsets into `query`
 *   (`end` exclusive)
 */
extractEntities(query: string): ExtractedEntity[] {
  type EntityRule = {
    type: ExtractedEntity["type"];
    pattern: RegExp;
    /** Maps a regex match to the entity value; a falsy result discards the match. */
    toValue: (match: RegExpExecArray) => string | undefined;
  };

  const lowerCased = (match: RegExpExecArray): string => match[0].toLowerCase();
  const verbatim = (match: RegExpExecArray): string => match[0];
  const fixed =
    (value: string) =>
    (): string =>
      value;

  // The regexes are created per call, so the stateful `lastIndex` of the
  // global flag never leaks between invocations.
  /* eslint-disable security/detect-unsafe-regex */
  const rules: EntityRule[] = [
    // Date patterns
    { type: "date", pattern: /\b(today|tomorrow|yesterday)\b/gi, toValue: lowerCased },
    {
      type: "date",
      pattern: /\b(monday|tuesday|wednesday|thursday|friday|saturday|sunday)\b/gi,
      toValue: lowerCased,
    },
    { type: "date", pattern: /\b(next|this)\s+(week|month|year)\b/gi, toValue: lowerCased },
    { type: "date", pattern: /\b(\d{1,2})[/-](\d{1,2})([/-](\d{2,4}))?\b/g, toValue: verbatim },

    // Time patterns (most specific first; see overlap handling below)
    { type: "time", pattern: /\b(\d{1,2}):(\d{2})\s*(am|pm)?\b/gi, toValue: lowerCased },
    { type: "time", pattern: /\b(\d{1,2})\s*(am|pm)\b/gi, toValue: lowerCased },
    { type: "time", pattern: /\bat\s+(\d{1,2})\b/gi, toValue: lowerCased },

    // Priority patterns
    { type: "priority", pattern: /\b(high|urgent|critical)\s*priority\b/gi, toValue: fixed("HIGH") },
    { type: "priority", pattern: /\b(medium|normal)\s*priority\b/gi, toValue: fixed("MEDIUM") },
    { type: "priority", pattern: /\b(low|minor)\s*priority\b/gi, toValue: fixed("LOW") },

    // Status patterns
    { type: "status", pattern: /\b(done|complete|finished|completed)\b/gi, toValue: fixed("DONE") },
    {
      type: "status",
      pattern: /\b(in\s*progress|working\s*on|ongoing)\b/gi,
      toValue: fixed("IN_PROGRESS"),
    },
    { type: "status", pattern: /\b(pending|todo|not\s*started)\b/gi, toValue: fixed("PENDING") },
    { type: "status", pattern: /\b(blocked|stuck)\b/gi, toValue: fixed("BLOCKED") },
    { type: "status", pattern: /\b(cancelled|canceled)\b/gi, toValue: fixed("CANCELLED") },

    // Person patterns (mentions): the value is the name without the leading "@"
    { type: "person", pattern: /@(\w+)/g, toValue: (match) => match[1] },
  ];
  /* eslint-enable security/detect-unsafe-regex */

  const entities: ExtractedEntity[] = [];

  for (const { type: entityType, pattern, toValue } of rules) {
    let match: RegExpExecArray | null;
    while ((match = pattern.exec(query)) !== null) {
      const value = toValue(match);
      if (!value) {
        continue;
      }

      const start = match.index;
      const end = start + match[0].length;

      // A looser time pattern must not re-report text that a stricter one already captured.
      const overlapsAcceptedTime =
        entityType === "time" &&
        entities.some(
          (existing) => existing.type === "time" && start < existing.end && end > existing.start
        );
      if (overlapsAcceptedTime) {
        continue;
      }

      entities.push({ type: entityType, value, raw: match[0], start, end });
    }
  }

  return entities;
}
|
|
|
|
/**
 * Assemble the regex table used for rule-based intent matching.
 *
 * Every entry pairs an intent with its regexes and a numeric priority
 * (a higher number is checked first by classifyWithRules()).
 *
 * @returns Array of intent patterns
 */
private buildPatterns(): IntentPattern[] {
  const entry = (intent: IntentType, priority: number, ...patterns: RegExp[]): IntentPattern => ({
    intent,
    patterns,
    priority,
  });

  /* eslint-disable security/detect-unsafe-regex */
  const table: IntentPattern[] = [
    // Briefing: the most specific intent, so it gets the top priority
    entry(
      "briefing",
      10,
      /\b(morning|daily|today'?s?)\s+(briefing|summary|overview)\b/i,
      /\bwhat'?s?\s+(my|the)\s+day\s+look\s+like\b/i,
      /\bgive\s+me\s+(a\s+)?(rundown|summary)\b/i
    ),

    // Creation requests: specific actions, high priority
    entry(
      "create_task",
      9,
      /\b(add|create|new|make)\s+(a\s+)?(task|to-?do)\b/i,
      /\bremind\s+me\s+to\b/i,
      /\bI\s+need\s+to\b/i
    ),
    entry(
      "create_event",
      9,
      /\b(schedule|create|add|book)\s+(a\s+|an\s+)?(meeting|event|appointment|call)\b/i,
      /\bset\s+up\s+(a\s+)?(meeting|call)\b/i
    ),

    // Updates to existing items
    entry(
      "update_task",
      8,
      /\b(mark|set|update|change)\s+(task|to-?do)\s+(as\s+)?(done|complete|status|priority)\b/i,
      /\bcomplete\s+(the\s+)?(task|to-?do)\b/i,
      /\b(finish|done\s+with)\s+(the\s+)?(task|to-?do)\b/i,
      /\bcomplete\s+\w+\s+\w+\s+(task|to-?do)\b/i, // "complete the review task"
      /\bcomplete\s+[\w\s]{1,30}(task|to-?do)\b/i // More flexible but bounded
    ),
    entry(
      "update_event",
      8,
      /\b(reschedule|move|change|cancel|update)\s+(the\s+)?(meeting|event|appointment|call|standup)\b/i,
      /\bmove\s+(event|meeting)\s+to\b/i,
      /\bcancel\s+(the\s+)?(meeting|event|standup|call)\b/i
    ),

    // Read-only queries
    entry(
      "query_tasks",
      8,
      /\b(show|list|get|what|display)\s+((my|all|the)\s+)?tasks?\b/i,
      /\bwhat\s+(tasks?|to-?dos?)\s+(do\s+I|have)\b/i,
      /\b(pending|overdue|upcoming|active)\s+tasks?\b/i
    ),
    entry(
      "query_events",
      8,
      /\b(show|list|get|display)\s+((my|all|the)\s+)?(calendar|events?|meetings?|schedule)\b/i,
      /\bwhat'?s?\s+(on\s+)?(my\s+)?(calendar|schedule)\b/i,
      /\b(upcoming|next|today'?s?)\s+(events?|meetings?)\b/i
    ),
    entry(
      "query_projects",
      8,
      /\b(show|list|get|display|what)\s+((my|all|the)\s+)?projects?\b/i,
      /\bwhat\s+projects?\s+(do\s+I|have)\b/i,
      /\b(active|ongoing)\s+projects?\b/i
    ),

    // Search: the most general intent, so it is tried last
    entry("search", 6, /\b(find|search|look\s*for|locate)\b/i),
  ];
  /* eslint-enable security/detect-unsafe-regex */

  return table;
}
|
|
|
|
/**
 * Sanitize user query for safe inclusion in LLM prompt.
 *
 * Escapes backslashes and double quotes, replaces CR/LF with spaces, and caps
 * the result at 500 characters so the query cannot break out of the quoted
 * `Query: "..."` slot of the prompt.
 *
 * Truncation happens after escaping, so the cut can land inside an escape pair
 * (`\\` or `\"`) or inside a UTF-16 surrogate pair. Both are repaired: a
 * dangling backslash would otherwise escape the prompt's closing quote. The
 * returned string may therefore be one character shorter than the limit.
 *
 * @param query - Raw user query
 * @returns Sanitized query safe for LLM prompt (at most 500 chars)
 */
private sanitizeQueryForPrompt(query: string): string {
  // Backslashes must be escaped first so the backslashes added for quotes are not doubled.
  const escaped = query
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\n/g, " ")
    .replace(/\r/g, " ");

  // Limit length to prevent prompt overflow (500 chars max)
  const maxLength = 500;
  if (escaped.length <= maxLength) {
    return escaped;
  }

  this.logger.warn(
    `Query truncated from ${String(escaped.length)} to ${String(maxLength)} chars`
  );

  let truncated = escaped.slice(0, maxLength);

  // Do not leave half of a surrogate pair at the end.
  const lastCodeUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastCodeUnit >= 0xd800 && lastCodeUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }

  // In the escaped text, complete escapes contribute backslashes in pairs (`\\`)
  // or directly before a quote (`\"`). An odd run of backslashes at the very end
  // therefore means the cut split an escape; drop the orphaned backslash.
  let trailingBackslashes = 0;
  while (
    trailingBackslashes < truncated.length &&
    truncated.charAt(truncated.length - 1 - trailingBackslashes) === "\\"
  ) {
    trailingBackslashes += 1;
  }
  if (trailingBackslashes % 2 === 1) {
    truncated = truncated.slice(0, -1);
  }

  return truncated;
}
|
|
|
|
/**
 * Compose the classification prompt sent to the LLM.
 *
 * The prompt has four sections separated by blank lines: the intent catalogue,
 * the entity extraction instruction, the sanitized user query in double quotes,
 * and the JSON response format the model must follow.
 *
 * @param query - User query to classify
 * @returns Formatted prompt
 */
private buildLlmPrompt(query: string): string {
  const safeQuery = this.sanitizeQueryForPrompt(query);

  const intentCatalogue = `Classify the following user query into one of these intents:
- query_tasks: User wants to see their tasks
- query_events: User wants to see their calendar/events
- query_projects: User wants to see their projects
- create_task: User wants to create a new task
- create_event: User wants to schedule a new event
- update_task: User wants to update an existing task
- update_event: User wants to update/reschedule an event
- briefing: User wants a daily briefing/summary
- search: User wants to search for something
- unknown: Query doesn't match any intent`;

  const entityInstruction =
    "Also extract any entities (dates, times, priorities, statuses, people).";

  const queryLine = `Query: "${safeQuery}"`;

  const responseFormat = `Respond with ONLY this JSON format (no other text):
{
"intent": "<intent_type>",
"confidence": <0.0-1.0>,
"entities": [
{
"type": "<date|time|person|project|priority|status|text>",
"value": "<normalized_value>",
"raw": "<original_text>",
"start": <position>,
"end": <position>
}
]
}`;

  // One blank line between sections.
  return [intentCatalogue, entityInstruction, queryLine, responseFormat].join("\n\n");
}
|
|
|
|
/**
 * Coerce the LLM-supplied confidence into the range 0.0 - 1.0.
 *
 * Anything that is not a finite number (wrong type, NaN, +/-Infinity) becomes 0;
 * finite numbers are clamped to the closed interval [0, 1].
 *
 * @param confidence - Raw confidence value from LLM
 * @returns Validated confidence (0.0 - 1.0)
 */
private validateConfidence(confidence: unknown): number {
  if (typeof confidence === "number" && Number.isFinite(confidence)) {
    const atLeastZero = Math.max(0, confidence);
    return Math.min(1, atLeastZero);
  }

  return 0;
}
|
|
|
|
/**
 * Validate an entity from LLM response.
 *
 * An entity is accepted only when all of the following hold:
 * - it is a non-null object,
 * - `type` is one of VALID_ENTITY_TYPES,
 * - `value` and `raw` are strings of at most 200 characters,
 * - `start` and `end` are integers with 0 <= start < end <= 10000.
 *
 * @param entity - Raw entity from LLM
 * @returns True if entity is valid
 */
private isValidEntity(entity: unknown): entity is ExtractedEntity {
  if (typeof entity !== "object" || entity === null) {
    return false;
  }

  const e = entity as Record<string, unknown>;

  // Validate type
  if (typeof e.type !== "string" || !VALID_ENTITY_TYPES.includes(e.type)) {
    return false;
  }

  // Validate value (string, max 200 chars)
  if (typeof e.value !== "string" || e.value.length > 200) {
    return false;
  }

  // Validate raw (string, max 200 chars)
  if (typeof e.raw !== "string" || e.raw.length > 200) {
    return false;
  }

  // Validate positions (non-negative integers, end > start).
  // Number.isInteger also rejects NaN and fractional offsets such as 1.5,
  // which are meaningless as string positions.
  if (
    typeof e.start !== "number" ||
    typeof e.end !== "number" ||
    !Number.isInteger(e.start) ||
    !Number.isInteger(e.end) ||
    e.start < 0 ||
    e.end <= e.start ||
    e.end > 10000
  ) {
    return false;
  }

  return true;
}
|
|
|
|
/**
 * Turn the raw LLM reply into an IntentClassification.
 *
 * The reply must be JSON that parses to a non-null object. An unrecognised or
 * missing intent becomes "unknown", the confidence goes through
 * validateConfidence(), and entities failing isValidEntity() are dropped with
 * a warning. Any failure inside the parsing step is logged and yields an
 * "unknown" result with confidence 0.
 *
 * @param content - LLM response content
 * @param query - Original query
 * @returns Intent classification result
 */
private parseLlmResponse(content: string, query: string): IntentClassification {
  try {
    const decoded: unknown = JSON.parse(content);
    if (decoded === null || typeof decoded !== "object") {
      throw new Error("Invalid JSON structure");
    }
    const fields = decoded as Record<string, unknown>;

    // Only intents from this list are trusted; everything else maps to "unknown".
    const knownIntents: IntentType[] = [
      "query_tasks",
      "query_events",
      "query_projects",
      "create_task",
      "create_event",
      "update_task",
      "update_event",
      "briefing",
      "search",
      "unknown",
    ];
    let intent: IntentType = "unknown";
    if (typeof fields.intent === "string" && knownIntents.includes(fields.intent as IntentType)) {
      intent = fields.intent as IntentType;
    }

    // Keep only the entities that pass structural validation.
    const candidates: unknown[] = Array.isArray(fields.entities) ? fields.entities : [];
    const accepted: ExtractedEntity[] = [];
    for (const candidate of candidates) {
      if (this.isValidEntity(candidate)) {
        accepted.push(candidate);
      }
    }

    const rejectedCount = candidates.length - accepted.length;
    if (rejectedCount !== 0) {
      this.logger.warn(`Filtered ${String(rejectedCount)} invalid entities from LLM response`);
    }

    const confidence = this.validateConfidence(fields.confidence);
    return { intent, confidence, entities: accepted, method: "llm", query };
  } catch {
    this.logger.error(`Failed to parse LLM response: ${content}`);

    return { intent: "unknown", confidence: 0, entities: [], method: "llm", query };
  }
}
|
|
}
|