feat: add semantic search with pgvector (closes #68, #69, #70)
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
ci/woodpecker/pr/woodpecker Pipeline failed

Issues resolved:
- #68: pgvector Setup
  * Added pgvector vector index migration for knowledge_embeddings
  * Vector index uses HNSW algorithm with cosine distance
  * Optimized for 1536-dimension OpenAI embeddings

- #69: Embedding Generation Pipeline
  * Created EmbeddingService with OpenAI integration
  * Automatic embedding generation on entry create/update
  * Batch processing endpoint for existing entries
  * Async generation to avoid blocking API responses
  * Content preparation with title weighting

- #70: Semantic Search API
  * POST /api/knowledge/search/semantic - pure vector search
  * POST /api/knowledge/search/hybrid - RRF combined search
  * POST /api/knowledge/embeddings/batch - batch generation
  * Comprehensive test coverage
  * Full documentation in docs/SEMANTIC_SEARCH.md

Technical details:
- Uses OpenAI text-embedding-3-small model (1536 dims)
- HNSW index for approximate nearest-neighbor similarity search (roughly O(log n) per query)
- Reciprocal Rank Fusion for hybrid search
- Graceful degradation when OpenAI not configured
- Async embedding generation for performance

Configuration:
- Added OPENAI_API_KEY to .env.example
- Optional feature - disabled if API key not set
- Falls back to keyword search in hybrid mode
This commit is contained in:
Jason Woltje
2026-01-30 00:24:41 -06:00
parent 22cd68811d
commit 3ec2059470
14 changed files with 1408 additions and 5 deletions

View File

@@ -6,6 +6,7 @@ import type {
PaginatedEntries,
} from "../entities/knowledge-entry.entity";
import { KnowledgeCacheService } from "./cache.service";
import { EmbeddingService } from "./embedding.service";
/**
* Search options for full-text search
@@ -66,7 +67,8 @@ interface RawSearchResult {
export class SearchService {
// Collaborators are received as constructor parameter properties.
constructor(
private readonly prisma: PrismaService,
// NOTE(review): this line and the next look like the removed/added pair from the
// diff rendering — confirm that only one `cache` parameter exists in the real file.
private readonly cache: KnowledgeCacheService
private readonly cache: KnowledgeCacheService,
// Embedding provider used by semanticSearch and hybridSearch to embed the query text.
private readonly embedding: EmbeddingService
) {}
/**
@@ -428,4 +430,288 @@ export class SearchService {
return tagsMap;
}
/**
 * Semantic search using vector similarity.
 *
 * Embeds the query text, then ranks entries of the workspace that have a row in
 * `knowledge_embeddings` by cosine distance to that embedding. The reported
 * `rank` is `1 - distance`, so larger values mean closer matches.
 *
 * A blank (empty or whitespace-only) query yields an empty page instead of being
 * sent to the embedding provider, matching the behaviour of hybridSearch.
 *
 * @param query - The search query string
 * @param workspaceId - The workspace to search within
 * @param options - Search options (status filter, pagination)
 * @returns Paginated search results ranked by semantic similarity
 * @throws Error when the embedding service is not configured
 */
async semanticSearch(
  query: string,
  workspaceId: string,
  options: SearchOptions = {}
): Promise<PaginatedSearchResults> {
  if (!this.embedding.isConfigured()) {
    throw new Error("Semantic search requires OPENAI_API_KEY to be configured");
  }

  // `||` (not `??`) on purpose: a page or limit of 0 must fall back to the default,
  // otherwise the offset and totalPages arithmetic below breaks.
  const page = options.page || 1;
  const limit = options.limit || 20;
  const offset = (page - 1) * limit;

  // Nothing meaningful to embed: answer with an empty page rather than calling out.
  if (!query.trim()) {
    return {
      data: [],
      pagination: {
        page,
        limit,
        total: 0,
        totalPages: 0,
      },
      query,
    };
  }

  // Generate embedding for the query and serialise it as a pgvector literal
  const queryEmbedding = await this.embedding.generateEmbedding(query);
  const embeddingString = `[${queryEmbedding.join(",")}]`;

  // Build status filter: explicit status if requested, otherwise hide archived entries
  const statusFilter = options.status
    ? Prisma.sql`AND e.status = ${options.status}::text::"EntryStatus"`
    : Prisma.sql`AND e.status != 'ARCHIVED'`;

  // The page query and the count query are independent, so run them concurrently.
  // The ORDER BY is kept as the bare distance expression so the vector index
  // remains applicable; do not append secondary sort keys here.
  const [searchResults, countResult] = await Promise.all([
    this.prisma.$queryRaw<RawSearchResult[]>`
      SELECT
        e.id,
        e.workspace_id,
        e.slug,
        e.title,
        e.content,
        e.content_html,
        e.summary,
        e.status,
        e.visibility,
        e.created_at,
        e.updated_at,
        e.created_by,
        e.updated_by,
        (1 - (emb.embedding <=> ${embeddingString}::vector)) AS rank,
        NULL AS headline
      FROM knowledge_entries e
      INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
      ORDER BY emb.embedding <=> ${embeddingString}::vector
      LIMIT ${limit}
      OFFSET ${offset}
    `,
    this.prisma.$queryRaw<[{ count: bigint }]>`
      SELECT COUNT(*) as count
      FROM knowledge_entries e
      INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
    `,
  ]);

  // COUNT(*) comes back as bigint; convert for pagination arithmetic
  const total = Number(countResult[0].count);

  // Fetch tags for the results
  const entryIds = searchResults.map((r) => r.id);
  const tagsMap = await this.fetchTagsForEntries(entryIds);

  // Transform snake_case rows to the expected camelCase result shape
  const data: SearchResult[] = searchResults.map((row) => ({
    id: row.id,
    workspaceId: row.workspace_id,
    slug: row.slug,
    title: row.title,
    content: row.content,
    contentHtml: row.content_html,
    summary: row.summary,
    status: row.status,
    visibility: row.visibility as "PRIVATE" | "WORKSPACE" | "PUBLIC",
    createdAt: row.created_at,
    updatedAt: row.updated_at,
    createdBy: row.created_by,
    updatedBy: row.updated_by,
    rank: row.rank,
    headline: row.headline ?? undefined,
    tags: tagsMap.get(row.id) || [],
  }));

  return {
    data,
    pagination: {
      page,
      limit,
      total,
      totalPages: Math.ceil(total / limit),
    },
    query,
  };
}
/**
 * Hybrid search combining vector similarity and full-text search.
 * Uses Reciprocal Rank Fusion (RRF) to combine the two rankings:
 * `score = 1 / (60 + vectorRank) + 1 / (60 + keywordRank)`, where a missing
 * rank contributes 0.
 *
 * When the embedding service is not configured this delegates to the keyword
 * `search` method. A query that sanitises to an empty string yields an empty page.
 *
 * @param query - The search query string
 * @param workspaceId - The workspace to search within
 * @param options - Search options (status filter, pagination)
 * @returns Paginated search results ranked by combined relevance
 */
async hybridSearch(
  query: string,
  workspaceId: string,
  options: SearchOptions = {}
): Promise<PaginatedSearchResults> {
  if (!this.embedding.isConfigured()) {
    // Fall back to keyword search if embeddings not configured
    return this.search(query, workspaceId, options);
  }

  // `||` (not `??`) on purpose: a page or limit of 0 must fall back to the default.
  const page = options.page || 1;
  const limit = options.limit || 20;
  const offset = (page - 1) * limit;

  // Sanitize query for keyword search
  const sanitizedQuery = this.sanitizeSearchQuery(query);
  if (!sanitizedQuery) {
    return {
      data: [],
      pagination: {
        page,
        limit,
        total: 0,
        totalPages: 0,
      },
      query,
    };
  }

  // Generate embedding for vector search and serialise it as a pgvector literal
  const queryEmbedding = await this.embedding.generateEmbedding(query);
  const embeddingString = `[${queryEmbedding.join(",")}]`;

  // Build status filter: explicit status if requested, otherwise hide archived entries
  const statusFilter = options.status
    ? Prisma.sql`AND e.status = ${options.status}::text::"EntryStatus"`
    : Prisma.sql`AND e.status != 'ARCHIVED'`;

  // NOTE(review): vector_search ranks every embedded entry in the workspace (no
  // candidate cap or distance threshold), so each of them appears in the result
  // set and in `total`. Confirm this is intended before relying on `total`.
  //
  // The page query and the count query are independent, so run them concurrently.
  const [searchResults, countResult] = await Promise.all([
    this.prisma.$queryRaw<RawSearchResult[]>`
      WITH vector_search AS (
        SELECT
          e.id,
          ROW_NUMBER() OVER (ORDER BY emb.embedding <=> ${embeddingString}::vector) AS rank
        FROM knowledge_entries e
        INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
      ),
      keyword_search AS (
        SELECT
          e.id,
          ROW_NUMBER() OVER (
            ORDER BY ts_rank(
              setweight(to_tsvector('english', e.title), 'A') ||
              setweight(to_tsvector('english', e.content), 'B'),
              plainto_tsquery('english', ${sanitizedQuery})
            ) DESC
          ) AS rank
        FROM knowledge_entries e
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
          AND (
            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
          )
      ),
      combined AS (
        SELECT
          COALESCE(v.id, k.id) AS id,
          -- Reciprocal Rank Fusion: RRF(d) = sum(1 / (k + rank_i))
          -- k=60 is a common constant that prevents high rankings from dominating
          -- Cast to double precision: the arithmetic is numeric, which the client
          -- would otherwise surface as a Decimal object instead of a JS number
          (COALESCE(1.0 / (60 + v.rank), 0) + COALESCE(1.0 / (60 + k.rank), 0))::double precision AS rrf_score
        FROM vector_search v
        FULL OUTER JOIN keyword_search k ON v.id = k.id
      )
      SELECT
        e.id,
        e.workspace_id,
        e.slug,
        e.title,
        e.content,
        e.content_html,
        e.summary,
        e.status,
        e.visibility,
        e.created_at,
        e.updated_at,
        e.created_by,
        e.updated_by,
        c.rrf_score AS rank,
        ts_headline(
          'english',
          e.content,
          plainto_tsquery('english', ${sanitizedQuery}),
          'MaxWords=50, MinWords=25, StartSel=<mark>, StopSel=</mark>'
        ) AS headline
      FROM combined c
      INNER JOIN knowledge_entries e ON c.id = e.id
      ORDER BY c.rrf_score DESC, e.updated_at DESC, e.id
      LIMIT ${limit}
      OFFSET ${offset}
    `,
    this.prisma.$queryRaw<[{ count: bigint }]>`
      WITH vector_search AS (
        SELECT e.id
        FROM knowledge_entries e
        INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
      ),
      keyword_search AS (
        SELECT e.id
        FROM knowledge_entries e
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
          AND (
            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
          )
      )
      SELECT COUNT(DISTINCT id) as count
      FROM (
        SELECT id FROM vector_search
        UNION
        SELECT id FROM keyword_search
      ) AS combined
    `,
  ]);

  // COUNT comes back as bigint; convert for pagination arithmetic
  const total = Number(countResult[0].count);

  // Fetch tags for the results
  const entryIds = searchResults.map((r) => r.id);
  const tagsMap = await this.fetchTagsForEntries(entryIds);

  // Transform snake_case rows to the expected camelCase result shape
  const data: SearchResult[] = searchResults.map((row) => ({
    id: row.id,
    workspaceId: row.workspace_id,
    slug: row.slug,
    title: row.title,
    content: row.content,
    contentHtml: row.content_html,
    summary: row.summary,
    status: row.status,
    visibility: row.visibility as "PRIVATE" | "WORKSPACE" | "PUBLIC",
    createdAt: row.created_at,
    updatedAt: row.updated_at,
    createdBy: row.created_by,
    updatedBy: row.updated_by,
    // Normalise defensively so `rank` is always a plain number for callers
    rank: Number(row.rank),
    headline: row.headline ?? undefined,
    tags: tagsMap.get(row.id) || [],
  }));

  return {
    data,
    pagination: {
      page,
      limit,
      total,
      totalPages: Math.ceil(total / limit),
    },
    query,
  };
}
}