feat: add semantic search with pgvector (closes #68, #69, #70)

Issues resolved:

- #68: pgvector Setup
  * Added pgvector vector index migration for knowledge_embeddings
  * Vector index uses HNSW algorithm with cosine distance
  * Optimized for 1536-dimension OpenAI embeddings

- #69: Embedding Generation Pipeline
  * Created EmbeddingService with OpenAI integration
  * Automatic embedding generation on entry create/update
  * Batch processing endpoint for existing entries
  * Async generation to avoid blocking API responses
  * Content preparation with title weighting

- #70: Semantic Search API
  * POST /api/knowledge/search/semantic - pure vector search
  * POST /api/knowledge/search/hybrid - RRF combined search
  * POST /api/knowledge/embeddings/batch - batch generation
  * Comprehensive test coverage
  * Full documentation in docs/SEMANTIC_SEARCH.md

Technical details:
- Uses OpenAI text-embedding-3-small model (1536 dims)
- HNSW index for O(log n) similarity search
- Reciprocal Rank Fusion for hybrid search
- Graceful degradation when OpenAI not configured
- Async embedding generation for performance

Configuration:
- Added OPENAI_API_KEY to .env.example
- Optional feature - disabled if API key not set
- Falls back to keyword search in hybrid mode
2026-01-30 00:24:41 -06:00
parent 22cd68811d
commit 3ec2059470
14 changed files with 1408 additions and 5 deletions
--- a/apps/api/src/knowledge/services/search.service.ts
+++ b/apps/api/src/knowledge/services/search.service.ts
@@ -6,6 +6,7 @@ import type {
  PaginatedEntries,
 } from "../entities/knowledge-entry.entity";
 import { KnowledgeCacheService } from "./cache.service";
+import { EmbeddingService } from "./embedding.service";

 /**
 * Search options for full-text search
@@ -66,7 +67,8 @@ interface RawSearchResult {
 export class SearchService {
  constructor(
    private readonly prisma: PrismaService,
-    private readonly cache: KnowledgeCacheService
+    private readonly cache: KnowledgeCacheService,
+    private readonly embedding: EmbeddingService
  ) {}

  /**
@@ -428,4 +430,288 @@ export class SearchService {

    return tagsMap;
  }
+
+  /**
+   * Semantic search: ranks a workspace's entries by vector similarity to the query.
+   *
+   * The query text is turned into an embedding by the EmbeddingService and
+   * compared with the vectors stored in `knowledge_embeddings` using pgvector's
+   * `<=>` (cosine distance) operator. Because the embeddings table is
+   * inner-joined, entries that have no stored embedding are never returned.
+   *
+   * When `options.status` is not given, entries with status `ARCHIVED` are
+   * excluded. A blank (empty or whitespace-only) query yields an empty page
+   * without requesting an embedding, matching what `hybridSearch` returns for
+   * an empty query.
+   *
+   * @param query - The search query string
+   * @param workspaceId - The workspace to search within
+   * @param options - Search options (status filter, pagination)
+   * @returns Paginated search results ordered by ascending cosine distance;
+   *   `rank` is `1 - distance` (higher is more similar) and `headline` is
+   *   always undefined for this search mode
+   * @throws Error if the embedding service is not configured
+   */
+  async semanticSearch(
+    query: string,
+    workspaceId: string,
+    options: SearchOptions = {}
+  ): Promise<PaginatedSearchResults> {
+    if (!this.embedding.isConfigured()) {
+      throw new Error("Semantic search requires OPENAI_API_KEY to be configured");
+    }
+
+    // `||` rather than `??` on purpose: a value of 0 also falls back to the default.
+    const page = options.page || 1;
+    const limit = options.limit || 20;
+    const offset = (page - 1) * limit;
+
+    // There is nothing meaningful to embed for a blank query, so skip the
+    // embedding request and the database round-trips entirely.
+    if (!query.trim()) {
+      return {
+        data: [],
+        pagination: {
+          page,
+          limit,
+          total: 0,
+          totalPages: 0,
+        },
+        query,
+      };
+    }
+
+    // Generate the query embedding and serialize it as a pgvector literal: "[v1,v2,...]"
+    const queryEmbedding = await this.embedding.generateEmbedding(query);
+    const embeddingString = `[${queryEmbedding.join(",")}]`;
+
+    // Explicit status filter if requested; otherwise hide archived entries
+    const statusFilter = options.status
+      ? Prisma.sql`AND e.status = ${options.status}::text::"EntryStatus"`
+      : Prisma.sql`AND e.status != 'ARCHIVED'`;
+
+    // The page query and the count query do not depend on each other, so they
+    // are issued together instead of one after the other.
+    const [searchResults, countResult] = await Promise.all([
+      // Vector similarity search using cosine distance
+      this.prisma.$queryRaw<RawSearchResult[]>`
+        SELECT
+          e.id,
+          e.workspace_id,
+          e.slug,
+          e.title,
+          e.content,
+          e.content_html,
+          e.summary,
+          e.status,
+          e.visibility,
+          e.created_at,
+          e.updated_at,
+          e.created_by,
+          e.updated_by,
+          (1 - (emb.embedding <=> ${embeddingString}::vector)) AS rank,
+          NULL AS headline
+        FROM knowledge_entries e
+        INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
+        WHERE e.workspace_id = ${workspaceId}::uuid
+          ${statusFilter}
+        ORDER BY emb.embedding <=> ${embeddingString}::vector
+        LIMIT ${limit}
+        OFFSET ${offset}
+      `,
+      // Total number of matching entries, for pagination
+      this.prisma.$queryRaw<[{ count: bigint }]>`
+        SELECT COUNT(*) as count
+        FROM knowledge_entries e
+        INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
+        WHERE e.workspace_id = ${workspaceId}::uuid
+          ${statusFilter}
+      `,
+    ]);
+
+    // COUNT(*) comes back as a bigint; convert it for the pagination maths
+    const total = Number(countResult[0].count);
+
+    // Fetch tags for the results
+    const entryIds = searchResults.map((r) => r.id);
+    const tagsMap = await this.fetchTagsForEntries(entryIds);
+
+    // Transform snake_case rows into the camelCase result shape
+    const data: SearchResult[] = searchResults.map((row) => ({
+      id: row.id,
+      workspaceId: row.workspace_id,
+      slug: row.slug,
+      title: row.title,
+      content: row.content,
+      contentHtml: row.content_html,
+      summary: row.summary,
+      status: row.status,
+      visibility: row.visibility as "PRIVATE" | "WORKSPACE" | "PUBLIC",
+      createdAt: row.created_at,
+      updatedAt: row.updated_at,
+      createdBy: row.created_by,
+      updatedBy: row.updated_by,
+      rank: row.rank,
+      headline: row.headline ?? undefined,
+      tags: tagsMap.get(row.id) || [],
+    }));
+
+    return {
+      data,
+      pagination: {
+        page,
+        limit,
+        total,
+        totalPages: Math.ceil(total / limit),
+      },
+      query,
+    };
+  }
+
+  /**
+   * Hybrid search combining vector similarity and full-text search.
+   * Uses Reciprocal Rank Fusion (RRF) to combine the two rankings.
+   *
+   * If the embedding service is not configured, this delegates to the
+   * keyword-only `search` method. If the sanitized query is empty, an empty
+   * page is returned without touching the embedding service or the database.
+   *
+   * When `options.status` is not given, entries with status `ARCHIVED` are
+   * excluded.
+   *
+   * @param query - The search query string
+   * @param workspaceId - The workspace to search within
+   * @param options - Search options (status filter, pagination)
+   * @returns Paginated search results ordered by descending RRF score (ties
+   *   broken by most recently updated); `rank` is the RRF score and
+   *   `headline` is a `ts_headline` excerpt of the content with matches
+   *   wrapped in `<mark>` tags
+   */
+  async hybridSearch(
+    query: string,
+    workspaceId: string,
+    options: SearchOptions = {}
+  ): Promise<PaginatedSearchResults> {
+    if (!this.embedding.isConfigured()) {
+      // Fall back to keyword search if embeddings not configured
+      return this.search(query, workspaceId, options);
+    }
+
+    // `||` rather than `??` on purpose: a value of 0 also falls back to the default.
+    const page = options.page || 1;
+    const limit = options.limit || 20;
+    const offset = (page - 1) * limit;
+
+    // Sanitize query for keyword search
+    const sanitizedQuery = this.sanitizeSearchQuery(query);
+
+    if (!sanitizedQuery) {
+      return {
+        data: [],
+        pagination: {
+          page,
+          limit,
+          total: 0,
+          totalPages: 0,
+        },
+        query,
+      };
+    }
+
+    // Generate the query embedding and serialize it as a pgvector literal: "[v1,v2,...]"
+    const queryEmbedding = await this.embedding.generateEmbedding(query);
+    const embeddingString = `[${queryEmbedding.join(",")}]`;
+
+    // Explicit status filter if requested; otherwise hide archived entries
+    const statusFilter = options.status
+      ? Prisma.sql`AND e.status = ${options.status}::text::"EntryStatus"`
+      : Prisma.sql`AND e.status != 'ARCHIVED'`;
+
+    // NOTE(review): vector_search has no distance cutoff or LIMIT, so every
+    // entry in the workspace that has an embedding is part of the combined
+    // set (and of `total`), whether or not it matches the keywords. Confirm
+    // that this is the intended result set.
+    //
+    // The page query and the count query do not depend on each other, so they
+    // are issued together instead of one after the other.
+    const [searchResults, countResult] = await Promise.all([
+      // Hybrid search using Reciprocal Rank Fusion (RRF)
+      // Combines vector similarity and full-text search rankings
+      this.prisma.$queryRaw<RawSearchResult[]>`
+        WITH vector_search AS (
+          SELECT
+            e.id,
+            ROW_NUMBER() OVER (ORDER BY emb.embedding <=> ${embeddingString}::vector) AS rank
+          FROM knowledge_entries e
+          INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
+          WHERE e.workspace_id = ${workspaceId}::uuid
+            ${statusFilter}
+        ),
+        keyword_search AS (
+          SELECT
+            e.id,
+            ROW_NUMBER() OVER (
+              ORDER BY ts_rank(
+                setweight(to_tsvector('english', e.title), 'A') ||
+                setweight(to_tsvector('english', e.content), 'B'),
+                plainto_tsquery('english', ${sanitizedQuery})
+              ) DESC
+            ) AS rank
+          FROM knowledge_entries e
+          WHERE e.workspace_id = ${workspaceId}::uuid
+            ${statusFilter}
+            AND (
+              to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
+              OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
+            )
+        ),
+        combined AS (
+          SELECT
+            COALESCE(v.id, k.id) AS id,
+            -- Reciprocal Rank Fusion: RRF(d) = sum(1 / (k + rank_i))
+            -- k=60 is a common constant that prevents high rankings from dominating
+            (COALESCE(1.0 / (60 + v.rank), 0) + COALESCE(1.0 / (60 + k.rank), 0)) AS rrf_score
+          FROM vector_search v
+          FULL OUTER JOIN keyword_search k ON v.id = k.id
+        )
+        SELECT
+          e.id,
+          e.workspace_id,
+          e.slug,
+          e.title,
+          e.content,
+          e.content_html,
+          e.summary,
+          e.status,
+          e.visibility,
+          e.created_at,
+          e.updated_at,
+          e.created_by,
+          e.updated_by,
+          -- rrf_score is NUMERIC (1.0 / bigint); cast it so the driver returns a
+          -- plain number instead of a Decimal object
+          c.rrf_score::double precision AS rank,
+          ts_headline(
+            'english',
+            e.content,
+            plainto_tsquery('english', ${sanitizedQuery}),
+            'MaxWords=50, MinWords=25, StartSel=<mark>, StopSel=</mark>'
+          ) AS headline
+        FROM combined c
+        INNER JOIN knowledge_entries e ON c.id = e.id
+        ORDER BY c.rrf_score DESC, e.updated_at DESC
+        LIMIT ${limit}
+        OFFSET ${offset}
+      `,
+      // Total count: distinct entries found by either the vector or the keyword branch
+      this.prisma.$queryRaw<[{ count: bigint }]>`
+        WITH vector_search AS (
+          SELECT e.id
+          FROM knowledge_entries e
+          INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
+          WHERE e.workspace_id = ${workspaceId}::uuid
+            ${statusFilter}
+        ),
+        keyword_search AS (
+          SELECT e.id
+          FROM knowledge_entries e
+          WHERE e.workspace_id = ${workspaceId}::uuid
+            ${statusFilter}
+            AND (
+              to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
+              OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
+            )
+        )
+        SELECT COUNT(DISTINCT id) as count
+        FROM (
+          SELECT id FROM vector_search
+          UNION
+          SELECT id FROM keyword_search
+        ) AS combined
+      `,
+    ]);
+
+    // COUNT(...) comes back as a bigint; convert it for the pagination maths
+    const total = Number(countResult[0].count);
+
+    // Fetch tags for the results
+    const entryIds = searchResults.map((r) => r.id);
+    const tagsMap = await this.fetchTagsForEntries(entryIds);
+
+    // Transform snake_case rows into the camelCase result shape
+    const data: SearchResult[] = searchResults.map((row) => ({
+      id: row.id,
+      workspaceId: row.workspace_id,
+      slug: row.slug,
+      title: row.title,
+      content: row.content,
+      contentHtml: row.content_html,
+      summary: row.summary,
+      status: row.status,
+      visibility: row.visibility as "PRIVATE" | "WORKSPACE" | "PUBLIC",
+      createdAt: row.created_at,
+      updatedAt: row.updated_at,
+      createdBy: row.created_by,
+      updatedBy: row.updated_by,
+      rank: row.rank,
+      headline: row.headline ?? undefined,
+      tags: tagsMap.get(row.id) || [],
+    }));
+
+    return {
+      data,
+      pagination: {
+        page,
+        limit,
+        total,
+        totalPages: Math.ceil(total / limit),
+      },
+      query,
+    };
+  }
 }