feat: add semantic search with pgvector (closes #68, #69, #70)

Issues resolved: - #68: pgvector Setup * Added pgvector vector index migration for knowledge_embeddings * Vector index uses HNSW algorithm with cosine distance * Optimized for 1536-dimension OpenAI embeddings - #69: Embedding Generation Pipeline * Created EmbeddingService with OpenAI integration * Automatic embedding generation on entry create/update * Batch processing endpoint for existing entries * Async generation to avoid blocking API responses * Content preparation with title weighting - #70: Semantic Search API * POST /api/knowledge/search/semantic - pure vector search * POST /api/knowledge/search/hybrid - RRF combined search * POST /api/knowledge/embeddings/batch - batch generation * Comprehensive test coverage * Full documentation in docs/SEMANTIC_SEARCH.md Technical details: - Uses OpenAI text-embedding-3-small model (1536 dims) - HNSW index for O(log n) similarity search - Reciprocal Rank Fusion for hybrid search - Graceful degradation when OpenAI not configured - Async embedding generation for performance Configuration: - Added OPENAI_API_KEY to .env.example - Optional feature - disabled if API key not set - Falls back to keyword search in hybrid mode
2026-01-30 00:24:41 -06:00
parent 22cd68811d
commit 3ec2059470
14 changed files with 1408 additions and 5 deletions
--- a/apps/api/src/knowledge/services/semantic-search.integration.spec.ts
+++ b/apps/api/src/knowledge/services/semantic-search.integration.spec.ts
@@ -0,0 +1,257 @@
+import { describe, it, expect, beforeAll, afterAll } from "vitest";
+import { PrismaClient, EntryStatus } from "@prisma/client";
+import { SearchService } from "./search.service";
+import { EmbeddingService } from "./embedding.service";
+import { KnowledgeCacheService } from "./cache.service";
+import { PrismaService } from "../../prisma/prisma.service";
+
+/**
+ * Integration tests for semantic search functionality
+ * 
+ * These tests require:
+ * - A running PostgreSQL database with pgvector extension
+ * - OPENAI_API_KEY environment variable set (tests that call OpenAI are skipped without it)
+ * 
+ * Run with: pnpm test semantic-search.integration.spec.ts
+ */
+describe("Semantic Search Integration", () => {
+  // Shared fixtures, assigned once in beforeAll.
+  let prisma: PrismaClient; // also handed to the services, cast to PrismaService
+  let searchService: SearchService;
+  let embeddingService: EmbeddingService;
+  let cacheService: KnowledgeCacheService; // in-memory stub, not the real cache
+  // Ids of the workspace and owner created for this run; tests scope their data to them.
+  let testWorkspaceId: string;
+  let testUserId: string;
+
+  // Builds the services under test on one real PrismaClient and creates the
+  // workspace/owner pair that every test in this file scopes its data to.
+  beforeAll(async () => {
+    // The cast assumes the services only touch members that PrismaClient
+    // itself provides. TODO confirm against PrismaService.
+    prisma = new PrismaClient();
+    const prismaService = prisma as unknown as PrismaService;
+
+    // Cache stub: every lookup returns null and caching reports as disabled.
+    cacheService = {
+      getSearch: async () => null,
+      setSearch: async () => {},
+      isEnabled: () => false,
+      getStats: () => ({ hits: 0, misses: 0, hitRate: 0 }),
+      resetStats: () => {},
+    } as unknown as KnowledgeCacheService;
+
+    embeddingService = new EmbeddingService(prismaService);
+    searchService = new SearchService(
+      prismaService,
+      cacheService,
+      embeddingService
+    );
+
+    // The cleanup hook deletes the workspace and its entries but not the owner
+    // user, so a fixed address would still be in the table on the next run.
+    // A per-run suffix keeps this nested create from colliding with that
+    // leftover row (presumes the email column is unique; verify against the
+    // Prisma schema).
+    const ownerEmail = `semantic-test-${Date.now()}@example.com`;
+
+    // The nested `owner.create` inserts the user together with the workspace.
+    const workspace = await prisma.workspace.create({
+      data: {
+        name: "Test Workspace for Semantic Search",
+        owner: { create: { email: ownerEmail, name: "Test User" } },
+      },
+    });
+
+    testWorkspaceId = workspace.id;
+    testUserId = workspace.ownerId;
+  });
+
+  // Removes the rows created for this run and closes the database connection.
+  // The disconnect sits in `finally` so that a failing delete cannot leave the
+  // connection open after the suite ends.
+  afterAll(async () => {
+    try {
+      // The id is unset when beforeAll failed before assigning it; there is
+      // nothing to delete in that case.
+      if (testWorkspaceId) {
+        // Entries are removed before the workspace they belong to.
+        await prisma.knowledgeEntry.deleteMany({
+          where: { workspaceId: testWorkspaceId },
+        });
+        await prisma.workspace.delete({
+          where: { id: testWorkspaceId },
+        });
+      }
+      // NOTE(review): the owner user created together with the workspace is
+      // not deleted here; confirm whether it is cleaned up elsewhere.
+    } finally {
+      await prisma.$disconnect();
+    }
+  });
+
+  describe("EmbeddingService", () => {
+    // Only the return type is asserted, so this passes whether or not an API
+    // key is configured.
+    it("should check if OpenAI is configured", () => {
+      expect(typeof embeddingService.isConfigured()).toBe("boolean");
+    });
+
+    // Checks that the prepared text contains the body and exactly two copies
+    // of the title (the title is repeated to weight it).
+    it("should prepare content for embedding correctly", () => {
+      const title = "Introduction to PostgreSQL";
+      const content = "PostgreSQL is a powerful open-source database.";
+
+      const prepared = embeddingService.prepareContentForEmbedding(
+        title,
+        content
+      );
+
+      expect(prepared).toContain(title);
+      expect(prepared).toContain(content);
+
+      // Count literal occurrences by splitting on the title. Building a RegExp
+      // from the raw title would treat characters such as "+" or "(" as
+      // pattern syntax and miscount (or throw) if the title ever changes.
+      const titleCount = prepared.split(title).length - 1;
+      expect(titleCount).toBe(2);
+    });
+  });
+
+  describe("Semantic Search", () => {
+    // Fixture entries inserted by the first OpenAI-backed test below. Their
+    // slugs are what the search assertions look up ("postgresql-intro",
+    // "database-indexing").
+    const testEntries = [
+      {
+        slug: "postgresql-intro",
+        title: "Introduction to PostgreSQL",
+        content:
+          "PostgreSQL is a powerful, open-source relational database system. It supports advanced data types and performance optimization features.",
+      },
+      {
+        slug: "mongodb-basics",
+        title: "MongoDB Basics",
+        content:
+          "MongoDB is a NoSQL document database. It stores data in flexible, JSON-like documents instead of tables and rows.",
+      },
+      {
+        slug: "database-indexing",
+        title: "Database Indexing Strategies",
+        content:
+          "Indexing is crucial for database performance. Both B-tree and hash indexes have their use cases depending on query patterns.",
+      },
+    ];
+
+    // Runs only when no API key is present and asserts that semantic search
+    // rejects. With a key set the test is reported as skipped; the earlier
+    // in-test branch reported a pass without asserting anything in that case.
+    // NOTE(review): assumes isConfigured() tracks OPENAI_API_KEY, as the skip
+    // conditions of the sibling tests do; confirm against EmbeddingService.
+    it.skipIf(Boolean(process.env["OPENAI_API_KEY"]))(
+      "should skip semantic search if OpenAI not configured",
+      async () => {
+        await expect(
+          searchService.semanticSearch("database performance", testWorkspaceId)
+        ).rejects.toThrow();
+      }
+    );
+
+    // Seeds the fixture entries, embeds each one, then checks that a
+    // natural-language query returns the PostgreSQL entry with a positive rank.
+    it.skipIf(!process.env["OPENAI_API_KEY"])(
+      "should generate embeddings and perform semantic search",
+      async () => {
+        // Entries are inserted and embedded one at a time, in fixture order.
+        for (const { slug, title, content } of testEntries) {
+          const { id: entryId } = await prisma.knowledgeEntry.create({
+            data: {
+              workspaceId: testWorkspaceId,
+              slug,
+              title,
+              content,
+              status: EntryStatus.PUBLISHED,
+              visibility: "WORKSPACE",
+              createdBy: testUserId,
+              updatedBy: testUserId,
+            },
+          });
+
+          await embeddingService.generateAndStoreEmbedding(
+            entryId,
+            embeddingService.prepareContentForEmbedding(title, content)
+          );
+        }
+
+        // Fixed one-second pause before querying.
+        await new Promise((done) => setTimeout(done, 1000));
+
+        const { data: hits } = await searchService.semanticSearch(
+          "relational database systems",
+          testWorkspaceId
+        );
+        expect(hits.length).toBeGreaterThan(0);
+
+        // The PostgreSQL entry must be among the hits with a positive rank.
+        const postgresHit = hits.find((hit) => hit.slug === "postgresql-intro");
+        expect(postgresHit).toBeDefined();
+        expect(postgresHit!.rank).toBeGreaterThan(0);
+      },
+      30000 // allow up to 30 seconds for the API calls
+    );
+
+    // Creates no data of its own: it searches the workspace populated by the
+    // preceding test, so it relies on that test having run first.
+    it.skipIf(!process.env["OPENAI_API_KEY"])(
+      "should perform hybrid search combining vector and keyword",
+      async () => {
+        const { data: hits } = await searchService.hybridSearch(
+          "indexing",
+          testWorkspaceId
+        );
+        const slugs = hits.map((hit) => hit.slug);
+
+        expect(hits.length).toBeGreaterThan(0);
+        // The "Database Indexing Strategies" fixture must be among the hits.
+        expect(slugs).toContain("database-indexing");
+      },
+      30000
+    );
+  });
+
+  describe("Batch Embedding Generation", () => {
+    // Inserts three entries without embeddings, embeds them in one batch call,
+    // then verifies both the reported count and the stored embedding rows.
+    it.skipIf(!process.env["OPENAI_API_KEY"])(
+      "should batch generate embeddings",
+      async () => {
+        // All three inserts are started together and awaited as a group.
+        const inserts = [0, 1, 2].map((index) =>
+          prisma.knowledgeEntry.create({
+            data: {
+              workspaceId: testWorkspaceId,
+              slug: `batch-test-${index}`,
+              title: `Batch Test Entry ${index}`,
+              content: `This is test content for batch entry ${index}`,
+              status: EntryStatus.PUBLISHED,
+              visibility: "WORKSPACE",
+              createdBy: testUserId,
+              updatedBy: testUserId,
+            },
+          })
+        );
+        const created = await Promise.all(inserts);
+        const createdIds = created.map(({ id }) => id);
+
+        // Pair each entry id with its prepared (title-weighted) text.
+        const batchInput = created.map(({ id, title, content }) => ({
+          id,
+          content: embeddingService.prepareContentForEmbedding(title, content),
+        }));
+
+        const embeddedCount =
+          await embeddingService.batchGenerateEmbeddings(batchInput);
+        expect(embeddedCount).toBe(3);
+
+        // One embedding row is expected per created entry.
+        const storedRows = await prisma.knowledgeEmbedding.findMany({
+          where: { entryId: { in: createdIds } },
+        });
+        expect(storedRows.length).toBe(3);
+      },
+      60000 // allow up to 60 seconds for the batch operation
+    );
+  });
+});