feat(#65): implement full-text search with tsvector and GIN index

Add PostgreSQL full-text search infrastructure for knowledge entries: - Add search_vector tsvector column to knowledge_entries table - Create GIN index for fast full-text search performance - Implement automatic trigger to maintain search_vector on insert/update - Weight fields: title (A), summary (B), content (C) - Update SearchService to use precomputed search_vector - Add comprehensive integration tests for FTS functionality Tests: - 8/8 new integration tests passing - 205/225 knowledge module tests passing - All quality gates pass (typecheck, lint) Refs #65 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-02 14:25:45 -06:00
parent a0dc2f798c
commit 24d59e7595
5 changed files with 378 additions and 26 deletions
--- a/apps/api/prisma/migrations/20260202142100_add_fulltext_search_to_knowledge_entries/migration.sql
+++ b/apps/api/prisma/migrations/20260202142100_add_fulltext_search_to_knowledge_entries/migration.sql
@@ -0,0 +1,36 @@
 -- Add tsvector column for full-text search on knowledge_entries
 -- Weighted fields: title (A), summary (B), content (C)
 -- Step 1: Add the search_vector column
 ALTER TABLE "knowledge_entries"
 ADD COLUMN "search_vector" tsvector;
 -- Step 2: Create GIN index for fast full-text search
 CREATE INDEX "knowledge_entries_search_vector_idx"
 ON "knowledge_entries"
 USING gin("search_vector");
 -- Step 3: Create function to update search_vector
 CREATE OR REPLACE FUNCTION knowledge_entries_search_vector_update()
 RETURNS trigger AS $$
 BEGIN
  NEW.search_vector :=
    setweight(to_tsvector('english', COALESCE(NEW.title, '')), 'A') ||
    setweight(to_tsvector('english', COALESCE(NEW.summary, '')), 'B') ||
    setweight(to_tsvector('english', COALESCE(NEW.content, '')), 'C');
  RETURN NEW;
 END
 $$ LANGUAGE plpgsql;
 -- Step 4: Create trigger to automatically update search_vector on insert/update
 CREATE TRIGGER knowledge_entries_search_vector_trigger
 BEFORE INSERT OR UPDATE ON "knowledge_entries"
 FOR EACH ROW
 EXECUTE FUNCTION knowledge_entries_search_vector_update();
 -- Step 5: Populate search_vector for existing entries
 UPDATE "knowledge_entries"
 SET search_vector =
  setweight(to_tsvector('english', COALESCE(title, '')), 'A') ||
  setweight(to_tsvector('english', COALESCE(summary, '')), 'B') ||
  setweight(to_tsvector('english', COALESCE(content, '')), 'C');
--- a/apps/api/prisma/schema.prisma
+++ b/apps/api/prisma/schema.prisma
@@ -798,6 +798,9 @@ model KnowledgeEntry {
  contentHtml String? @map("content_html") @db.Text
  summary     String?
  // Full-text search vector (automatically maintained by trigger)
  searchVector Unsupported("tsvector")? @map("search_vector")
  // Status
  status     EntryStatus @default(DRAFT)
  visibility Visibility  @default(PRIVATE)
@@ -820,6 +823,7 @@ model KnowledgeEntry {
  @@index([workspaceId, updatedAt])
  @@index([createdBy])
  @@index([updatedBy])
  // Note: GIN index on searchVector created via migration (not supported in Prisma schema)
  @@map("knowledge_entries")
 }
--- a/apps/api/src/knowledge/services/fulltext-search.spec.ts
+++ b/apps/api/src/knowledge/services/fulltext-search.spec.ts
@@ -0,0 +1,276 @@
 import { describe, it, expect, beforeAll, afterAll } from "vitest";
 import { PrismaClient } from "@prisma/client";
 /**
 * Integration tests for PostgreSQL full-text search setup
 * Tests the tsvector column, GIN index, and automatic trigger
 */
 describe("Full-Text Search Setup (Integration)", () => {
  let prisma: PrismaClient;
  let testWorkspaceId: string;
  let testUserId: string;
  beforeAll(async () => {
    prisma = new PrismaClient();
    await prisma.$connect();
    // Create test workspace
    const workspace = await prisma.workspace.create({
      data: {
        name: "Test Workspace",
        owner: {
          create: {
            email: `test-fts-${Date.now()}@example.com`,
            name: "Test User",
          },
        },
      },
    });
    testWorkspaceId = workspace.id;
    testUserId = workspace.ownerId;
  });
  afterAll(async () => {
    // Cleanup
    if (testWorkspaceId) {
      await prisma.knowledgeEntry.deleteMany({
        where: { workspaceId: testWorkspaceId },
      });
      await prisma.workspace.delete({
        where: { id: testWorkspaceId },
      });
    }
    await prisma.$disconnect();
  });
  describe("tsvector column", () => {
    it("should have search_vector column in knowledge_entries table", async () => {
      // Query to check if column exists
      const result = await prisma.$queryRaw<{ column_name: string; data_type: string }[]>`
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'knowledge_entries'
          AND column_name = 'search_vector'
      `;
      expect(result).toHaveLength(1);
      expect(result[0].column_name).toBe("search_vector");
      expect(result[0].data_type).toBe("tsvector");
    });
    it("should automatically populate search_vector on insert", async () => {
      const entry = await prisma.knowledgeEntry.create({
        data: {
          workspaceId: testWorkspaceId,
          slug: "auto-populate-test",
          title: "PostgreSQL Full-Text Search",
          content: "This is a test of the automatic trigger functionality.",
          summary: "Testing automatic population",
          createdBy: testUserId,
          updatedBy: testUserId,
        },
      });
      // Query raw to check search_vector was populated
      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
        SELECT id, search_vector::text
        FROM knowledge_entries
        WHERE id = ${entry.id}::uuid
      `;
      expect(result).toHaveLength(1);
      expect(result[0].search_vector).not.toBeNull();
      // Verify 'postgresql' appears in title (weight A)
      expect(result[0].search_vector).toContain("'postgresql':1A");
      // Verify 'search' appears in both title (A) and content (C)
      expect(result[0].search_vector).toContain("'search':5A");
    });
    it("should automatically update search_vector on update", async () => {
      const entry = await prisma.knowledgeEntry.create({
        data: {
          workspaceId: testWorkspaceId,
          slug: "auto-update-test",
          title: "Original Title",
          content: "Original content",
          createdBy: testUserId,
          updatedBy: testUserId,
        },
      });
      // Update the entry
      await prisma.knowledgeEntry.update({
        where: { id: entry.id },
        data: {
          title: "Updated Elasticsearch Title",
          content: "Updated content with Elasticsearch",
        },
      });
      // Check search_vector was updated
      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
        SELECT id, search_vector::text
        FROM knowledge_entries
        WHERE id = ${entry.id}::uuid
      `;
      expect(result).toHaveLength(1);
      // Verify 'elasticsearch' appears in both title (A) and content (C)
      // PostgreSQL combines positions: '2A,7C' means position 2 in title (A) and position 7 in content (C)
      expect(result[0].search_vector).toContain("'elasticsearch':2A,7C");
      expect(result[0].search_vector).not.toContain("'original'");
    });
    it("should include summary in search_vector with weight B", async () => {
      const entry = await prisma.knowledgeEntry.create({
        data: {
          workspaceId: testWorkspaceId,
          slug: "summary-weight-test",
          title: "Title Word",
          content: "Content word",
          summary: "Summary keyword here",
          createdBy: testUserId,
          updatedBy: testUserId,
        },
      });
      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
        SELECT id, search_vector::text
        FROM knowledge_entries
        WHERE id = ${entry.id}::uuid
      `;
      expect(result).toHaveLength(1);
      // Summary should have weight B - 'keyword' appears in summary
      expect(result[0].search_vector).toContain("'keyword':4B");
    });
    it("should handle null summary gracefully", async () => {
      const entry = await prisma.knowledgeEntry.create({
        data: {
          workspaceId: testWorkspaceId,
          slug: "null-summary-test",
          title: "Title without summary",
          content: "Content without summary",
          summary: null,
          createdBy: testUserId,
          updatedBy: testUserId,
        },
      });
      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
        SELECT id, search_vector::text
        FROM knowledge_entries
        WHERE id = ${entry.id}::uuid
      `;
      expect(result).toHaveLength(1);
      expect(result[0].search_vector).not.toBeNull();
      // Verify 'titl' (stemmed from 'title') appears with weight A
      expect(result[0].search_vector).toContain("'titl':1A");
      // Verify 'content' appears with weight C
      expect(result[0].search_vector).toContain("'content':4C");
    });
  });
  describe("GIN index", () => {
    it("should have GIN index on search_vector column", async () => {
      const result = await prisma.$queryRaw<{ indexname: string; indexdef: string }[]>`
        SELECT indexname, indexdef
        FROM pg_indexes
        WHERE tablename = 'knowledge_entries'
          AND indexname = 'knowledge_entries_search_vector_idx'
      `;
      expect(result).toHaveLength(1);
      expect(result[0].indexdef).toContain("gin");
      expect(result[0].indexdef).toContain("search_vector");
    });
  });
  describe("search performance", () => {
    it("should perform fast searches using the GIN index", async () => {
      // Create multiple entries
      const entries = Array.from({ length: 10 }, (_, i) => ({
        workspaceId: testWorkspaceId,
        slug: `perf-test-${i}`,
        title: `Performance Test ${i}`,
        content: i % 2 === 0 ? "Contains database keyword" : "No keyword here",
        createdBy: testUserId,
        updatedBy: testUserId,
      }));
      await prisma.knowledgeEntry.createMany({
        data: entries,
      });
      const startTime = Date.now();
      // Search using the precomputed search_vector
      const results = await prisma.$queryRaw<{ id: string; title: string }[]>`
        SELECT id, title
        FROM knowledge_entries
        WHERE workspace_id = ${testWorkspaceId}::uuid
          AND search_vector @@ plainto_tsquery('english', 'database')
        ORDER BY ts_rank(search_vector, plainto_tsquery('english', 'database')) DESC
      `;
      const duration = Date.now() - startTime;
      expect(results.length).toBeGreaterThan(0);
      // Should be fast with index (< 100ms for small dataset)
      expect(duration).toBeLessThan(100);
    });
    it("should rank results by relevance using weighted fields", async () => {
      // Create entries with keyword in different positions
      await prisma.knowledgeEntry.createMany({
        data: [
          {
            workspaceId: testWorkspaceId,
            slug: "rank-title",
            title: "Redis caching strategies",
            content: "Various approaches to caching",
            summary: "Overview of strategies",
            createdBy: testUserId,
            updatedBy: testUserId,
          },
          {
            workspaceId: testWorkspaceId,
            slug: "rank-summary",
            title: "Database optimization",
            content: "Performance tuning",
            summary: "Redis is mentioned in summary",
            createdBy: testUserId,
            updatedBy: testUserId,
          },
          {
            workspaceId: testWorkspaceId,
            slug: "rank-content",
            title: "Performance guide",
            content: "Use Redis for better performance",
            summary: "Best practices",
            createdBy: testUserId,
            updatedBy: testUserId,
          },
        ],
      });
      const results = await prisma.$queryRaw<{ slug: string; rank: number }[]>`
        SELECT slug, ts_rank(search_vector, plainto_tsquery('english', 'redis')) AS rank
        FROM knowledge_entries
        WHERE workspace_id = ${testWorkspaceId}::uuid
          AND search_vector @@ plainto_tsquery('english', 'redis')
        ORDER BY rank DESC
      `;
      expect(results.length).toBe(3);
      // Title match should rank highest (weight A)
      expect(results[0].slug).toBe("rank-title");
      // Summary should rank second (weight B)
      expect(results[1].slug).toBe("rank-summary");
      // Content should rank third (weight C)
      expect(results[2].slug).toBe("rank-content");
    });
  });
 });
--- a/apps/api/src/knowledge/services/search.service.ts
+++ b/apps/api/src/knowledge/services/search.service.ts
@@ -118,7 +118,8 @@ export class SearchService {
      : Prisma.sql`AND e.status != 'ARCHIVED'`;
    // PostgreSQL full-text search query
-    // Uses ts_rank for relevance scoring with weights: title (A=1.0), content (B=0.4)
+    // Uses precomputed search_vector column (with weights: A=title, B=summary, C=content)
    // Maintained automatically by database trigger
    const searchResults = await this.prisma.$queryRaw<RawSearchResult[]>`
      WITH search_query AS (
        SELECT plainto_tsquery('english', ${sanitizedQuery}) AS query
@@ -137,11 +138,7 @@ export class SearchService {
        e.updated_at,
        e.created_by,
        e.updated_by,
-        ts_rank(
+        ts_rank(e.search_vector, sq.query) AS rank,
          setweight(to_tsvector('english', e.title), 'A') ||
          setweight(to_tsvector('english', e.content), 'B'),
          sq.query
        ) AS rank,
        ts_headline(
          'english',
          e.content,
@@ -151,10 +148,7 @@ export class SearchService {
      FROM knowledge_entries e, search_query sq
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
-        AND (
+        AND e.search_vector @@ sq.query
          to_tsvector('english', e.title) @@ sq.query
          OR to_tsvector('english', e.content) @@ sq.query
        )
      ORDER BY rank DESC, e.updated_at DESC
      LIMIT ${limit}
      OFFSET ${offset}
@@ -166,10 +160,7 @@ export class SearchService {
      FROM knowledge_entries e
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
-        AND (
+        AND e.search_vector @@ plainto_tsquery('english', ${sanitizedQuery})
          to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
          OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
        )
    `;
    const total = Number(countResult[0].count);
@@ -596,18 +587,14 @@ export class SearchService {
          e.id,
          ROW_NUMBER() OVER (
            ORDER BY ts_rank(
-              setweight(to_tsvector('english', e.title), 'A') ||
+              e.search_vector,
              setweight(to_tsvector('english', e.content), 'B'),
              plainto_tsquery('english', ${sanitizedQuery})
            ) DESC
          ) AS rank
        FROM knowledge_entries e
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
-          AND (
+          AND e.search_vector @@ plainto_tsquery('english', ${sanitizedQuery})
            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
          )
      ),
      combined AS (
        SELECT 
@@ -660,10 +647,7 @@ export class SearchService {
        FROM knowledge_entries e
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
-          AND (
+          AND e.search_vector @@ plainto_tsquery('english', ${sanitizedQuery})
            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
          )
      )
      SELECT COUNT(DISTINCT id) as count
      FROM (
--- a/docs/scratchpads/65-full-text-search.md
+++ b/docs/scratchpads/65-full-text-search.md
@@ -0,0 +1,52 @@
 # Issue #65: [KNOW-013] Full-Text Search Setup
 ## Objective
 Set up PostgreSQL full-text search for entries in the knowledge module with weighted fields and proper indexing.
 ## Approach
 1. Examine current Prisma schema for knowledge entries
 2. Write tests for full-text search functionality (TDD)
 3. Add tsvector column to knowledge entries table
 4. Create GIN index for performance
 5. Implement trigger to maintain tsvector on insert/update
 6. Weight fields: title (A), summary (B), content (C)
 7. Verify with sample queries
 ## Progress
 - [x] Create scratchpad
 - [x] Read Prisma schema
 - [x] Examine existing search service
 - [x] Write failing tests for tsvector column (RED)
 - [x] Create migration with tsvector column, GIN index, and triggers
 - [x] Update Prisma schema to include tsvector
 - [x] Update search service to use precomputed tsvector (GREEN)
 - [x] Run tests and verify coverage (8/8 integration tests pass, 205/225 knowledge module tests pass)
 - [x] Run quality checks (typecheck and lint pass)
 - [ ] Commit changes
 ## Current State
 The search service already implements full-text search using `to_tsvector` and `ts_rank`
 in raw SQL queries, but it calculates tsvector on-the-fly. This is inefficient for large
 datasets. The migration will:
 1. Add a `search_vector` tsvector column to knowledge_entries
 2. Create a GIN index on search_vector for fast lookups
 3. Add a trigger to automatically update search_vector on insert/update
 4. Use weighted fields: title (A), summary (B), content (C)
 ## Testing
 - Unit tests for search query generation
 - Integration tests with actual database queries
 - Performance verification with sample data
 ## Notes
 - Using PostgreSQL's built-in full-text search capabilities
 - GIN index for fast text search
 - Automatic maintenance via triggers
 - Field weights: A (title) > B (summary) > C (content)