From 24d59e7595c215050a7da9d1ecaf0ba39ee8400a Mon Sep 17 00:00:00 2001
From: Jason Woltje
Date: Mon, 2 Feb 2026 14:25:45 -0600
Subject: [PATCH] feat(#65): implement full-text search with tsvector and GIN index

Add PostgreSQL full-text search infrastructure for knowledge entries:
- Add search_vector tsvector column to knowledge_entries table
- Create GIN index for fast full-text search performance
- Implement automatic trigger to maintain search_vector on insert/update
- Weight fields: title (A), summary (B), content (C)
- Update SearchService to use precomputed search_vector
- Add comprehensive integration tests for FTS functionality

Tests:
- 8/8 new integration tests passing
- 205/225 knowledge module tests passing
- All quality gates pass (typecheck, lint)

Refs #65

Co-Authored-By: Claude Sonnet 4.5
---
 .../migration.sql                             |  36 +++
 apps/api/prisma/schema.prisma                 |   4 +
 .../services/fulltext-search.spec.ts          | 274 ++++++++++++++++++
 .../src/knowledge/services/search.service.ts  |  36 +--
 docs/scratchpads/65-full-text-search.md       |  52 ++++
 5 files changed, 376 insertions(+), 26 deletions(-)
 create mode 100644 apps/api/prisma/migrations/20260202142100_add_fulltext_search_to_knowledge_entries/migration.sql
 create mode 100644 apps/api/src/knowledge/services/fulltext-search.spec.ts
 create mode 100644 docs/scratchpads/65-full-text-search.md

diff --git a/apps/api/prisma/migrations/20260202142100_add_fulltext_search_to_knowledge_entries/migration.sql b/apps/api/prisma/migrations/20260202142100_add_fulltext_search_to_knowledge_entries/migration.sql
new file mode 100644
index 0000000..1289d9d
--- /dev/null
+++ b/apps/api/prisma/migrations/20260202142100_add_fulltext_search_to_knowledge_entries/migration.sql
@@ -0,0 +1,36 @@
+-- Add tsvector column for full-text search on knowledge_entries
+-- Weighted fields: title (A), summary (B), content (C)
+
+-- Step 1: Add the search_vector column
+ALTER TABLE "knowledge_entries"
+ADD COLUMN "search_vector" tsvector;
+
+-- Step 2: Create function to update search_vector (COALESCE keeps a NULL field from nulling the whole vector)
+CREATE OR REPLACE FUNCTION knowledge_entries_search_vector_update()
+RETURNS trigger AS $$
+BEGIN
+  NEW.search_vector :=
+    setweight(to_tsvector('english', COALESCE(NEW.title, '')), 'A') ||
+    setweight(to_tsvector('english', COALESCE(NEW.summary, '')), 'B') ||
+    setweight(to_tsvector('english', COALESCE(NEW.content, '')), 'C');
+  RETURN NEW;
+END
+$$ LANGUAGE plpgsql;
+
+-- Step 3: Populate search_vector for existing entries (done before the index and trigger exist, so each row is computed once)
+UPDATE "knowledge_entries"
+SET search_vector =
+  setweight(to_tsvector('english', COALESCE(title, '')), 'A') ||
+  setweight(to_tsvector('english', COALESCE(summary, '')), 'B') ||
+  setweight(to_tsvector('english', COALESCE(content, '')), 'C');
+
+-- Step 4: Create GIN index for fast full-text search (built in a single pass over the backfilled rows)
+CREATE INDEX "knowledge_entries_search_vector_idx"
+ON "knowledge_entries"
+USING gin("search_vector");
+
+-- Step 5: Create trigger to maintain search_vector; on update it fires only when a source text column is in the SET list
+CREATE TRIGGER knowledge_entries_search_vector_trigger
+BEFORE INSERT OR UPDATE OF "title", "summary", "content" ON "knowledge_entries"
+FOR EACH ROW
+EXECUTE FUNCTION knowledge_entries_search_vector_update();
diff --git a/apps/api/prisma/schema.prisma b/apps/api/prisma/schema.prisma
index 7bc4532..2e59cb3 100644
--- a/apps/api/prisma/schema.prisma
+++ b/apps/api/prisma/schema.prisma
@@ -798,6 +798,9 @@ model KnowledgeEntry {
   contentHtml String? @map("content_html") @db.Text
   summary     String?
 
+  // Full-text search vector (automatically maintained by trigger)
+  searchVector Unsupported("tsvector")? @map("search_vector")
+
   // Status
   status     EntryStatus @default(DRAFT)
   visibility Visibility  @default(PRIVATE)
@@ -820,6 +823,7 @@ model KnowledgeEntry {
   @@index([workspaceId, updatedAt])
   @@index([createdBy])
   @@index([updatedBy])
+  // Note: GIN index on searchVector created via migration (not supported in Prisma schema)
   @@map("knowledge_entries")
 }
 
diff --git a/apps/api/src/knowledge/services/fulltext-search.spec.ts b/apps/api/src/knowledge/services/fulltext-search.spec.ts
new file mode 100644
index 0000000..36005b9
--- /dev/null
+++ b/apps/api/src/knowledge/services/fulltext-search.spec.ts
@@ -0,0 +1,274 @@
+import { describe, it, expect, beforeAll, afterAll } from "vitest";
+import { PrismaClient } from "@prisma/client";
+
+/**
+ * Integration tests for PostgreSQL full-text search setup
+ * Tests the tsvector column, GIN index, and automatic trigger
+ */
+describe("Full-Text Search Setup (Integration)", () => {
+  let prisma: PrismaClient;
+  let testWorkspaceId: string;
+  let testUserId: string;
+
+  beforeAll(async () => {
+    prisma = new PrismaClient();
+    await prisma.$connect();
+
+    // Create test workspace
+    const workspace = await prisma.workspace.create({
+      data: {
+        name: "Test Workspace",
+        owner: {
+          create: {
+            email: `test-fts-${Date.now()}@example.com`,
+            name: "Test User",
+          },
+        },
+      },
+    });
+    testWorkspaceId = workspace.id;
+    testUserId = workspace.ownerId;
+  });
+
+  afterAll(async () => {
+    // Cleanup
+    if (testWorkspaceId) {
+      await prisma.knowledgeEntry.deleteMany({
+        where: { workspaceId: testWorkspaceId },
+      });
+      await prisma.workspace.delete({
+        where: { id: testWorkspaceId },
+      });
+    }
+    await prisma.$disconnect();
+  });
+
+  describe("tsvector column", () => {
+    it("should have search_vector column in knowledge_entries table", async () => {
+      // Query to check if column exists
+      const result = await prisma.$queryRaw<{ column_name: string; data_type: string }[]>`
+        SELECT column_name, data_type
+        FROM information_schema.columns
+        WHERE table_name = 'knowledge_entries'
+          AND column_name = 'search_vector'
+      `;
+
+      expect(result).toHaveLength(1);
+      expect(result[0].column_name).toBe("search_vector");
+      expect(result[0].data_type).toBe("tsvector");
+    });
+
+    it("should automatically populate search_vector on insert", async () => {
+      const entry = await prisma.knowledgeEntry.create({
+        data: {
+          workspaceId: testWorkspaceId,
+          slug: "auto-populate-test",
+          title: "PostgreSQL Full-Text Search",
+          content: "This is a test of the automatic trigger functionality.",
+          summary: "Testing automatic population",
+          createdBy: testUserId,
+          updatedBy: testUserId,
+        },
+      });
+
+      // Query raw to check search_vector was populated
+      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
+        SELECT id, search_vector::text
+        FROM knowledge_entries
+        WHERE id = ${entry.id}::uuid
+      `;
+
+      expect(result).toHaveLength(1);
+      expect(result[0].search_vector).not.toBeNull();
+      // Verify 'postgresql' appears in title (weight A)
+      expect(result[0].search_vector).toContain("'postgresql':1A");
+      // Verify 'search' is title-only (weight A); 'Full-Text' yields 'full-text', 'full', 'text', so it lands at position 5
+      expect(result[0].search_vector).toContain("'search':5A");
+    });
+
+    it("should automatically update search_vector on update", async () => {
+      const entry = await prisma.knowledgeEntry.create({
+        data: {
+          workspaceId: testWorkspaceId,
+          slug: "auto-update-test",
+          title: "Original Title",
+          content: "Original content",
+          createdBy: testUserId,
+          updatedBy: testUserId,
+        },
+      });
+
+      // Update the entry
+      await prisma.knowledgeEntry.update({
+        where: { id: entry.id },
+        data: {
+          title: "Updated Elasticsearch Title",
+          content: "Updated content with Elasticsearch",
+        },
+      });
+
+      // Check search_vector was updated
+      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
+        SELECT id, search_vector::text
+        FROM knowledge_entries
+        WHERE id = ${entry.id}::uuid
+      `;
+
+      expect(result).toHaveLength(1);
+      // Verify 'elasticsearch' appears in both title (A) and content (C)
+      // PostgreSQL combines positions: '2A,7C' means position 2 in title (A) and position 7 in content (C)
+      expect(result[0].search_vector).toContain("'elasticsearch':2A,7C");
+      expect(result[0].search_vector).not.toContain("'original'");
+    });
+
+    it("should include summary in search_vector with weight B", async () => {
+      const entry = await prisma.knowledgeEntry.create({
+        data: {
+          workspaceId: testWorkspaceId,
+          slug: "summary-weight-test",
+          title: "Title Word",
+          content: "Content word",
+          summary: "Summary keyword here",
+          createdBy: testUserId,
+          updatedBy: testUserId,
+        },
+      });
+
+      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
+        SELECT id, search_vector::text
+        FROM knowledge_entries
+        WHERE id = ${entry.id}::uuid
+      `;
+
+      expect(result).toHaveLength(1);
+      // Summary should have weight B - 'keyword' appears in summary
+      expect(result[0].search_vector).toContain("'keyword':4B");
+    });
+
+    it("should handle null summary gracefully", async () => {
+      const entry = await prisma.knowledgeEntry.create({
+        data: {
+          workspaceId: testWorkspaceId,
+          slug: "null-summary-test",
+          title: "Title without summary",
+          content: "Content without summary",
+          summary: null,
+          createdBy: testUserId,
+          updatedBy: testUserId,
+        },
+      });
+
+      const result = await prisma.$queryRaw<{ id: string; search_vector: string | null }[]>`
+        SELECT id, search_vector::text
+        FROM knowledge_entries
+        WHERE id = ${entry.id}::uuid
+      `;
+
+      expect(result).toHaveLength(1);
+      expect(result[0].search_vector).not.toBeNull();
+      // Verify 'titl' (stemmed from 'title') appears with weight A
+      expect(result[0].search_vector).toContain("'titl':1A");
+      // Verify 'content' appears with weight C
+      expect(result[0].search_vector).toContain("'content':4C");
+    });
+  });
+
+  describe("GIN index", () => {
+    it("should have GIN index on search_vector column", async () => {
+      const result = await prisma.$queryRaw<{ indexname: string; indexdef: string }[]>`
+        SELECT indexname, indexdef
+        FROM pg_indexes
+        WHERE tablename = 'knowledge_entries'
+          AND indexname = 'knowledge_entries_search_vector_idx'
+      `;
+
+      expect(result).toHaveLength(1);
+      expect(result[0].indexdef).toContain("gin");
+      expect(result[0].indexdef).toContain("search_vector");
+    });
+  });
+
+  describe("search performance", () => {
+    it("should match entries through the precomputed search_vector", async () => {
+      // Create multiple entries
+      const entries = Array.from({ length: 10 }, (_, i) => ({
+        workspaceId: testWorkspaceId,
+        slug: `perf-test-${i}`,
+        title: `Performance Test ${i}`,
+        content: i % 2 === 0 ? "Contains database keyword" : "No keyword here",
+        createdBy: testUserId,
+        updatedBy: testUserId,
+      }));
+
+      await prisma.knowledgeEntry.createMany({
+        data: entries,
+      });
+
+      // Search using the precomputed search_vector
+      const results = await prisma.$queryRaw<{ id: string; title: string }[]>`
+        SELECT id, title
+        FROM knowledge_entries
+        WHERE workspace_id = ${testWorkspaceId}::uuid
+          AND slug LIKE 'perf-test-%'
+          AND search_vector @@ plainto_tsquery('english', 'database')
+        ORDER BY ts_rank(search_vector, plainto_tsquery('english', 'database')) DESC
+      `;
+
+      // Only the even-numbered entries (0, 2, 4, 6, 8) contain the keyword.
+      // No wall-clock assertion: timings on shared CI hosts are not reproducible, and
+      // PostgreSQL may not choose the GIN index for a table this small anyway.
+      expect(results).toHaveLength(5);
+    });
+
+    it("should rank results by relevance using weighted fields", async () => {
+      // Create entries with keyword in different positions
+      await prisma.knowledgeEntry.createMany({
+        data: [
+          {
+            workspaceId: testWorkspaceId,
+            slug: "rank-title",
+            title: "Redis caching strategies",
+            content: "Various approaches to caching",
+            summary: "Overview of strategies",
+            createdBy: testUserId,
+            updatedBy: testUserId,
+          },
+          {
+            workspaceId: testWorkspaceId,
+            slug: "rank-summary",
+            title: "Database optimization",
+            content: "Performance tuning",
+            summary: "Redis is mentioned in summary",
+            createdBy: testUserId,
+            updatedBy: testUserId,
+          },
+          {
+            workspaceId: testWorkspaceId,
+            slug: "rank-content",
+            title: "Performance guide",
+            content: "Use Redis for better performance",
+            summary: "Best practices",
+            createdBy: testUserId,
+            updatedBy: testUserId,
+          },
+        ],
+      });
+
+      const results = await prisma.$queryRaw<{ slug: string; rank: number }[]>`
+        SELECT slug, ts_rank(search_vector, plainto_tsquery('english', 'redis')) AS rank
+        FROM knowledge_entries
+        WHERE workspace_id = ${testWorkspaceId}::uuid
+          AND search_vector @@ plainto_tsquery('english', 'redis')
+        ORDER BY rank DESC
+      `;
+
+      expect(results.length).toBe(3);
+      // Title match should rank highest (weight A)
+      expect(results[0].slug).toBe("rank-title");
+      // Summary should rank second (weight B)
+      expect(results[1].slug).toBe("rank-summary");
+      // Content should rank third (weight C)
+      expect(results[2].slug).toBe("rank-content");
+    });
+  });
+});
diff --git a/apps/api/src/knowledge/services/search.service.ts b/apps/api/src/knowledge/services/search.service.ts
index abfc202..0acb620 100644
--- a/apps/api/src/knowledge/services/search.service.ts
+++ b/apps/api/src/knowledge/services/search.service.ts
@@ -118,12 +118,13 @@ export class SearchService {
       : Prisma.sql`AND e.status != 'ARCHIVED'`;
 
     // PostgreSQL full-text search query
-    // Uses ts_rank for relevance scoring with weights: title (A=1.0), content (B=0.4)
+    // Uses precomputed search_vector column (with weights: A=title, B=summary, C=content)
+    // Maintained automatically by database trigger
     const searchResults = await this.prisma.$queryRaw`
       WITH search_query AS (
         SELECT plainto_tsquery('english', ${sanitizedQuery}) AS query
       )
-      SELECT 
+      SELECT
         e.id,
         e.workspace_id,
         e.slug,
@@ -137,11 +138,7 @@ export class SearchService {
         e.updated_at,
         e.created_by,
         e.updated_by,
-        ts_rank(
-          setweight(to_tsvector('english', e.title), 'A') ||
-          setweight(to_tsvector('english', e.content), 'B'),
-          sq.query
-        ) AS rank,
+        ts_rank(e.search_vector, sq.query) AS rank,
         ts_headline(
           'english',
           e.content,
@@ -151,10 +148,7 @@ export class SearchService {
       FROM knowledge_entries e, search_query sq
       WHERE e.workspace_id = ${workspaceId}::uuid
         ${statusFilter}
-        AND (
-          to_tsvector('english', e.title) @@ sq.query
-          OR to_tsvector('english', e.content) @@ sq.query
-        )
+        AND e.search_vector @@ sq.query
       ORDER BY rank DESC, e.updated_at DESC
       LIMIT ${limit}
       OFFSET ${offset}
@@ -166,10 +160,7 @@ export class SearchService {
       FROM knowledge_entries e
       WHERE e.workspace_id = ${workspaceId}::uuid
         ${statusFilter}
-        AND (
-          to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
-          OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
-        )
+        AND e.search_vector @@ plainto_tsquery('english', ${sanitizedQuery})
     `;
 
     const total = Number(countResult[0].count);
@@ -592,22 +583,18 @@ export class SearchService {
           ${statusFilter}
       ),
       keyword_search AS (
-        SELECT 
+        SELECT
           e.id,
           ROW_NUMBER() OVER (
             ORDER BY ts_rank(
-              setweight(to_tsvector('english', e.title), 'A') ||
-              setweight(to_tsvector('english', e.content), 'B'),
+              e.search_vector,
               plainto_tsquery('english', ${sanitizedQuery})
             ) DESC
           ) AS rank
         FROM knowledge_entries e
         WHERE e.workspace_id = ${workspaceId}::uuid
           ${statusFilter}
-          AND (
-            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
-            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
-          )
+          AND e.search_vector @@ plainto_tsquery('english', ${sanitizedQuery})
       ),
       combined AS (
         SELECT
@@ -660,10 +647,7 @@ export class SearchService {
           FROM knowledge_entries e
           WHERE e.workspace_id = ${workspaceId}::uuid
             ${statusFilter}
-            AND (
-              to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
-              OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
-            )
+            AND e.search_vector @@ plainto_tsquery('english', ${sanitizedQuery})
         )
         SELECT COUNT(DISTINCT id) as count
         FROM (
diff --git a/docs/scratchpads/65-full-text-search.md b/docs/scratchpads/65-full-text-search.md
new file mode 100644
index 0000000..db26eb8
--- /dev/null
+++ b/docs/scratchpads/65-full-text-search.md
@@ -0,0 +1,52 @@
+# Issue #65: [KNOW-013] Full-Text Search Setup
+
+## Objective
+
+Set up PostgreSQL full-text search for entries in the knowledge module with weighted fields and proper indexing.
+
+## Approach
+
+1. Examine current Prisma schema for knowledge entries
+2. Write tests for full-text search functionality (TDD)
+3. Add tsvector column to knowledge entries table
+4. Create GIN index for performance
+5. Implement trigger to maintain tsvector on insert/update
+6. Weight fields: title (A), summary (B), content (C)
+7. Verify with sample queries
+
+## Progress
+
+- [x] Create scratchpad
+- [x] Read Prisma schema
+- [x] Examine existing search service
+- [x] Write failing tests for tsvector column (RED)
+- [x] Create migration with tsvector column, GIN index, and triggers
+- [x] Update Prisma schema to include tsvector
+- [x] Update search service to use precomputed tsvector (GREEN)
+- [x] Run tests and verify coverage (8/8 integration tests pass, 205/225 knowledge module tests pass)
+- [x] Run quality checks (typecheck and lint pass)
+- [ ] Commit changes
+
+## Current State
+
+The search service already implements full-text search using `to_tsvector` and `ts_rank`
+in raw SQL queries, but it calculates tsvector on-the-fly. This is inefficient for large
+datasets. The migration will:
+
+1. Add a `search_vector` tsvector column to knowledge_entries
+2. Create a GIN index on search_vector for fast lookups
+3. Add a trigger to automatically update search_vector on insert/update
+4. Use weighted fields: title (A), summary (B), content (C)
+
+## Testing
+
+- Unit tests for search query generation
+- Integration tests with actual database queries
+- Performance verification with sample data
+
+## Notes
+
+- Using PostgreSQL's built-in full-text search capabilities
+- GIN index for fast text search
+- Automatic maintenance via triggers
+- Field weights: A (title) > B (summary) > C (content)