Files
stack/apps/api/src/knowledge/services/semantic-search.integration.spec.ts
Jason Woltje 3ec2059470
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
ci/woodpecker/pr/woodpecker Pipeline failed
feat: add semantic search with pgvector (closes #68, #69, #70)
Issues resolved:
- #68: pgvector Setup
  * Added pgvector vector index migration for knowledge_embeddings
  * Vector index uses HNSW algorithm with cosine distance
  * Optimized for 1536-dimension OpenAI embeddings

- #69: Embedding Generation Pipeline
  * Created EmbeddingService with OpenAI integration
  * Automatic embedding generation on entry create/update
  * Batch processing endpoint for existing entries
  * Async generation to avoid blocking API responses
  * Content preparation with title weighting

- #70: Semantic Search API
  * POST /api/knowledge/search/semantic - pure vector search
  * POST /api/knowledge/search/hybrid - RRF combined search
  * POST /api/knowledge/embeddings/batch - batch generation
  * Comprehensive test coverage
  * Full documentation in docs/SEMANTIC_SEARCH.md

Technical details:
- Uses OpenAI text-embedding-3-small model (1536 dims)
- HNSW index for O(log n) similarity search
- Reciprocal Rank Fusion for hybrid search
- Graceful degradation when OpenAI not configured
- Async embedding generation for performance

Configuration:
- Added OPENAI_API_KEY to .env.example
- Optional feature - disabled if API key not set
- Falls back to keyword search in hybrid mode
2026-01-30 15:19:13 -06:00

258 lines
7.8 KiB
TypeScript

import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { PrismaClient, EntryStatus } from "@prisma/client";
import { SearchService } from "./search.service";
import { EmbeddingService } from "./embedding.service";
import { KnowledgeCacheService } from "./cache.service";
import { PrismaService } from "../../prisma/prisma.service";
/**
* Integration tests for semantic search functionality
*
* These tests require:
* - A running PostgreSQL database with the pgvector extension
* - The OPENAI_API_KEY environment variable (the tests gated on it are skipped when it is unset)
*
* Run with: pnpm test semantic-search.integration.spec.ts
*/
describe("Semantic Search Integration", () => {
let prisma: PrismaClient;
let searchService: SearchService;
let embeddingService: EmbeddingService;
let cacheService: KnowledgeCacheService;
let testWorkspaceId: string;
let testUserId: string;
/**
 * Suite-wide setup: opens a Prisma client, builds the services under test on
 * top of it, and creates the workspace (with its owner) shared by every test.
 */
beforeAll(async () => {
  prisma = new PrismaClient();
  // The raw client stands in for the PrismaService the services expect; the
  // double cast exists only to satisfy the type checker.
  const prismaAsService = prisma as unknown as PrismaService;

  // Stub cache: reports itself as disabled and getSearch always resolves to null.
  const disabledCache = {
    getSearch: async () => null,
    setSearch: async () => {},
    isEnabled: () => false,
    getStats: () => ({ hits: 0, misses: 0, hitRate: 0 }),
    resetStats: () => {},
  };
  cacheService = disabledCache as unknown as KnowledgeCacheService;

  embeddingService = new EmbeddingService(prismaAsService);
  searchService = new SearchService(
    prismaAsService,
    cacheService,
    embeddingService
  );

  // The workspace and its owner are created together in one nested write.
  // NOTE(review): the owner email is fixed and afterAll does not delete the
  // owner — confirm a repeated run cannot collide with a leftover row.
  const { id: workspaceId, ownerId } = await prisma.workspace.create({
    data: {
      name: "Test Workspace for Semantic Search",
      owner: {
        create: {
          email: "semantic-test@example.com",
          name: "Test User",
        },
      },
    },
  });
  testWorkspaceId = workspaceId;
  testUserId = ownerId;
});
/**
 * Suite-wide teardown: deletes the knowledge entries and the workspace created
 * for this run, then closes the Prisma connection.
 *
 * The disconnect lives in a `finally` block so the connection is released even
 * when one of the deletes rejects.
 */
afterAll(async () => {
  try {
    // testWorkspaceId is still unset if setup failed before the workspace was
    // created; in that case there is nothing to delete.
    if (testWorkspaceId) {
      // Entries are removed first, then the workspace they belong to.
      await prisma.knowledgeEntry.deleteMany({
        where: { workspaceId: testWorkspaceId },
      });
      await prisma.workspace.delete({
        where: { id: testWorkspaceId },
      });
    }
    // NOTE(review): the owner user created together with the workspace is not
    // deleted here — confirm whether a cascade removes it or explicit cleanup
    // is needed.
  } finally {
    await prisma.$disconnect();
  }
});
/**
 * Direct checks of EmbeddingService helpers. Neither test is gated on
 * OPENAI_API_KEY.
 */
describe("EmbeddingService", () => {
  it("should check if OpenAI is configured", () => {
    const isConfigured = embeddingService.isConfigured();
    // Passes whether or not a key is configured: only the return type is
    // pinned here, not its value.
    expect(typeof isConfigured).toBe("boolean");
  });

  it("should prepare content for embedding correctly", () => {
    const title = "Introduction to PostgreSQL";
    const content = "PostgreSQL is a powerful open-source database.";
    const prepared = embeddingService.prepareContentForEmbedding(
      title,
      content
    );

    expect(prepared).toContain(title);
    expect(prepared).toContain(content);

    // The title is expected to appear exactly twice (title weighting).
    // Count literal occurrences with split(): it treats the title as plain
    // text, so regex metacharacters in a title cannot skew the count the way
    // an unescaped `new RegExp(title)` would.
    const titleCount = prepared.split(title).length - 1;
    expect(titleCount).toBe(2);
  });
});
describe("Semantic Search", () => {
// Fixture rows as [slug, title, content]; expanded below into the entry
// objects the search tests insert.
const fixtureRows: ReadonlyArray<readonly [string, string, string]> = [
  [
    "postgresql-intro",
    "Introduction to PostgreSQL",
    "PostgreSQL is a powerful, open-source relational database system. It supports advanced data types and performance optimization features.",
  ],
  [
    "mongodb-basics",
    "MongoDB Basics",
    "MongoDB is a NoSQL document database. It stores data in flexible, JSON-like documents instead of tables and rows.",
  ],
  [
    "database-indexing",
    "Database Indexing Strategies",
    "Indexing is crucial for database performance. Both B-tree and hash indexes have their use cases depending on query patterns.",
  ],
];
// Knowledge entries used by the semantic and hybrid search tests.
const testEntries = fixtureRows.map(([slug, title, content]) => ({
  slug,
  title,
  content,
}));
/**
 * When the embedding service reports that it is not configured,
 * semanticSearch must reject instead of resolving.
 *
 * When the service is configured there is nothing to verify on this path, so
 * the test marks itself as skipped instead of asserting a tautology that would
 * be reported as a pass.
 */
it("should skip semantic search if OpenAI not configured", async (ctx) => {
  if (embeddingService.isConfigured()) {
    // The configured case is exercised by the key-gated tests in this suite.
    ctx.skip();
  } else {
    await expect(
      searchService.semanticSearch("database performance", testWorkspaceId)
    ).rejects.toThrow();
  }
});
/**
 * End-to-end check: inserts the fixture entries, stores an embedding for each,
 * then expects a semantic query to return the PostgreSQL entry with a positive
 * rank. Skipped when OPENAI_API_KEY is unset.
 */
it.skipIf(!process.env["OPENAI_API_KEY"])(
  "should generate embeddings and perform semantic search",
  async () => {
    // Seed strictly one entry at a time, in fixture order: insert the row,
    // then generate and store its embedding before moving on.
    for (const { slug, title, content } of testEntries) {
      const saved = await prisma.knowledgeEntry.create({
        data: {
          workspaceId: testWorkspaceId,
          slug,
          title,
          content,
          status: EntryStatus.PUBLISHED,
          visibility: "WORKSPACE",
          createdBy: testUserId,
          updatedBy: testUserId,
        },
      });
      const embeddingInput = embeddingService.prepareContentForEmbedding(
        title,
        content
      );
      await embeddingService.generateAndStoreEmbedding(saved.id, embeddingInput);
    }

    // Fixed one-second pause before querying, kept from the original test.
    await new Promise((done) => setTimeout(done, 1000));

    const { data: hits } = await searchService.semanticSearch(
      "relational database systems",
      testWorkspaceId
    );
    expect(hits.length).toBeGreaterThan(0);

    // The PostgreSQL entry must be among the hits and carry a positive rank.
    const postgresHit = hits.find((hit) => hit.slug === "postgresql-intro");
    expect(postgresHit).toBeDefined();
    expect(postgresHit?.rank).toBeGreaterThan(0);
  },
  30000 // 30 second timeout for API calls
);
/**
 * Hybrid search for "indexing" must return at least one hit, including the
 * "database-indexing" entry. Skipped when OPENAI_API_KEY is unset.
 *
 * NOTE(review): this test inserts no entries itself; the row it looks for is
 * created by the preceding semantic-search test, so it depends on that test
 * having run first.
 */
it.skipIf(!process.env["OPENAI_API_KEY"])(
  "should perform hybrid search combining vector and keyword",
  async () => {
    const { data: hits } = await searchService.hybridSearch(
      "indexing",
      testWorkspaceId
    );
    expect(hits.length).toBeGreaterThan(0);

    const indexingHit = hits.find(({ slug }) => slug === "database-indexing");
    expect(indexingHit).toBeDefined();
  },
  30000
);
});
/**
 * Batch path: creates three entries without embeddings, runs the batch
 * generator over them, and checks that three embedding rows now exist.
 * Skipped when OPENAI_API_KEY is unset.
 */
describe("Batch Embedding Generation", () => {
  it.skipIf(!process.env["OPENAI_API_KEY"])(
    "should batch generate embeddings",
    async () => {
      // Issue all three creates up front and await them together.
      const entryIndexes = [0, 1, 2];
      const seeded = await Promise.all(
        entryIndexes.map((i) =>
          prisma.knowledgeEntry.create({
            data: {
              workspaceId: testWorkspaceId,
              slug: `batch-test-${i}`,
              title: `Batch Test Entry ${i}`,
              content: `This is test content for batch entry ${i}`,
              status: EntryStatus.PUBLISHED,
              visibility: "WORKSPACE",
              createdBy: testUserId,
              updatedBy: testUserId,
            },
          })
        )
      );

      // Pair each entry id with its prepared embedding text.
      const embeddingInputs = seeded.map(({ id, title, content }) => ({
        id,
        content: embeddingService.prepareContentForEmbedding(title, content),
      }));

      const successCount =
        await embeddingService.batchGenerateEmbeddings(embeddingInputs);
      expect(successCount).toBe(3);

      // One embedding row is expected for each seeded entry.
      const seededIds = seeded.map(({ id }) => id);
      const embeddings = await prisma.knowledgeEmbedding.findMany({
        where: {
          entryId: { in: seededIds },
        },
      });
      expect(embeddings).toHaveLength(3);
    },
    60000 // 60 second timeout for batch operations
  );
});
});