Issues resolved:

- #68: pgvector Setup
  * Added pgvector vector index migration for knowledge_embeddings
  * Vector index uses HNSW algorithm with cosine distance
  * Optimized for 1536-dimension OpenAI embeddings
- #69: Embedding Generation Pipeline
  * Created EmbeddingService with OpenAI integration
  * Automatic embedding generation on entry create/update
  * Batch processing endpoint for existing entries
  * Async generation to avoid blocking API responses
  * Content preparation with title weighting
- #70: Semantic Search API
  * POST /api/knowledge/search/semantic - pure vector search
  * POST /api/knowledge/search/hybrid - RRF combined search
  * POST /api/knowledge/embeddings/batch - batch generation
  * Comprehensive test coverage
  * Full documentation in docs/SEMANTIC_SEARCH.md

Technical details:

- Uses OpenAI text-embedding-3-small model (1536 dims)
- HNSW index for O(log n) similarity search
- Reciprocal Rank Fusion for hybrid search
- Graceful degradation when OpenAI not configured
- Async embedding generation for performance

Configuration:

- Added OPENAI_API_KEY to .env.example
- Optional feature - disabled if API key not set
- Falls back to keyword search in hybrid mode
258 lines
7.8 KiB
TypeScript
258 lines
7.8 KiB
TypeScript
import { describe, it, expect, beforeAll, afterAll } from "vitest";
|
|
import { PrismaClient, EntryStatus } from "@prisma/client";
|
|
import { SearchService } from "./search.service";
|
|
import { EmbeddingService } from "./embedding.service";
|
|
import { KnowledgeCacheService } from "./cache.service";
|
|
import { PrismaService } from "../../prisma/prisma.service";
|
|
|
|
/**
|
|
* Integration tests for semantic search functionality
|
|
*
|
|
* These tests require:
|
|
* - A running PostgreSQL database with pgvector extension
|
|
* - OPENAI_API_KEY environment variable set
|
|
*
|
|
* Run with: pnpm test semantic-search.integration.spec.ts
|
|
*/
|
|
describe("Semantic Search Integration", () => {
  // Every test that calls the OpenAI API is gated on this flag. It uses the
  // same truthiness check on OPENAI_API_KEY throughout the suite so the
  // "configured" and "not configured" paths never run in the same session.
  const hasOpenAiKey = Boolean(process.env["OPENAI_API_KEY"]);

  let prisma: PrismaClient;
  let searchService: SearchService;
  let embeddingService: EmbeddingService;
  let cacheService: KnowledgeCacheService;
  let testWorkspaceId: string;
  let testUserId: string;

  /**
   * Builds the services under test against a real PrismaClient and creates a
   * dedicated workspace (with a nested owner) that scopes all test data.
   */
  beforeAll(async () => {
    // Initialize services. The raw client is handed to the services in place
    // of the PrismaService wrapper.
    prisma = new PrismaClient();
    const prismaService = prisma as unknown as PrismaService;

    // Stub cache service: reports itself disabled and never returns a cached
    // search, so every query reaches the search implementation.
    cacheService = {
      getSearch: async () => null,
      setSearch: async () => {},
      isEnabled: () => false,
      getStats: () => ({ hits: 0, misses: 0, hitRate: 0 }),
      resetStats: () => {},
    } as unknown as KnowledgeCacheService;

    embeddingService = new EmbeddingService(prismaService);
    searchService = new SearchService(
      prismaService,
      cacheService,
      embeddingService
    );

    // Create test workspace and user.
    // NOTE(review): afterAll removes the workspace but not this owner user.
    // A timestamped address keeps repeated runs from colliding if the email
    // column is unique — confirm against the schema and consider deleting
    // the user during cleanup as well.
    const workspace = await prisma.workspace.create({
      data: {
        name: "Test Workspace for Semantic Search",
        owner: {
          create: {
            email: `semantic-test-${Date.now()}@example.com`,
            name: "Test User",
          },
        },
      },
    });

    testWorkspaceId = workspace.id;
    testUserId = workspace.ownerId;
  });

  /**
   * Removes the entries and workspace created by this suite, then closes the
   * database connection.
   */
  afterAll(async () => {
    try {
      // Cleanup test data (only if the workspace was actually created)
      if (testWorkspaceId) {
        await prisma.knowledgeEntry.deleteMany({
          where: { workspaceId: testWorkspaceId },
        });
        await prisma.workspace.delete({
          where: { id: testWorkspaceId },
        });
      }
    } finally {
      // Always release the connection, even when a cleanup query throws.
      await prisma.$disconnect();
    }
  });

  describe("EmbeddingService", () => {
    it("should check if OpenAI is configured", () => {
      const isConfigured = embeddingService.isConfigured();

      // Only the return type is checked here, so this passes whether or not
      // OPENAI_API_KEY is set.
      expect(typeof isConfigured).toBe("boolean");
    });

    it("should prepare content for embedding correctly", () => {
      const title = "Introduction to PostgreSQL";
      const content = "PostgreSQL is a powerful open-source database.";

      const prepared = embeddingService.prepareContentForEmbedding(
        title,
        content
      );

      // Title should appear twice for weighting
      expect(prepared).toContain(title);
      expect(prepared).toContain(content);

      // Count literal occurrences by splitting on the title instead of
      // building a RegExp from it, so regex metacharacters in a title can
      // never change what is being counted.
      const titleCount = prepared.split(title).length - 1;
      expect(titleCount).toBe(2);
    });
  });

  describe("Semantic Search", () => {
    const testEntries = [
      {
        slug: "postgresql-intro",
        title: "Introduction to PostgreSQL",
        content:
          "PostgreSQL is a powerful, open-source relational database system. It supports advanced data types and performance optimization features.",
      },
      {
        slug: "mongodb-basics",
        title: "MongoDB Basics",
        content:
          "MongoDB is a NoSQL document database. It stores data in flexible, JSON-like documents instead of tables and rows.",
      },
      {
        slug: "database-indexing",
        title: "Database Indexing Strategies",
        content:
          "Indexing is crucial for database performance. Both B-tree and hash indexes have their use cases depending on query patterns.",
      },
    ];

    /**
     * Seeds the published test entries and their embeddings once for the
     * whole group, so the semantic and hybrid tests do not depend on each
     * other's execution order. Skipped when no API key is available.
     */
    beforeAll(async () => {
      if (!hasOpenAiKey) {
        return;
      }

      // Create test entries
      for (const entry of testEntries) {
        const created = await prisma.knowledgeEntry.create({
          data: {
            workspaceId: testWorkspaceId,
            slug: entry.slug,
            title: entry.title,
            content: entry.content,
            status: EntryStatus.PUBLISHED,
            visibility: "WORKSPACE",
            createdBy: testUserId,
            updatedBy: testUserId,
          },
        });

        // Generate embedding
        const preparedContent = embeddingService.prepareContentForEmbedding(
          entry.title,
          entry.content
        );
        await embeddingService.generateAndStoreEmbedding(
          created.id,
          preparedContent
        );
      }

      // Wait a bit for embeddings to be stored.
      // NOTE(review): generateAndStoreEmbedding is already awaited above;
      // this delay is kept from the original and may be unnecessary — confirm.
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }, 30000); // 30 second timeout for API calls

    // Runs only when no API key is present. The test is reported as skipped
    // when OpenAI is configured instead of passing on a vacuous assertion.
    it.skipIf(hasOpenAiKey)(
      "should skip semantic search if OpenAI not configured",
      async () => {
        await expect(
          searchService.semanticSearch(
            "database performance",
            testWorkspaceId
          )
        ).rejects.toThrow();
      }
    );

    it.skipIf(!hasOpenAiKey)(
      "should generate embeddings and perform semantic search",
      async () => {
        // Perform semantic search against the entries seeded in beforeAll
        const results = await searchService.semanticSearch(
          "relational database systems",
          testWorkspaceId
        );

        // Should return results
        expect(results.data.length).toBeGreaterThan(0);

        // PostgreSQL entry should rank high for "relational database"
        const postgresEntry = results.data.find(
          (r) => r.slug === "postgresql-intro"
        );
        expect(postgresEntry).toBeDefined();
        expect(postgresEntry?.rank).toBeGreaterThan(0);
      },
      30000 // 30 second timeout for API calls
    );

    it.skipIf(!hasOpenAiKey)(
      "should perform hybrid search combining vector and keyword",
      async () => {
        const results = await searchService.hybridSearch(
          "indexing",
          testWorkspaceId
        );

        // Should return results
        expect(results.data.length).toBeGreaterThan(0);

        // Should find the indexing entry
        const indexingEntry = results.data.find(
          (r) => r.slug === "database-indexing"
        );
        expect(indexingEntry).toBeDefined();
      },
      30000
    );
  });

  describe("Batch Embedding Generation", () => {
    it.skipIf(!hasOpenAiKey)(
      "should batch generate embeddings",
      async () => {
        // Create entries without embeddings
        const entries = await Promise.all(
          Array.from({ length: 3 }, (_, i) =>
            prisma.knowledgeEntry.create({
              data: {
                workspaceId: testWorkspaceId,
                slug: `batch-test-${i}`,
                title: `Batch Test Entry ${i}`,
                content: `This is test content for batch entry ${i}`,
                status: EntryStatus.PUBLISHED,
                visibility: "WORKSPACE",
                createdBy: testUserId,
                updatedBy: testUserId,
              },
            })
          )
        );

        // Batch generate embeddings
        const entriesForEmbedding = entries.map((e) => ({
          id: e.id,
          content: embeddingService.prepareContentForEmbedding(
            e.title,
            e.content
          ),
        }));

        const successCount = await embeddingService.batchGenerateEmbeddings(
          entriesForEmbedding
        );

        expect(successCount).toBe(3);

        // Verify embeddings were created
        const embeddings = await prisma.knowledgeEmbedding.findMany({
          where: {
            entryId: { in: entries.map((e) => e.id) },
          },
        });

        expect(embeddings.length).toBe(3);
      },
      60000 // 60 second timeout for batch operations
    );
  });
});
|