Issues resolved: - #68: pgvector Setup * Added pgvector vector index migration for knowledge_embeddings * Vector index uses HNSW algorithm with cosine distance * Optimized for 1536-dimension OpenAI embeddings - #69: Embedding Generation Pipeline * Created EmbeddingService with OpenAI integration * Automatic embedding generation on entry create/update * Batch processing endpoint for existing entries * Async generation to avoid blocking API responses * Content preparation with title weighting - #70: Semantic Search API * POST /api/knowledge/search/semantic - pure vector search * POST /api/knowledge/search/hybrid - RRF combined search * POST /api/knowledge/embeddings/batch - batch generation * Comprehensive test coverage * Full documentation in docs/SEMANTIC_SEARCH.md Technical details: - Uses OpenAI text-embedding-3-small model (1536 dims) - HNSW index for O(log n) similarity search - Reciprocal Rank Fusion for hybrid search - Graceful degradation when OpenAI not configured - Async embedding generation for performance Configuration: - Added OPENAI_API_KEY to .env.example - Optional feature - disabled if API key not set - Falls back to keyword search in hybrid mode
This commit is contained in:
@@ -0,0 +1,257 @@
|
||||
import { describe, it, expect, beforeAll, afterAll } from "vitest";
|
||||
import { PrismaClient, EntryStatus } from "@prisma/client";
|
||||
import { SearchService } from "./search.service";
|
||||
import { EmbeddingService } from "./embedding.service";
|
||||
import { KnowledgeCacheService } from "./cache.service";
|
||||
import { PrismaService } from "../../prisma/prisma.service";
|
||||
|
||||
/**
|
||||
* Integration tests for semantic search functionality
|
||||
*
|
||||
* These tests require:
|
||||
* - A running PostgreSQL database with pgvector extension
|
||||
* - OPENAI_API_KEY environment variable set
|
||||
*
|
||||
* Run with: pnpm test semantic-search.integration.spec.ts
|
||||
*/
|
||||
describe("Semantic Search Integration", () => {
|
||||
let prisma: PrismaClient;
|
||||
let searchService: SearchService;
|
||||
let embeddingService: EmbeddingService;
|
||||
let cacheService: KnowledgeCacheService;
|
||||
let testWorkspaceId: string;
|
||||
let testUserId: string;
|
||||
|
||||
// Suite-wide setup: wires the services against a real Prisma client and seeds
// one workspace (with a nested owner) that the tests below write into.
beforeAll(async () => {
  // Cache stand-in: lookups always return null, writes are no-ops, and the
  // cache reports itself as disabled.
  const disabledCache = {
    getSearch: async () => null,
    setSearch: async () => {},
    isEnabled: () => false,
    getStats: () => ({ hits: 0, misses: 0, hitRate: 0 }),
    resetStats: () => {},
  };

  prisma = new PrismaClient();
  // The services are typed against PrismaService, so the raw client is cast to fit.
  const db = prisma as unknown as PrismaService;
  cacheService = disabledCache as unknown as KnowledgeCacheService;

  embeddingService = new EmbeddingService(db);
  searchService = new SearchService(db, cacheService, embeddingService);

  // The workspace and its owner are created together in one nested write.
  const { id, ownerId } = await prisma.workspace.create({
    data: {
      name: "Test Workspace for Semantic Search",
      owner: {
        create: {
          email: "semantic-test@example.com",
          name: "Test User",
        },
      },
    },
  });

  // Remembered for the entry fixtures and for teardown.
  testWorkspaceId = id;
  testUserId = ownerId;
});
|
||||
|
||||
// Suite-wide teardown: removes the knowledge entries and the workspace created
// by this suite, then closes the database connection.
afterAll(async () => {
  try {
    // testWorkspaceId is only assigned once setup created the workspace;
    // when it is unset there is nothing to clean up.
    if (testWorkspaceId) {
      // Entries are removed first, then the workspace itself.
      await prisma.knowledgeEntry.deleteMany({
        where: { workspaceId: testWorkspaceId },
      });
      await prisma.workspace.delete({
        where: { id: testWorkspaceId },
      });
      // NOTE(review): the owner record created together with the workspace is
      // not deleted here. If its email is unique, a second run against the
      // same database may fail in setup — confirm against the schema.
    }
  } finally {
    // Always release the connection, even when one of the deletes above throws.
    await prisma.$disconnect();
  }
});
|
||||
|
||||
// Checks on EmbeddingService that need neither the database nor the OpenAI API.
describe("EmbeddingService", () => {
  it("should check if OpenAI is configured", () => {
    const isConfigured = embeddingService.isConfigured();
    // Only the return type is asserted, so this holds whether or not a key is set.
    expect(typeof isConfigured).toBe("boolean");
  });

  it("should prepare content for embedding correctly", () => {
    const title = "Introduction to PostgreSQL";
    const content = "PostgreSQL is a powerful open-source database.";

    const prepared = embeddingService.prepareContentForEmbedding(
      title,
      content
    );

    expect(prepared).toContain(title);
    expect(prepared).toContain(content);

    // The title should appear twice for weighting. Occurrences are counted by
    // splitting on the literal title rather than building a RegExp from it, so
    // a title containing regex metacharacters would still be counted correctly.
    const titleCount = prepared.split(title).length - 1;
    expect(titleCount).toBe(2);
  });
});
|
||||
|
||||
// End-to-end checks for vector and hybrid search against seeded entries.
describe("Semantic Search", () => {
  // Tests that call the OpenAI API are skipped when no key is present.
  const hasOpenAiKey = Boolean(process.env["OPENAI_API_KEY"]);

  const testEntries = [
    {
      slug: "postgresql-intro",
      title: "Introduction to PostgreSQL",
      content:
        "PostgreSQL is a powerful, open-source relational database system. It supports advanced data types and performance optimization features.",
    },
    {
      slug: "mongodb-basics",
      title: "MongoDB Basics",
      content:
        "MongoDB is a NoSQL document database. It stores data in flexible, JSON-like documents instead of tables and rows.",
    },
    {
      slug: "database-indexing",
      title: "Database Indexing Strategies",
      content:
        "Indexing is crucial for database performance. Both B-tree and hash indexes have their use cases depending on query patterns.",
    },
  ];

  // Seed the entries and their embeddings once for this group, so that the
  // semantic and hybrid tests do not depend on each other's execution order
  // and each can be run in isolation.
  beforeAll(async () => {
    if (!hasOpenAiKey) {
      return;
    }

    for (const entry of testEntries) {
      const created = await prisma.knowledgeEntry.create({
        data: {
          workspaceId: testWorkspaceId,
          slug: entry.slug,
          title: entry.title,
          content: entry.content,
          status: EntryStatus.PUBLISHED,
          visibility: "WORKSPACE",
          createdBy: testUserId,
          updatedBy: testUserId,
        },
      });

      // Generate and store the embedding for the freshly created entry.
      const preparedContent = embeddingService.prepareContentForEmbedding(
        entry.title,
        entry.content
      );
      await embeddingService.generateAndStoreEmbedding(
        created.id,
        preparedContent
      );
    }

    // NOTE(review): kept from the original test, which waited "a bit for
    // embeddings to be stored". The call above is already awaited, so this
    // pause may be unnecessary — confirm and remove if so.
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }, 30000); // 30 second timeout for API calls

  it("should skip semantic search if OpenAI not configured", async () => {
    if (embeddingService.isConfigured()) {
      // Covered by the configured-path tests below; nothing to assert here.
      return;
    }

    await expect(
      searchService.semanticSearch("database performance", testWorkspaceId)
    ).rejects.toThrow();
  });

  it.skipIf(!hasOpenAiKey)(
    "should generate embeddings and perform semantic search",
    async () => {
      const results = await searchService.semanticSearch(
        "relational database systems",
        testWorkspaceId
      );

      // Should return results
      expect(results.data.length).toBeGreaterThan(0);

      // PostgreSQL entry should rank high for "relational database"
      const postgresEntry = results.data.find(
        (r) => r.slug === "postgresql-intro"
      );
      expect(postgresEntry).toBeDefined();
      expect(postgresEntry?.rank).toBeGreaterThan(0);
    },
    30000 // 30 second timeout for API calls
  );

  it.skipIf(!hasOpenAiKey)(
    "should perform hybrid search combining vector and keyword",
    async () => {
      const results = await searchService.hybridSearch(
        "indexing",
        testWorkspaceId
      );

      // Should return results
      expect(results.data.length).toBeGreaterThan(0);

      // Should find the indexing entry
      const indexingEntry = results.data.find(
        (r) => r.slug === "database-indexing"
      );
      expect(indexingEntry).toBeDefined();
    },
    30000
  );
});
|
||||
|
||||
// Verifies that batchGenerateEmbeddings stores one embedding per submitted entry.
describe("Batch Embedding Generation", () => {
  it.skipIf(!process.env["OPENAI_API_KEY"])(
    "should batch generate embeddings",
    async () => {
      const batchSize = 3;
      const indices = Array.from({ length: batchSize }, (_, n) => n);

      // Seed entries directly through Prisma, in parallel, without generating
      // embeddings for them.
      const pendingCreates = indices.map((n) =>
        prisma.knowledgeEntry.create({
          data: {
            workspaceId: testWorkspaceId,
            slug: `batch-test-${n}`,
            title: `Batch Test Entry ${n}`,
            content: `This is test content for batch entry ${n}`,
            status: EntryStatus.PUBLISHED,
            visibility: "WORKSPACE",
            createdBy: testUserId,
            updatedBy: testUserId,
          },
        })
      );
      const seeded = await Promise.all(pendingCreates);
      const seededIds = seeded.map((row) => row.id);

      // Pair each entry id with its prepared embedding input.
      const payload = seeded.map(({ id, title, content }) => ({
        id,
        content: embeddingService.prepareContentForEmbedding(title, content),
      }));

      const generated = await embeddingService.batchGenerateEmbeddings(payload);
      expect(generated).toBe(batchSize);

      // Each seeded entry should now have a stored embedding row.
      const stored = await prisma.knowledgeEmbedding.findMany({
        where: { entryId: { in: seededIds } },
      });
      expect(stored).toHaveLength(batchSize);
    },
    60000 // 60 second timeout for batch operations
  );
});
|
||||
});
|
||||
Reference in New Issue
Block a user