// stack/apps/api/src/knowledge/services/semantic-search.integration.spec.ts

import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { PrismaClient, EntryStatus } from "@prisma/client";
import { SearchService } from "./search.service";
import { EmbeddingService } from "./embedding.service";
import { KnowledgeCacheService } from "./cache.service";
import { PrismaService } from "../../prisma/prisma.service";

/**
 * Integration tests for semantic search functionality
 *
 * These tests require:
 * - A running PostgreSQL database with pgvector extension
 * - OPENAI_API_KEY environment variable set
 *
 * Run with: pnpm test semantic-search.integration.spec.ts
 */
describe("Semantic Search Integration", () => {
  let prisma: PrismaClient;
  let searchService: SearchService;
  let embeddingService: EmbeddingService;
  let cacheService: KnowledgeCacheService;
  let testWorkspaceId: string;
  let testUserId: string;

  beforeAll(async () => {
    // Initialize services
    prisma = new PrismaClient();
    const prismaService = prisma as unknown as PrismaService;

    // Mock cache service for testing
    cacheService = {
      getSearch: async () => null,
      setSearch: async () => {},
      isEnabled: () => false,
      getStats: () => ({ hits: 0, misses: 0, hitRate: 0 }),
      resetStats: () => {},
    } as unknown as KnowledgeCacheService;

    embeddingService = new EmbeddingService(prismaService);
    searchService = new SearchService(
      prismaService,
      cacheService,
      embeddingService
    );

    // Create test workspace and user
    const workspace = await prisma.workspace.create({
      data: {
        name: "Test Workspace for Semantic Search",
        owner: {
          create: {
            email: "semantic-test@example.com",
            name: "Test User",
          },
        },
      },
    });

    testWorkspaceId = workspace.id;
    testUserId = workspace.ownerId;
  });

  afterAll(async () => {
    // Cleanup test data
    if (testWorkspaceId) {
      await prisma.knowledgeEntry.deleteMany({
        where: { workspaceId: testWorkspaceId },
      });
      await prisma.workspace.delete({
        where: { id: testWorkspaceId },
      });
    }
    await prisma.$disconnect();
  });

  describe("EmbeddingService", () => {
    it("should check if OpenAI is configured", () => {
      const isConfigured = embeddingService.isConfigured();
      // This test will pass if OPENAI_API_KEY is set
      expect(typeof isConfigured).toBe("boolean");
    });

    it("should prepare content for embedding correctly", () => {
      const title = "Introduction to PostgreSQL";
      const content = "PostgreSQL is a powerful open-source database.";

      const prepared = embeddingService.prepareContentForEmbedding(
        title,
        content
      );

      // Title should appear twice for weighting
      expect(prepared).toContain(title);
      expect(prepared).toContain(content);
      const titleCount = (prepared.match(new RegExp(title, "g")) || []).length;
      expect(titleCount).toBe(2);
    });
  });

  describe("Semantic Search", () => {
    const testEntries = [
      {
        slug: "postgresql-intro",
        title: "Introduction to PostgreSQL",
        content:
          "PostgreSQL is a powerful, open-source relational database system. It supports advanced data types and performance optimization features.",
      },
      {
        slug: "mongodb-basics",
        title: "MongoDB Basics",
        content:
          "MongoDB is a NoSQL document database. It stores data in flexible, JSON-like documents instead of tables and rows.",
      },
      {
        slug: "database-indexing",
        title: "Database Indexing Strategies",
        content:
          "Indexing is crucial for database performance. Both B-tree and hash indexes have their use cases depending on query patterns.",
      },
    ];

    it("should skip semantic search if OpenAI not configured", async () => {
      if (!embeddingService.isConfigured()) {
        await expect(
          searchService.semanticSearch(
            "database performance",
            testWorkspaceId
          )
        ).rejects.toThrow();
      } else {
        // If configured, this is expected to work (tested below)
        expect(true).toBe(true);
      }
    });

    it.skipIf(!process.env["OPENAI_API_KEY"])(
      "should generate embeddings and perform semantic search",
      async () => {
        // Create test entries
        for (const entry of testEntries) {
          const created = await prisma.knowledgeEntry.create({
            data: {
              workspaceId: testWorkspaceId,
              slug: entry.slug,
              title: entry.title,
              content: entry.content,
              status: EntryStatus.PUBLISHED,
              visibility: "WORKSPACE",
              createdBy: testUserId,
              updatedBy: testUserId,
            },
          });

          // Generate embedding
          const preparedContent = embeddingService.prepareContentForEmbedding(
            entry.title,
            entry.content
          );
          await embeddingService.generateAndStoreEmbedding(
            created.id,
            preparedContent
          );
        }

        // Wait a bit for embeddings to be stored
        await new Promise((resolve) => setTimeout(resolve, 1000));

        // Perform semantic search
        const results = await searchService.semanticSearch(
          "relational database systems",
          testWorkspaceId
        );

        // Should return results
        expect(results.data.length).toBeGreaterThan(0);

        // PostgreSQL entry should rank high for "relational database"
        const postgresEntry = results.data.find(
          (r) => r.slug === "postgresql-intro"
        );
        expect(postgresEntry).toBeDefined();
        expect(postgresEntry!.rank).toBeGreaterThan(0);
      },
      30000 // 30 second timeout for API calls
    );

    it.skipIf(!process.env["OPENAI_API_KEY"])(
      "should perform hybrid search combining vector and keyword",
      async () => {
        const results = await searchService.hybridSearch(
          "indexing",
          testWorkspaceId
        );

        // Should return results
        expect(results.data.length).toBeGreaterThan(0);

        // Should find the indexing entry
        const indexingEntry = results.data.find(
          (r) => r.slug === "database-indexing"
        );
        expect(indexingEntry).toBeDefined();
      },
      30000
    );
  });

  describe("Batch Embedding Generation", () => {
    it.skipIf(!process.env["OPENAI_API_KEY"])(
      "should batch generate embeddings",
      async () => {
        // Create entries without embeddings
        const entries = await Promise.all(
          Array.from({ length: 3 }, (_, i) =>
            prisma.knowledgeEntry.create({
              data: {
                workspaceId: testWorkspaceId,
                slug: `batch-test-${i}`,
                title: `Batch Test Entry ${i}`,
                content: `This is test content for batch entry ${i}`,
                status: EntryStatus.PUBLISHED,
                visibility: "WORKSPACE",
                createdBy: testUserId,
                updatedBy: testUserId,
              },
            })
          )
        );

        // Batch generate embeddings
        const entriesForEmbedding = entries.map((e) => ({
          id: e.id,
          content: embeddingService.prepareContentForEmbedding(
            e.title,
            e.content
          ),
        }));

        const successCount = await embeddingService.batchGenerateEmbeddings(
          entriesForEmbedding
        );

        expect(successCount).toBe(3);

        // Verify embeddings were created
        const embeddings = await prisma.knowledgeEmbedding.findMany({
          where: {
            entryId: { in: entries.map((e) => e.id) },
          },
        });

        expect(embeddings.length).toBe(3);
      },
      60000 // 60 second timeout for batch operations
    );
  });
});