stack/apps/api/src/knowledge/services/search.service.ts

import { Injectable } from "@nestjs/common";
import { EntryStatus, Prisma } from "@prisma/client";
import { PrismaService } from "../../prisma/prisma.service";
import type { KnowledgeEntryWithTags, PaginatedEntries } from "../entities/knowledge-entry.entity";
import { KnowledgeCacheService } from "./cache.service";
import { EmbeddingService } from "./embedding.service";

/**
 * Options accepted by the paginated search methods of SearchService.
 *
 * Every field is optional; the service fills in defaults for anything omitted.
 */
export interface SearchOptions {
  /**
   * Restrict results to entries with this status. When omitted, the service
   * returns every status except ARCHIVED.
   */
  status?: EntryStatus | undefined;
  /** 1-based page number; the service falls back to 1 when omitted. */
  page?: number | undefined;
  /** Maximum number of results per page; the service falls back to 20 when omitted. */
  limit?: number | undefined;
}

/**
 * A knowledge entry (with its tags) annotated with search relevance data.
 */
export interface SearchResult extends KnowledgeEntryWithTags {
  /**
   * Relevance score; higher means more relevant. Its scale depends on the
   * search mode: `ts_rank` for full-text search, `1 - distance` for semantic
   * search, and the Reciprocal Rank Fusion score for hybrid search.
   */
  rank: number;
  /**
   * Content excerpt produced by `ts_headline`, with matches wrapped in
   * `<mark>…</mark>`. Undefined when the query produced no headline
   * (semantic search always selects NULL for it).
   */
  headline?: string | undefined;
}

/**
 * One page of search results together with paging metadata.
 */
export interface PaginatedSearchResults {
  /** The results on this page, in ranking order. */
  data: SearchResult[];
  /** Paging metadata for the whole result set. */
  pagination: {
    /** 1-based number of the page that was returned. */
    page: number;
    /** Page size that was applied. */
    limit: number;
    /** Total number of matching entries across all pages. */
    total: number;
    /** Number of pages, computed by the service as `ceil(total / limit)`. */
    totalPages: number;
  };
  /** The query string the caller supplied. */
  query: string;
}

/**
 * Row shape returned by the raw PostgreSQL search queries.
 *
 * Field names mirror the snake_case column names selected from
 * `knowledge_entries`; the service maps them to the camelCase
 * `SearchResult` shape before returning them.
 */
interface RawSearchResult {
  id: string;
  workspace_id: string;
  slug: string;
  title: string;
  content: string;
  content_html: string | null;
  summary: string | null;
  status: EntryStatus;
  // Narrowed by the service to "PRIVATE" | "WORKSPACE" | "PUBLIC" via a type assertion.
  visibility: string;
  created_at: Date;
  updated_at: Date;
  created_by: string;
  updated_by: string;
  // Computed column: ts_rank, vector similarity or RRF score depending on the query.
  rank: number;
  // Computed column: ts_headline excerpt, or NULL when the query does not build one.
  headline: string | null;
}

/**
 * Service for searching knowledge entries.
 *
 * Offers PostgreSQL full-text search (cached), tag filtering, a recency
 * listing, vector-similarity search, and a hybrid mode that fuses the keyword
 * and vector rankings with Reciprocal Rank Fusion.
 */
@Injectable()
export class SearchService {
  /** Page used when the caller does not supply a usable page number. */
  private static readonly DEFAULT_PAGE = 1;

  /** Page size used when the caller does not supply a usable limit. */
  private static readonly DEFAULT_LIMIT = 20;

  constructor(
    private readonly prisma: PrismaService,
    private readonly cache: KnowledgeCacheService,
    private readonly embedding: EmbeddingService
  ) {}

  /**
   * Full-text search on title and content using PostgreSQL ts_vector.
   *
   * Results are looked up in (and written to) the search cache, keyed by the
   * sanitized query plus the status/page/limit filters.
   *
   * @param query - The search query string
   * @param workspaceId - The workspace to search within
   * @param options - Search options (status filter, pagination)
   * @returns Paginated search results ranked by relevance; an empty page when
   *   the query is blank after sanitization
   */
  async search(
    query: string,
    workspaceId: string,
    options: SearchOptions = {}
  ): Promise<PaginatedSearchResults> {
    const { page, limit, offset } = SearchService.resolvePagination(options);

    // Sanitize and prepare the search query
    const sanitizedQuery = this.sanitizeSearchQuery(query);

    if (!sanitizedQuery) {
      return SearchService.buildSearchPage([], 0, page, limit, query);
    }

    // Check cache first
    const filters = { status: options.status, page, limit };
    const cached = await this.cache.getSearch<PaginatedSearchResults>(
      workspaceId,
      sanitizedQuery,
      filters
    );
    if (cached) {
      // The cache key is the sanitized query, so different raw queries can
      // share an entry; always echo back the query this caller actually sent.
      return { ...cached, query };
    }

    const statusFilter = SearchService.buildStatusFilter(options.status);

    // NOTE(review): PostgreSQL documents that ts_headline output is not
    // guaranteed to be safe for direct inclusion in web pages — confirm that
    // consumers escape/sanitize `headline` before rendering it as HTML.

    // PostgreSQL full-text search query
    // Uses ts_rank for relevance scoring with weights: title (A=1.0), content (B=0.4)
    const rowsQuery = this.prisma.$queryRaw<RawSearchResult[]>`
      WITH search_query AS (
        SELECT plainto_tsquery('english', ${sanitizedQuery}) AS query
      )
      SELECT
        e.id,
        e.workspace_id,
        e.slug,
        e.title,
        e.content,
        e.content_html,
        e.summary,
        e.status,
        e.visibility,
        e.created_at,
        e.updated_at,
        e.created_by,
        e.updated_by,
        ts_rank(
          setweight(to_tsvector('english', e.title), 'A') ||
          setweight(to_tsvector('english', e.content), 'B'),
          sq.query
        ) AS rank,
        ts_headline(
          'english',
          e.content,
          sq.query,
          'MaxWords=50, MinWords=25, StartSel=<mark>, StopSel=</mark>'
        ) AS headline
      FROM knowledge_entries e, search_query sq
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
        AND (
          to_tsvector('english', e.title) @@ sq.query
          OR to_tsvector('english', e.content) @@ sq.query
        )
      ORDER BY rank DESC, e.updated_at DESC
      LIMIT ${limit}
      OFFSET ${offset}
    `;

    // Total count for pagination (same predicate, no LIMIT/OFFSET)
    const countQuery = this.prisma.$queryRaw<[{ count: bigint }]>`
      SELECT COUNT(*) as count
      FROM knowledge_entries e
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
        AND (
          to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
          OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
        )
    `;

    // The two queries are independent, so run them concurrently.
    const [rows, countRows] = await Promise.all([rowsQuery, countQuery]);
    const total = Number(countRows[0].count);

    const data = await this.toSearchResults(rows);
    const result = SearchService.buildSearchPage(data, total, page, limit, query);

    // Cache the result
    await this.cache.setSearch(workspaceId, sanitizedQuery, filters, result);

    return result;
  }

  /**
   * Search entries by tags (entries must have ALL specified tags).
   *
   * @param tags - Array of tag slugs to filter by
   * @param workspaceId - The workspace to search within
   * @param options - Search options (status filter, pagination)
   * @returns Paginated entries that have all specified tags, most recently
   *   updated first; an empty page when `tags` is empty
   */
  async searchByTags(
    tags: string[],
    workspaceId: string,
    options: SearchOptions = {}
  ): Promise<PaginatedEntries> {
    const { page, limit, offset: skip } = SearchService.resolvePagination(options);

    if (tags.length === 0) {
      return {
        data: [],
        pagination: SearchService.buildPagination(page, limit, 0),
      };
    }

    // One `some` condition per slug, AND-ed together, so an entry must carry
    // every requested tag.
    const where: Prisma.KnowledgeEntryWhereInput = {
      workspaceId,
      status: options.status ?? { not: EntryStatus.ARCHIVED },
      AND: tags.map((tagSlug) => ({
        tags: {
          some: {
            tag: {
              slug: tagSlug,
            },
          },
        },
      })),
    };

    // Count and page fetch are independent, so run them concurrently.
    const [total, entries] = await Promise.all([
      this.prisma.knowledgeEntry.count({ where }),
      this.prisma.knowledgeEntry.findMany({
        where,
        include: {
          tags: {
            include: {
              tag: true,
            },
          },
        },
        orderBy: {
          updatedAt: "desc",
        },
        skip,
        take: limit,
      }),
    ]);

    return {
      data: entries.map((entry) => SearchService.toEntryWithTags(entry)),
      pagination: SearchService.buildPagination(page, limit, total),
    };
  }

  /**
   * Get recently modified entries.
   *
   * @param workspaceId - The workspace to query
   * @param limit - Maximum number of entries to return (default: 10)
   * @param status - Optional status filter; when omitted, ARCHIVED entries are excluded
   * @returns Array of entries ordered by last update, newest first
   */
  async recentEntries(
    workspaceId: string,
    limit = 10,
    status?: EntryStatus
  ): Promise<KnowledgeEntryWithTags[]> {
    const where: Prisma.KnowledgeEntryWhereInput = {
      workspaceId,
      status: status ?? { not: EntryStatus.ARCHIVED },
    };

    const entries = await this.prisma.knowledgeEntry.findMany({
      where,
      include: {
        tags: {
          include: {
            tag: true,
          },
        },
      },
      orderBy: {
        updatedAt: "desc",
      },
      take: limit,
    });

    return entries.map((entry) => SearchService.toEntryWithTags(entry));
  }

  /**
   * Normalize a user-supplied search string before it is used as a
   * `plainto_tsquery` argument and as a cache key.
   *
   * tsquery operator characters are replaced by spaces, whitespace runs are
   * collapsed to a single space and the result is trimmed, so equivalent
   * inputs ("a & b", "a   b", " a b ") normalize to the same string. The
   * value is always passed to SQL as a bound parameter of a `$queryRaw`
   * tagged template, never concatenated into the statement.
   *
   * @param query - Raw query string from the caller
   * @returns The normalized query, or "" when nothing searchable remains
   */
  private sanitizeSearchQuery(query: string): string {
    if (!query || typeof query !== "string") {
      return "";
    }

    return query
      .replace(/[&|!:*()]/g, " ") // drop full-text search operators
      .replace(/\s+/g, " ") // collapse whitespace, including gaps left by the removal above
      .trim();
  }

  /**
   * Fetch tags for a list of entry IDs.
   *
   * @param entryIds - IDs of the entries whose tags should be loaded
   * @returns Map from entry ID to that entry's tags; entries without tags
   *   have no key in the map
   */
  private async fetchTagsForEntries(
    entryIds: string[]
  ): Promise<Map<string, { id: string; name: string; slug: string; color: string | null }[]>> {
    if (entryIds.length === 0) {
      return new Map();
    }

    const entryTags = await this.prisma.knowledgeEntryTag.findMany({
      where: {
        entryId: { in: entryIds },
      },
      include: {
        tag: true,
      },
    });

    const tagsMap = new Map<
      string,
      { id: string; name: string; slug: string; color: string | null }[]
    >();

    for (const et of entryTags) {
      const tags = tagsMap.get(et.entryId) ?? [];
      tags.push({
        id: et.tag.id,
        name: et.tag.name,
        slug: et.tag.slug,
        color: et.tag.color,
      });
      tagsMap.set(et.entryId, tags);
    }

    return tagsMap;
  }

  /**
   * Semantic search using vector similarity.
   *
   * Only entries that have a row in `knowledge_embeddings` can be returned.
   *
   * @param query - The search query string
   * @param workspaceId - The workspace to search within
   * @param options - Search options (status filter, pagination)
   * @returns Paginated search results ranked by semantic similarity; an empty
   *   page when the query is blank
   * @throws Error when the embedding service is not configured
   */
  async semanticSearch(
    query: string,
    workspaceId: string,
    options: SearchOptions = {}
  ): Promise<PaginatedSearchResults> {
    if (!this.embedding.isConfigured()) {
      throw new Error("Semantic search requires OPENAI_API_KEY to be configured");
    }

    const { page, limit, offset } = SearchService.resolvePagination(options);

    // Consistent with the keyword and hybrid searches: a blank query yields
    // an empty page instead of a round-trip to the embedding service.
    if (!query || query.trim().length === 0) {
      return SearchService.buildSearchPage([], 0, page, limit, query);
    }

    const embeddingString = await this.embedQueryAsVectorLiteral(query);
    const statusFilter = SearchService.buildStatusFilter(options.status);

    // Vector similarity search using cosine distance
    const rowsQuery = this.prisma.$queryRaw<RawSearchResult[]>`
      SELECT
        e.id,
        e.workspace_id,
        e.slug,
        e.title,
        e.content,
        e.content_html,
        e.summary,
        e.status,
        e.visibility,
        e.created_at,
        e.updated_at,
        e.created_by,
        e.updated_by,
        (1 - (emb.embedding <=> ${embeddingString}::vector)) AS rank,
        NULL AS headline
      FROM knowledge_entries e
      INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
      ORDER BY emb.embedding <=> ${embeddingString}::vector
      LIMIT ${limit}
      OFFSET ${offset}
    `;

    // Total count for pagination
    const countQuery = this.prisma.$queryRaw<[{ count: bigint }]>`
      SELECT COUNT(*) as count
      FROM knowledge_entries e
      INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
      WHERE e.workspace_id = ${workspaceId}::uuid
        ${statusFilter}
    `;

    // The two queries are independent, so run them concurrently.
    const [rows, countRows] = await Promise.all([rowsQuery, countQuery]);
    const total = Number(countRows[0].count);

    const data = await this.toSearchResults(rows);
    return SearchService.buildSearchPage(data, total, page, limit, query);
  }

  /**
   * Hybrid search combining vector similarity and full-text search.
   * Uses Reciprocal Rank Fusion (RRF) to combine rankings.
   *
   * Falls back to plain keyword search when the embedding service is not
   * configured.
   *
   * @param query - The search query string
   * @param workspaceId - The workspace to search within
   * @param options - Search options (status filter, pagination)
   * @returns Paginated search results ranked by combined relevance; an empty
   *   page when the query is blank after sanitization
   */
  async hybridSearch(
    query: string,
    workspaceId: string,
    options: SearchOptions = {}
  ): Promise<PaginatedSearchResults> {
    if (!this.embedding.isConfigured()) {
      // Fall back to keyword search if embeddings not configured
      return this.search(query, workspaceId, options);
    }

    const { page, limit, offset } = SearchService.resolvePagination(options);

    // Sanitize query for keyword search
    const sanitizedQuery = this.sanitizeSearchQuery(query);

    if (!sanitizedQuery) {
      return SearchService.buildSearchPage([], 0, page, limit, query);
    }

    // The embedding is generated from the caller's original query text;
    // only the keyword half uses the sanitized form.
    const embeddingString = await this.embedQueryAsVectorLiteral(query);
    const statusFilter = SearchService.buildStatusFilter(options.status);

    // Hybrid search using Reciprocal Rank Fusion (RRF)
    // Combines vector similarity and full-text search rankings
    const rowsQuery = this.prisma.$queryRaw<RawSearchResult[]>`
      WITH vector_search AS (
        SELECT
          e.id,
          ROW_NUMBER() OVER (ORDER BY emb.embedding <=> ${embeddingString}::vector) AS rank
        FROM knowledge_entries e
        INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
      ),
      keyword_search AS (
        SELECT
          e.id,
          ROW_NUMBER() OVER (
            ORDER BY ts_rank(
              setweight(to_tsvector('english', e.title), 'A') ||
              setweight(to_tsvector('english', e.content), 'B'),
              plainto_tsquery('english', ${sanitizedQuery})
            ) DESC
          ) AS rank
        FROM knowledge_entries e
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
          AND (
            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
          )
      ),
      combined AS (
        SELECT
          COALESCE(v.id, k.id) AS id,
          -- Reciprocal Rank Fusion: RRF(d) = sum(1 / (k + rank_i))
          -- k=60 is a common constant that prevents high rankings from dominating
          (COALESCE(1.0 / (60 + v.rank), 0) + COALESCE(1.0 / (60 + k.rank), 0)) AS rrf_score
        FROM vector_search v
        FULL OUTER JOIN keyword_search k ON v.id = k.id
      )
      SELECT
        e.id,
        e.workspace_id,
        e.slug,
        e.title,
        e.content,
        e.content_html,
        e.summary,
        e.status,
        e.visibility,
        e.created_at,
        e.updated_at,
        e.created_by,
        e.updated_by,
        c.rrf_score AS rank,
        ts_headline(
          'english',
          e.content,
          plainto_tsquery('english', ${sanitizedQuery}),
          'MaxWords=50, MinWords=25, StartSel=<mark>, StopSel=</mark>'
        ) AS headline
      FROM combined c
      INNER JOIN knowledge_entries e ON c.id = e.id
      ORDER BY c.rrf_score DESC, e.updated_at DESC
      LIMIT ${limit}
      OFFSET ${offset}
    `;

    // Total count: union of the vector and keyword candidate sets
    const countQuery = this.prisma.$queryRaw<[{ count: bigint }]>`
      WITH vector_search AS (
        SELECT e.id
        FROM knowledge_entries e
        INNER JOIN knowledge_embeddings emb ON e.id = emb.entry_id
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
      ),
      keyword_search AS (
        SELECT e.id
        FROM knowledge_entries e
        WHERE e.workspace_id = ${workspaceId}::uuid
          ${statusFilter}
          AND (
            to_tsvector('english', e.title) @@ plainto_tsquery('english', ${sanitizedQuery})
            OR to_tsvector('english', e.content) @@ plainto_tsquery('english', ${sanitizedQuery})
          )
      )
      SELECT COUNT(DISTINCT id) as count
      FROM (
        SELECT id FROM vector_search
        UNION
        SELECT id FROM keyword_search
      ) AS combined
    `;

    // The two queries are independent, so run them concurrently.
    const [rows, countRows] = await Promise.all([rowsQuery, countQuery]);
    const total = Number(countRows[0].count);

    const data = await this.toSearchResults(rows);
    return SearchService.buildSearchPage(data, total, page, limit, query);
  }

  /**
   * Generate the embedding for `query` and format it as a `[v1,v2,...]`
   * literal that the raw queries cast to `vector`.
   */
  private async embedQueryAsVectorLiteral(query: string): Promise<string> {
    const vector = await this.embedding.generateEmbedding(query);
    return `[${vector.join(",")}]`;
  }

  /**
   * Attach tags to raw SQL rows and convert them to the camelCase
   * `SearchResult` shape. Row order is preserved.
   */
  private async toSearchResults(rows: RawSearchResult[]): Promise<SearchResult[]> {
    const tagsByEntry = await this.fetchTagsForEntries(rows.map((row) => row.id));

    return rows.map((row) => ({
      id: row.id,
      workspaceId: row.workspace_id,
      slug: row.slug,
      title: row.title,
      content: row.content,
      contentHtml: row.content_html,
      summary: row.summary,
      status: row.status,
      visibility: row.visibility as "PRIVATE" | "WORKSPACE" | "PUBLIC",
      createdAt: row.created_at,
      updatedAt: row.updated_at,
      createdBy: row.created_by,
      updatedBy: row.updated_by,
      rank: row.rank,
      headline: row.headline ?? undefined,
      tags: tagsByEntry.get(row.id) ?? [],
    }));
  }

  /**
   * Convert a Prisma entry (loaded with `tags.tag` included) to the response
   * shape, flattening the join rows into plain tag objects.
   */
  private static toEntryWithTags(
    entry: Prisma.KnowledgeEntryGetPayload<{ include: { tags: { include: { tag: true } } } }>
  ): KnowledgeEntryWithTags {
    return {
      id: entry.id,
      workspaceId: entry.workspaceId,
      slug: entry.slug,
      title: entry.title,
      content: entry.content,
      contentHtml: entry.contentHtml,
      summary: entry.summary,
      status: entry.status,
      visibility: entry.visibility,
      createdAt: entry.createdAt,
      updatedAt: entry.updatedAt,
      createdBy: entry.createdBy,
      updatedBy: entry.updatedBy,
      tags: entry.tags.map((et) => ({
        id: et.tag.id,
        name: et.tag.name,
        slug: et.tag.slug,
        color: et.tag.color,
      })),
    };
  }

  /**
   * SQL fragment restricting `e.status`: the requested status when given,
   * otherwise everything except ARCHIVED.
   */
  private static buildStatusFilter(status: EntryStatus | undefined): Prisma.Sql {
    return status
      ? Prisma.sql`AND e.status = ${status}::text::"EntryStatus"`
      : Prisma.sql`AND e.status != 'ARCHIVED'`;
  }

  /**
   * Resolve page, limit and the derived row offset from the caller's options.
   * Both values are normalized to integers >= 1 so the generated
   * LIMIT/OFFSET are never negative and `totalPages` is always finite.
   */
  private static resolvePagination(options: SearchOptions): {
    page: number;
    limit: number;
    offset: number;
  } {
    const page = SearchService.toPositiveInt(options.page, SearchService.DEFAULT_PAGE);
    const limit = SearchService.toPositiveInt(options.limit, SearchService.DEFAULT_LIMIT);
    return { page, limit, offset: (page - 1) * limit };
  }

  /**
   * Coerce `value` to an integer >= 1. Missing or non-finite input yields
   * `fallback`; fractional input is floored; values below 1 are raised to 1.
   */
  private static toPositiveInt(value: number | undefined, fallback: number): number {
    const candidate = Math.floor(Number(value ?? fallback));
    return Number.isFinite(candidate) ? Math.max(1, candidate) : fallback;
  }

  /** Paging metadata for a result set of `total` rows. */
  private static buildPagination(
    page: number,
    limit: number,
    total: number
  ): PaginatedSearchResults["pagination"] {
    return {
      page,
      limit,
      total,
      totalPages: Math.ceil(total / limit),
    };
  }

  /** Assemble the response envelope shared by the query-based searches. */
  private static buildSearchPage(
    data: SearchResult[],
    total: number,
    page: number,
    limit: number,
    query: string
  ): PaginatedSearchResults {
    return {
      data,
      pagination: SearchService.buildPagination(page, limit, total),
      query,
    };
  }
}