AppFlowy-Cloud/libs/database/src/index/search_ops.rs

use chrono::{DateTime, Utc};
use pgvector::Vector;
use sqlx::{Executor, Postgres};
use uuid::Uuid;

/// Logs each search request to track usage by workspace. It either inserts a new record or updates
/// an existing one with the current date, workspace ID, request count, and token usage. This ensures
/// accurate usage tracking for billing or monitoring.
///
/// Searches and retrieves documents based on their similarity to a given search embedding.
/// It filters by workspace, user access, and document status, and returns a limited number
/// of the most relevant documents, sorted by similarity score.
pub async fn search_documents<'a, E: Executor<'a, Database = Postgres>>(
  executor: E,
  params: SearchDocumentParams,
  tokens_used: u32,
) -> Result<Vec<SearchDocumentItem>, sqlx::Error> {
  let query = sqlx::query_as::<_, SearchDocumentItem>(
    r#"
    WITH workspace AS (
      INSERT INTO af_workspace_ai_usage(created_at, workspace_id, search_requests, search_tokens_consumed, index_tokens_consumed)
      VALUES (now()::date, $2, 1, $6, 0)
      ON CONFLICT (created_at, workspace_id) DO UPDATE
      SET search_requests = af_workspace_ai_usage.search_requests + 1,
          search_tokens_consumed = af_workspace_ai_usage.search_tokens_consumed + $6
      RETURNING workspace_id
    )
    SELECT
      em.oid AS object_id,
      collab.workspace_id,
      em.partition_key AS collab_type,
      em.content_type,
      LEFT(em.content, $4) AS content_preview,
      u.name AS created_by,
      collab.created_at AS created_at,
      em.embedding <=> $3 AS score
    FROM af_collab_embeddings em
    JOIN af_collab collab ON em.oid = collab.oid AND em.partition_key = collab.partition_key
    JOIN af_user u ON collab.owner_uid = u.uid
    WHERE collab.workspace_id = $2 AND NOT(collab.oid = ANY($7::text[]))
    ORDER BY em.embedding <=> $3
    LIMIT $5
  "#,
  )
  .bind(params.user_id)
  .bind(params.workspace_id)
  .bind(Vector::from(params.embedding))
  .bind(params.preview)
  .bind(params.limit)
  .bind(tokens_used as i64)
  .bind(params.non_viewable_view_ids);
  let rows = query.fetch_all(executor).await?;
  Ok(rows)
}

/// Input parameters for [`search_documents`].
///
/// The fields are bound to the positional placeholders of the search query; the comments below
/// describe how the query uses each of them.
#[derive(Debug, Clone)]
pub struct SearchDocumentParams {
  /// ID of the user who is searching.
  ///
  /// Bound as the first query parameter, but the query text in [`search_documents`] does not
  /// reference it.
  pub user_id: i64,
  /// Workspace ID to search for documents in. Also identifies the workspace whose AI usage
  /// record is updated by the search.
  pub workspace_id: Uuid,
  /// How many results should be returned at most (used as the query's `LIMIT`).
  pub limit: i32,
  /// How many characters of the content (starting from the beginning) should be returned
  /// as the content preview.
  pub preview: i32,
  /// Embedding of the query - generated by OpenAI embedder.
  pub embedding: Vec<f32>,
  /// List of view ids which must not be returned in the search results. Documents whose
  /// object id appears in this list are excluded by the query.
  pub non_viewable_view_ids: Vec<String>,
}

/// A single row returned by [`search_documents`].
///
/// Derives [`sqlx::FromRow`], so the field names correspond to the column aliases selected by
/// the search query.
#[derive(Debug, Clone, sqlx::FromRow)]
pub struct SearchDocumentItem {
  /// Document identifier (the `oid` of the embedding row).
  pub object_id: String,
  /// Identifier of the workspace the document belongs to.
  pub workspace_id: Uuid,
  /// Partition key, which maps directly onto [collab_entity::CollabType].
  pub collab_type: i32,
  /// Type of the content to be presented. Maps directly onto [database_entity::dto::EmbeddingContentType].
  pub content_type: i32,
  /// First N characters of the indexed content, where N is the requested preview length.
  pub content_preview: Option<String>,
  /// Name of the user who owns the document.
  pub created_by: String,
  /// When the document was created.
  pub created_at: DateTime<Utc>,
  /// Distance between the document's embedding and the query embedding, computed with the
  /// `<=>` operator (documented by pgvector as cosine distance). Lower is better.
  pub score: f64,
}