AppFlowy-Cloud/libs/database/src/index/search_ops.rs

91 lines
3.4 KiB
Rust

use chrono::{DateTime, Utc};
use pgvector::Vector;
use sqlx::{Executor, Postgres};
use uuid::Uuid;
/// Logs each search request to track usage by workspace. It either inserts a new record or updates
/// an existing one with the current date, workspace ID, request count, and token usage. This ensures
/// accurate usage tracking for billing or monitoring.
///
/// Searches and retrieves documents based on their similarity to a given search embedding.
/// It filters by workspace, user access, and document status, and returns a limited number
/// of the most relevant documents, sorted by similarity score.
pub async fn search_documents<'a, E: Executor<'a, Database = Postgres>>(
executor: E,
params: SearchDocumentParams,
tokens_used: u32,
) -> Result<Vec<SearchDocumentItem>, sqlx::Error> {
let query = sqlx::query_as::<_, SearchDocumentItem>(
r#"
WITH workspace AS (
INSERT INTO af_workspace_ai_usage(created_at, workspace_id, search_requests, search_tokens_consumed, index_tokens_consumed)
VALUES (now()::date, $2, 1, $6, 0)
ON CONFLICT (created_at, workspace_id) DO UPDATE
SET search_requests = af_workspace_ai_usage.search_requests + 1,
search_tokens_consumed = af_workspace_ai_usage.search_tokens_consumed + $6
RETURNING workspace_id
)
SELECT
em.oid AS object_id,
collab.workspace_id,
em.partition_key AS collab_type,
em.content_type,
LEFT(em.content, $4) AS content_preview,
u.name AS created_by,
collab.created_at AS created_at,
em.embedding <=> $3 AS score
FROM af_collab_embeddings em
JOIN af_collab collab ON em.oid = collab.oid AND em.partition_key = collab.partition_key
JOIN af_user u ON collab.owner_uid = u.uid
WHERE collab.workspace_id = $2 AND NOT(collab.oid = ANY($7::text[]))
ORDER BY em.embedding <=> $3
LIMIT $5
"#,
)
.bind(params.user_id)
.bind(params.workspace_id)
.bind(Vector::from(params.embedding))
.bind(params.preview)
.bind(params.limit)
.bind(tokens_used as i64)
.bind(params.non_viewable_view_ids);
let rows = query.fetch_all(executor).await?;
Ok(rows)
}
/// Input parameters consumed by `search_documents`.
///
/// Each field is bound to one positional placeholder of the search statement; the field
/// comments below name the placeholder and how the statement uses it.
#[derive(Debug, Clone)]
pub struct SearchDocumentParams {
/// ID of the user who is searching. Bound as `$1`; the SQL text visible in
/// `search_documents` does not reference that placeholder.
pub user_id: i64,
/// Workspace ID to search for documents in. Bound as `$2`; also identifies the
/// `af_workspace_ai_usage` row whose counters are incremented.
pub workspace_id: Uuid,
/// How many results should be returned. Bound as `$5` and used as the `LIMIT`.
pub limit: i32,
/// How many characters of the content (starting from the beginning) should be returned.
/// Bound as `$4` and passed to `LEFT(em.content, $4)`.
pub preview: i32,
/// Embedding of the query, converted to a `pgvector::Vector` and bound as `$3`.
/// NOTE(review): the original comment states it is generated by the OpenAI embedder; that is
/// not visible from this file, so confirm against the caller.
pub embedding: Vec<f32>,
/// List of view ids which are not supposed to be returned in the search results.
/// Bound as `$7::text[]`; rows whose `collab.oid` appears in this list are filtered out.
pub non_viewable_view_ids: Vec<String>,
}
/// A single row returned by `search_documents`.
///
/// Derives `sqlx::FromRow`, so each field is populated from the result column with the same
/// name; the field comments below name the SQL expression behind each column.
#[derive(Debug, Clone, sqlx::FromRow)]
pub struct SearchDocumentItem {
/// Document identifier (`em.oid AS object_id`).
pub object_id: String,
/// Workspace identifier, given document belongs to (`collab.workspace_id`).
pub workspace_id: Uuid,
/// Partition key (`em.partition_key AS collab_type`), which maps directly onto
/// [collab_entity::CollabType].
pub collab_type: i32,
/// Type of the content to be presented. Maps directly onto [database_entity::dto::EmbeddingContentType].
pub content_type: i32,
/// First N characters of the indexed content (`LEFT(em.content, $4)`), where N is
/// `SearchDocumentParams::preview`. Optional — presumably `None` when the stored content is
/// NULL; verify against the table schema.
pub content_preview: Option<String>,
/// Name of the user who's an owner of the document (`u.name`, joined through
/// `collab.owner_uid = u.uid`).
pub created_by: String,
/// When the document was created (`collab.created_at`).
pub created_at: DateTime<Utc>,
/// Distance between the stored embedding and the query embedding (`em.embedding <=> $3`).
/// Results are ordered ascending by this value, so lower is better.
pub score: f64,
}