From 832bd7f9c97121fe6d3bd4009d710e3f97276976 Mon Sep 17 00:00:00 2001 From: Khor Shu Heng <32997938+khorshuheng@users.noreply.github.com> Date: Fri, 10 Jan 2025 16:30:35 +0800 Subject: [PATCH] fix: children of trash views and other user's private space should not appear in search (#1146) --- libs/client-api-test/src/test_client.rs | 49 ++++++++++++- libs/database/src/index/search_ops.rs | 6 +- src/biz/collab/folder_view.rs | 81 +++++++++++---------- src/biz/search/ops.rs | 97 +++++++++++++++++++------ tests/collab/collab_embedding_test.rs | 8 ++ tests/search/document_search.rs | 13 +++- 6 files changed, 187 insertions(+), 67 deletions(-) diff --git a/libs/client-api-test/src/test_client.rs b/libs/client-api-test/src/test_client.rs index e6474a56..86343834 100644 --- a/libs/client-api-test/src/test_client.rs +++ b/libs/client-api-test/src/test_client.rs @@ -17,7 +17,8 @@ use collab_database::database::{Database, DatabaseContext}; use collab_database::workspace_database::WorkspaceDatabase; use collab_document::document::Document; use collab_entity::CollabType; -use collab_folder::Folder; +use collab_folder::hierarchy_builder::NestedChildViewBuilder; +use collab_folder::{Folder, ViewLayout}; use collab_user::core::UserAwareness; use mime::Mime; use serde::Deserialize; @@ -33,7 +34,7 @@ use client_api::collab_sync::{SinkConfig, SyncObject, SyncPlugin}; use client_api::entity::id::user_awareness_object_id; use client_api::entity::{ PublishCollabItem, PublishCollabMetadata, QueryWorkspaceMember, QuestionStream, - QuestionStreamValue, + QuestionStreamValue, UpdateCollabWebParams, }; use client_api::ws::{WSClient, WSClientConfig}; use database_entity::dto::{ @@ -140,6 +141,50 @@ impl TestClient { Self::new(registered_user, false).await } + pub async fn insert_view_to_general_space( + &self, + workspace_id: &str, + view_id: &str, + view_name: &str, + view_layout: ViewLayout, + ) { + let mut folder = self.get_folder(workspace_id).await; + let general_space_id = folder + .get_view(workspace_id) + .unwrap() + .children + .first() + .unwrap() + .clone(); + let view = NestedChildViewBuilder::new(self.uid().await, general_space_id.id.clone()) + .with_view_id(view_id.to_string()) + .with_name(view_name) + .with_layout(view_layout) + .build() + .view; + { + let mut txn = folder.collab.transact_mut(); + folder.body.views.insert(&mut txn, view, None); + } + let folder_collab_type = CollabType::Folder; + self + .api_client + .update_web_collab( + workspace_id, + workspace_id, + UpdateCollabWebParams { + doc_state: folder + .encode_collab_v1(|c| folder_collab_type.validate_require_data(c)) + .unwrap() + .doc_state + .to_vec(), + collab_type: CollabType::Folder, + }, + ) + .await + .unwrap(); + } + pub async fn get_folder(&self, workspace_id: &str) -> Folder { let uid = self.uid().await; let folder_collab = self diff --git a/libs/database/src/index/search_ops.rs b/libs/database/src/index/search_ops.rs index 97eedc1a..912b14a3 100644 --- a/libs/database/src/index/search_ops.rs +++ b/libs/database/src/index/search_ops.rs @@ -37,7 +37,7 @@ pub async fn search_documents<'a, E: Executor<'a, Database = Postgres>>( FROM af_collab_embeddings em JOIN af_collab collab ON em.oid = collab.oid AND em.partition_key = collab.partition_key JOIN af_user u ON collab.owner_uid = u.uid - WHERE collab.workspace_id = $2 AND NOT(collab.oid = ANY($7::text[])) + WHERE collab.workspace_id = $2 AND (collab.oid = ANY($7::text[])) ORDER BY em.embedding <=> $3 LIMIT $5 "#, @@ -48,7 +48,7 @@ pub async fn search_documents<'a, E: Executor<'a, Database = Postgres>>( .bind(params.preview) .bind(params.limit) .bind(tokens_used as i64) - .bind(params.non_viewable_view_ids); + .bind(params.searchable_view_ids); let rows = query.fetch_all(executor).await?; Ok(rows) } @@ -66,7 +66,7 @@ pub struct SearchDocumentParams { /// Embedding of the query - generated by OpenAI embedder. pub embedding: Vec, /// List of view ids which is not supposed to be returned in the search results. - pub non_viewable_view_ids: Vec, + pub searchable_view_ids: Vec, } #[derive(Debug, Clone, sqlx::FromRow)] diff --git a/src/biz/collab/folder_view.rs b/src/biz/collab/folder_view.rs index ce9158cd..0c5ab2e4 100644 --- a/src/biz/collab/folder_view.rs +++ b/src/biz/collab/folder_view.rs @@ -9,43 +9,43 @@ use shared_entity::dto::workspace_dto::{ }; use uuid::Uuid; -pub struct PrivateAndNonviewableViews { - pub my_private_view_ids: HashSet, - pub nonviewable_view_ids: HashSet, +pub struct PrivateSpaceAndTrashViews { + pub my_private_space_ids: HashSet, + pub other_private_space_ids: HashSet, + pub view_ids_in_trash: HashSet, } -pub fn private_and_nonviewable_view_ids(folder: &Folder) -> PrivateAndNonviewableViews { - let mut nonviewable_view_ids = HashSet::new(); - let mut my_private_view_ids = HashSet::new(); +pub fn private_space_and_trash_view_ids(folder: &Folder) -> PrivateSpaceAndTrashViews { + let mut view_ids_in_trash = HashSet::new(); + let mut my_private_space_ids = HashSet::new(); + let mut other_private_space_ids = HashSet::new(); for private_section in folder.get_my_private_sections() { - my_private_view_ids.insert(private_section.id); + match folder.get_view(&private_section.id) { + Some(private_view) if check_if_view_is_space(&private_view) => { + my_private_space_ids.insert(private_section.id.clone()); + }, + _ => (), + } } + for private_section in folder.get_all_private_sections() { - if let Some(private_view) = folder.get_view(&private_section.id) { - if check_if_view_is_space(&private_view) && !my_private_view_ids.contains(&private_section.id) + match folder.get_view(&private_section.id) { + Some(private_view) + if check_if_view_is_space(&private_view) + && !my_private_space_ids.contains(&private_section.id) => { - nonviewable_view_ids.insert(private_section.id); - let private_view_ids_in_space: HashSet = folder - .get_views_belong_to(&private_view.id) - .iter() - .map(|v| v.id.clone()) - .collect(); - nonviewable_view_ids.extend(private_view_ids_in_space); - } + other_private_space_ids.insert(private_section.id.clone()); + }, + _ => (), } } for trash_view in folder.get_all_trash_sections() { - nonviewable_view_ids.insert(trash_view.id.clone()); - let child_views_for_trash: HashSet = folder - .get_views_belong_to(&trash_view.id) - .iter() - .map(|v| v.id.clone()) - .collect(); - nonviewable_view_ids.extend(child_views_for_trash); + view_ids_in_trash.insert(trash_view.id.clone()); } - PrivateAndNonviewableViews { - my_private_view_ids, - nonviewable_view_ids, + PrivateSpaceAndTrashViews { + my_private_space_ids, + other_private_space_ids, + view_ids_in_trash, } } @@ -57,15 +57,14 @@ pub fn collab_folder_to_folder_view( max_depth: u32, pubished_view_ids: &HashSet, ) -> Result { - let private_and_nonviewable_views = private_and_nonviewable_view_ids(folder); + let private_space_and_trash_view_ids = private_space_and_trash_view_ids(folder); to_folder_view( workspace_id, "", root_view_id, folder, - &private_and_nonviewable_views.nonviewable_view_ids, - &private_and_nonviewable_views.my_private_view_ids, + &private_space_and_trash_view_ids, pubished_view_ids, false, 0, @@ -83,14 +82,23 @@ fn to_folder_view( parent_view_id: &str, view_id: &str, folder: &Folder, - unviewable: &HashSet, - private_view_ids: &HashSet, + private_space_and_trash_views: &PrivateSpaceAndTrashViews, published_view_ids: &HashSet, parent_is_private: bool, depth: u32, max_depth: u32, ) -> Option { - if depth > max_depth || unviewable.contains(view_id) { + let is_trash = private_space_and_trash_views + .view_ids_in_trash + .contains(view_id); + let is_my_private_space = private_space_and_trash_views + .my_private_space_ids + .contains(view_id); + let is_other_private_space = private_space_and_trash_views + .other_private_space_ids + .contains(view_id); + + if depth > max_depth || is_other_private_space || is_trash { return None; } @@ -114,7 +122,7 @@ fn to_folder_view( return None; } - let is_private = parent_is_private || (view_is_space && private_view_ids.contains(view_id)); + let is_private = parent_is_private || is_my_private_space; let extra = view.extra.as_deref().map(|extra| { serde_json::from_str::(extra).unwrap_or_else(|e| { tracing::warn!("failed to parse extra field({}): {}", extra, e); @@ -130,8 +138,7 @@ fn to_folder_view( view_id, &child_view_id.id, folder, - unviewable, - private_view_ids, + private_space_and_trash_views, published_view_ids, is_private, depth + 1, @@ -267,7 +274,7 @@ pub fn check_if_view_ancestors_fulfil_condition( return true; } current_view_id = view.parent_view_id.clone(); - if current_view_id.is_empty() { + if current_view_id.is_empty() || current_view_id == view.id { return false; } } diff --git a/src/biz/search/ops.rs b/src/biz/search/ops.rs index 2fbe456a..b8438937 100644 --- a/src/biz/search/ops.rs +++ b/src/biz/search/ops.rs @@ -1,15 +1,18 @@ -use crate::api::metrics::RequestMetrics; -use crate::biz::collab::folder_view::private_and_nonviewable_view_ids; +use crate::biz::collab::folder_view::PrivateSpaceAndTrashViews; use crate::biz::collab::utils::get_latest_collab_folder; +use crate::{ + api::metrics::RequestMetrics, biz::collab::folder_view::private_space_and_trash_view_ids, +}; use app_error::AppError; use appflowy_ai_client::dto::{ EmbeddingEncodingFormat, EmbeddingInput, EmbeddingModel, EmbeddingOutput, EmbeddingRequest, }; use appflowy_collaborate::collab::storage::CollabAccessControlStorage; +use collab_folder::{Folder, View}; use database::collab::GetCollabOrigin; -use itertools::Itertools; use std::collections::HashSet; use std::sync::Arc; +use tracing::info; use database::index::{search_documents, SearchDocumentParams}; use shared_entity::dto::search_dto::{ @@ -20,12 +23,60 @@ use sqlx::PgPool; use indexer::scheduler::IndexerScheduler; use uuid::Uuid; +static MAX_SEARCH_DEPTH: i32 = 10; + +fn is_view_searchable(view: &View, workspace_id: &str) -> bool { + view.id != workspace_id && view.parent_view_id != workspace_id && view.layout.is_document() +} + +fn populate_searchable_view_ids( + folder: &Folder, + private_space_and_trash_views: &PrivateSpaceAndTrashViews, + searchable_view_ids: &mut HashSet, + workspace_id: &str, + current_view_id: &str, + depth: i32, + max_depth: i32, +) { + if depth > max_depth { + return; + } + let is_other_private_space = private_space_and_trash_views + .other_private_space_ids + .contains(current_view_id); + let is_trash = private_space_and_trash_views + .view_ids_in_trash + .contains(current_view_id); + if is_other_private_space || is_trash { + return; + } + let view = match folder.get_view(current_view_id) { + Some(view) => view, + None => return, + }; + + if is_view_searchable(&view, workspace_id) { + searchable_view_ids.insert(current_view_id.to_string()); + } + for child in view.children.iter() { + populate_searchable_view_ids( + folder, + private_space_and_trash_views, + searchable_view_ids, + workspace_id, + &child.id, + depth + 1, + max_depth, + ); + } +} + pub async fn search_document( pg_pool: &PgPool, collab_storage: &CollabAccessControlStorage, indexer_scheduler: &Arc, uid: i64, - workspace_id: Uuid, + workspace_uuid: Uuid, request: SearchDocumentRequest, metrics: &RequestMetrics, ) -> Result, AppError> { @@ -38,10 +89,10 @@ pub async fn search_document( }) .await?; let total_tokens = embeddings.usage.total_tokens as u32; - metrics.record_search_tokens_used(&workspace_id, total_tokens); + metrics.record_search_tokens_used(&workspace_uuid, total_tokens); tracing::info!( "workspace {} OpenAI API search tokens used: {}", - workspace_id, + workspace_uuid, total_tokens ); @@ -61,32 +112,30 @@ pub async fn search_document( let folder = get_latest_collab_folder( collab_storage, GetCollabOrigin::User { uid }, - &workspace_id.to_string(), + &workspace_uuid.to_string(), ) .await?; - let private_and_nonviewable_views = private_and_nonviewable_view_ids(&folder); - let space_ids: HashSet = folder - .get_view(&workspace_id.to_string()) - .ok_or_else(|| AppError::Internal(anyhow::anyhow!("Workspace view not found in folder")))? - .children - .iter() - .map(|c| c.id.clone()) - .collect(); - - let mut non_searchable_view_ids = private_and_nonviewable_views.nonviewable_view_ids; - non_searchable_view_ids.extend(space_ids); + let private_space_and_trash_views = private_space_and_trash_view_ids(&folder); + let mut searchable_view_ids = HashSet::new(); + populate_searchable_view_ids( + &folder, + &private_space_and_trash_views, + &mut searchable_view_ids, + &workspace_uuid.to_string(), + &workspace_uuid.to_string(), + 0, + MAX_SEARCH_DEPTH, + ); + info!("{:?}", searchable_view_ids); let results = search_documents( pg_pool, SearchDocumentParams { user_id: uid, - workspace_id, + workspace_id: workspace_uuid, limit: request.limit.unwrap_or(10) as i32, preview: request.preview_size.unwrap_or(500) as i32, embedding, - non_viewable_view_ids: non_searchable_view_ids - .iter() - .map(|uuid| uuid.to_string()) - .collect_vec(), + searchable_view_ids: searchable_view_ids.into_iter().collect(), }, total_tokens, ) @@ -94,7 +143,7 @@ pub async fn search_document( tracing::trace!( "user {} search request in workspace {} returned {} results for query: `{}`", uid, - workspace_id, + workspace_uuid, results.len(), request.query ); diff --git a/tests/collab/collab_embedding_test.rs b/tests/collab/collab_embedding_test.rs index 97f560bf..6c2a7784 100644 --- a/tests/collab/collab_embedding_test.rs +++ b/tests/collab/collab_embedding_test.rs @@ -50,6 +50,14 @@ async fn document_full_sync_then_search_test() { collab_type: CollabType::Document, }; test_client.api_client.create_collab(params).await.unwrap(); + test_client + .insert_view_to_general_space( + &workspace_id, + &object_id, + "AppFlowy", + collab_folder::ViewLayout::Document, + ) + .await; let contents = vec![ "AppFlowy is an open-source project.", diff --git a/tests/search/document_search.rs b/tests/search/document_search.rs index 522722b9..7dac7513 100644 --- a/tests/search/document_search.rs +++ b/tests/search/document_search.rs @@ -7,6 +7,7 @@ use collab::preclude::Collab; use collab_document::document::Document; use collab_document::importer::md_importer::MDImporter; use collab_entity::CollabType; +use collab_folder::ViewLayout; use shared_entity::dto::chat_dto::{CreateChatMessageParams, CreateChatParams}; use tokio::time::sleep; use workspace_template::document::getting_started::getting_started_document_data; @@ -15,7 +16,6 @@ use workspace_template::document::getting_started::getting_started_document_data async fn test_embedding_when_create_document() { let mut test_client = TestClient::new_user().await; let workspace_id = test_client.workspace_id().await; - let object_id_1 = uuid::Uuid::new_v4().to_string(); let the_five_dysfunctions_of_a_team = create_document_collab(&object_id_1, "the_five_dysfunctions_of_a_team.md").await; @@ -29,6 +29,14 @@ async fn test_embedding_when_create_document() { ) .await .unwrap(); + test_client + .insert_view_to_general_space( + &workspace_id, + &object_id_1, + "five dysfunctional", + ViewLayout::Document, + ) + .await; test_client .wait_until_get_embedding(&workspace_id, &object_id_1) @@ -46,6 +54,9 @@ async fn test_embedding_when_create_document() { ) .await .unwrap(); + test_client + .insert_view_to_general_space(&workspace_id, &object_id_2, "tennis", ViewLayout::Document) + .await; test_client .wait_until_get_embedding(&workspace_id, &object_id_2)