chore: fixed tests for document content extraction

This commit is contained in:
Bartosz Sypytkowski 2024-06-14 12:29:14 +02:00
parent 7ad105ee34
commit 3c72f1292d
3 changed files with 48 additions and 19 deletions

View File

@ -399,7 +399,7 @@ mod test {
assert_ne!(contents.len(), 0);
let content: Option<String> = contents[0].get(0);
assert_eq!(content.as_deref(), Some("test-value\n"));
assert_eq!(content.as_deref(), Some("test-value "));
}
#[ignore]

View File

@ -8,18 +8,10 @@ pub fn document_to_plain_text(document: &DocumentData) -> String {
// do a depth-first scan of the document blocks
while let Some(block_id) = stack.pop() {
if let Some(block) = document.blocks.get(block_id) {
if let Some(delta) = block.data.get("delta") {
if let Ok(deltas) = serde_json::from_value::<Vec<TextDelta>>(delta.clone()) {
for delta in deltas {
if let TextDelta::Inserted(text, _) = delta {
let trimmed = text.trim();
if !trimmed.is_empty() {
buf.push_str(&trimmed);
buf.push(' ');
}
}
}
}
if let Some(deltas) = get_delta_from_block_data(block) {
push_deltas_to_str(&mut buf, deltas);
} else if let Some(deltas) = get_delta_from_external_text_id(block, text_map) {
push_deltas_to_str(&mut buf, deltas);
}
if let Some(children) = document.meta.children_map.get(&block.children) {
// we want to process children blocks in the same order they are given in children_map
@ -34,6 +26,45 @@ pub fn document_to_plain_text(document: &DocumentData) -> String {
buf
}
/// Reads the text deltas stored inline under the `delta` key of `block.data`.
///
/// Returns `None` when the block has no `delta` entry, or when that entry cannot be
/// deserialized into a `Vec<TextDelta>`.
fn get_delta_from_block_data(block: &collab_document::blocks::Block) -> Option<Vec<TextDelta>> {
    let raw = block.data.get("delta")?;
    // `serde_json::from_value` takes its input by value, so the borrowed entry is cloned.
    serde_json::from_value::<Vec<TextDelta>>(raw.clone()).ok()
}
/// Looks up the block's text in `text_map` via `block.external_id` and parses it as deltas.
///
/// The lookup only happens when `block.external_type` is `"text"`. Returns `None` when the
/// external type differs, the block has no external id, `text_map` has no entry for that id,
/// or the stored JSON string does not deserialize into a `Vec<TextDelta>`.
fn get_delta_from_external_text_id(
    block: &collab_document::blocks::Block,
    text_map: &std::collections::HashMap<String, String>,
) -> Option<Vec<TextDelta>> {
    // Only blocks whose external type is "text" are resolved through `text_map`.
    if block.external_type.as_deref() != Some("text") {
        return None;
    }
    let text_id = block.external_id.as_deref()?;
    let json = text_map.get(text_id)?;
    serde_json::from_str::<Vec<TextDelta>>(json).ok()
}
/// Appends the text of every `TextDelta::Inserted` entry in `deltas` to `buf`.
///
/// Each inserted fragment is trimmed of surrounding whitespace; fragments that are empty
/// after trimming are skipped. Every appended fragment is followed by a single space, so
/// the buffer ends with a trailing space whenever at least one fragment was written.
/// Deltas of any other variant are ignored.
fn push_deltas_to_str(buf: &mut String, deltas: Vec<TextDelta>) {
    let inserted = deltas.into_iter().filter_map(|delta| match delta {
        TextDelta::Inserted(text, _) => Some(text),
        _ => None,
    });
    for text in inserted {
        let text = text.trim();
        if text.is_empty() {
            continue;
        }
        buf.push_str(text);
        buf.push(' ');
    }
}
#[cfg(test)]
mod test {
use crate::extract::document_to_plain_text;
@ -45,7 +76,7 @@ mod test {
fn document_plain_text() {
let doc = get_started_document_data().unwrap();
let text = document_to_plain_text(&doc);
let expected = "\nWelcome to AppFlowy!\nHere are the basics\nClick anywhere and just start typing.\nHighlight any text, and use the editing menu to style your writing however you like.\nAs soon as you type / a menu will pop up. Select different types of content blocks you can add.\nType / followed by /bullet or /num to create a list.\nClick + New Page button at the bottom of your sidebar to add a new page.\nClick + next to any page title in the sidebar to quickly add a new subpage, Document, Grid, or Kanban Board.\n\n\nKeyboard shortcuts, markdown, and code block\nKeyboard shortcuts guide\nMarkdown reference\nType /code to insert a code block\n// This is the main function.\nfn main() {\n // Print text to the console.\n println!(\"Hello World!\");\n}\n\nHave a question❓\nClick ? at the bottom right for help and support.\n\n\nLike AppFlowy? Follow us:\nGitHub\nTwitter: @appflowy\nNewsletter\n\n\n\n\n";
let expected = "Welcome to AppFlowy! Here are the basics Click anywhere and just start typing. Highlight any text, and use the editing menu to style your writing however you like. As soon as you type / a menu will pop up. Select different types of content blocks you can add. Type / followed by /bullet or /num to create a list. Click + New Page button at the bottom of your sidebar to add a new page. Click + next to any page title in the sidebar to quickly add a new subpage, Document , Grid , or Kanban Board . Keyboard shortcuts, markdown, and code block Keyboard shortcuts guide Markdown reference Type /code to insert a code block // This is the main function.\nfn main() {\n // Print text to the console.\n println!(\"Hello World!\");\n} Have a question❓ Click ? at the bottom right for help and support. Like AppFlowy? Follow us: GitHub Twitter : @appflowy Newsletter ";
assert_eq!(&text, expected);
}
@ -53,7 +84,7 @@ mod test {
fn document_plain_text_with_nested_blocks() {
let doc = get_initial_document_data().unwrap();
let text = document_to_plain_text(&doc);
let expected = "Welcome to AppFlowy!\nHere are the basics\nHere is H3\nClick anywhere and just start typing.\nClick Enter to create a new line.\nHighlight any text, and use the editing menu to style your writing however you like.\nAs soon as you type / a menu will pop up. Select different types of content blocks you can add.\nType / followed by /bullet or /num to create a list.\nClick + New Page button at the bottom of your sidebar to add a new page.\nClick + next to any page title in the sidebar to quickly add a new subpage, Document, Grid, or Kanban Board.\n\n\nKeyboard shortcuts, markdown, and code block\nKeyboard shortcuts guide\nMarkdown reference\nType /code to insert a code block\n// This is the main function.\nfn main() {\n // Print text to the console.\n println!(\"Hello World!\");\n}\n\nThis is a paragraph\nThis is a paragraph\nHave a question❓\nClick ? at the bottom right for help and support.\nThis is a paragraph\nThis is a paragraph\nClick ? at the bottom right for help and support.\n\n\nLike AppFlowy? Follow us:\nGitHub\nTwitter: @appflowy\nNewsletter\n\n\n\n\n";
let expected = "Welcome to AppFlowy! Here are the basics Here is H3 Click anywhere and just start typing. Click Enter to create a new line. Highlight any text, and use the editing menu to style your writing however you like. As soon as you type / a menu will pop up. Select different types of content blocks you can add. Type / followed by /bullet or /num to create a list. Click + New Page button at the bottom of your sidebar to add a new page. Click + next to any page title in the sidebar to quickly add a new subpage, Document , Grid , or Kanban Board . Keyboard shortcuts, markdown, and code block Keyboard shortcuts guide Markdown reference Type /code to insert a code block // This is the main function.\nfn main() {\n // Print text to the console.\n println!(\"Hello World!\");\n} This is a paragraph This is a paragraph Have a question❓ Click ? at the bottom right for help and support. This is a paragraph This is a paragraph Click ? at the bottom right for help and support. Like AppFlowy? Follow us: GitHub Twitter : @appflowy Newsletter ";
assert_eq!(&text, expected);
}
}

View File

@ -3,10 +3,8 @@ use std::sync::Arc;
use async_stream::stream;
use collab::core::collab::MutexCollab;
use collab_document::blocks::DeltaType;
use collab_document::document::Document;
use collab_entity::CollabType;
use dashmap::DashMap;
use database_entity::dto::EmbeddingContentType;
use futures::Stream;
use tokio::sync::watch::Sender;
@ -149,7 +147,7 @@ mod test {
collab_type: CollabType::Document,
content_type: EmbeddingContentType::PlainText,
object_id: "o-1".to_string(),
content: "A\n".to_string(),
content: "A ".to_string(),
})
);
@ -166,7 +164,7 @@ mod test {
collab_type: CollabType::Document,
content_type: EmbeddingContentType::PlainText,
object_id: "o-1".to_string(),
content: "BA\n".to_string(),
content: "BA ".to_string(),
})
);
}