feat: try to resotre from snapshot when fail to initialize the collab in CollabStoragePlugin (#222)

This commit is contained in:
Nathan.fooo 2023-12-18 02:27:49 +08:00 committed by GitHub
parent 80f72b91b5
commit 453329dc0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 116 additions and 43 deletions

View File

@ -22,6 +22,8 @@ services:
- 5433:5432
volumes:
- ./migrations/before:/docker-entrypoint-initdb.d
# comment out the following line if you want to persist data when restarting docker
#- postgres_data:/var/lib/postgresql/data
redis:
restart: on-failure
@ -95,3 +97,5 @@ services:
volumes:
- ./docker/pgadmin/servers.json:/pgadmin4/servers.json
volumes:
postgres_data:

View File

@ -5,6 +5,7 @@ use database_entity::dto::{
InsertCollabParams, QueryCollabResult, RawData,
};
use crate::collab::SNAPSHOT_PER_HOUR;
use app_error::AppError;
use chrono::{Duration, Utc};
use database_entity::pg_row::AFCollabMemerAccessLevelRow;
@ -288,8 +289,6 @@ pub async fn create_snapshot(
Ok(())
}
const SNAPSHOT_PER_HOUR: i64 = 3;
/// Determines whether a new snapshot should be created for the given `oid`.
///
/// This asynchronous function checks the most recent snapshot creation time for the specified `oid`.

View File

@ -17,7 +17,8 @@ use std::sync::{Arc, Weak};
use tracing::{debug, warn};
use validator::Validate;
pub const COLLAB_SNAPSHOT_LIMIT: i64 = 10;
pub const COLLAB_SNAPSHOT_LIMIT: i64 = 15;
pub const SNAPSHOT_PER_HOUR: i64 = 6;
pub type DatabaseResult<T, E = AppError> = core::result::Result<T, E>;
/// [CollabStorageAccessControl] is a trait that provides access control when accessing the storage
@ -61,7 +62,9 @@ pub trait CollabStorage: Send + Sync + 'static {
/// * `bool` - `true` if the collaboration exists, `false` otherwise.
async fn is_exist(&self, object_id: &str) -> bool;
async fn cache_collab(&self, _object_id: &str, _collab: Weak<MutexCollab>);
async fn cache_collab(&self, object_id: &str, collab: Weak<MutexCollab>);
async fn remove_collab_cache(&self, object_id: &str);
async fn is_collab_exist(&self, oid: &str) -> DatabaseResult<bool>;
@ -129,8 +132,12 @@ where
self.as_ref().is_exist(object_id).await
}
async fn cache_collab(&self, _object_id: &str, _collab: Weak<MutexCollab>) {
self.as_ref().cache_collab(_object_id, _collab).await
async fn cache_collab(&self, object_id: &str, collab: Weak<MutexCollab>) {
self.as_ref().cache_collab(object_id, collab).await
}
async fn remove_collab_cache(&self, object_id: &str) {
self.as_ref().remove_collab_cache(object_id).await
}
async fn is_collab_exist(&self, oid: &str) -> DatabaseResult<bool> {
@ -220,6 +227,8 @@ impl CollabStorage for CollabStoragePgImpl {
async fn cache_collab(&self, _object_id: &str, _collab: Weak<MutexCollab>) {}
async fn remove_collab_cache(&self, _object_id: &str) {}
async fn is_collab_exist(&self, oid: &str) -> DatabaseResult<bool> {
let is_exist = is_collab_exists(oid, &self.pg_pool).await?;
Ok(is_exist)

View File

@ -29,11 +29,8 @@ pub struct CollabBroadcast {
object_id: String,
collab: MutexCollab,
sender: Sender<CollabMessage>,
#[allow(dead_code)]
awareness_sub: awareness::UpdateSubscription,
#[allow(dead_code)]
doc_sub: UpdateSubscription,
awareness_sub: Mutex<Option<awareness::UpdateSubscription>>,
doc_sub: Mutex<Option<UpdateSubscription>>,
}
impl CollabBroadcast {
@ -47,28 +44,43 @@ impl CollabBroadcast {
let object_id = object_id.to_owned();
// broadcast channel
let (sender, _) = channel(buffer_capacity);
CollabBroadcast {
object_id,
collab,
sender,
awareness_sub: Default::default(),
doc_sub: Default::default(),
}
}
pub async fn observe_collab_changes(&self) {
let (doc_sub, awareness_sub) = {
let mut mutex_collab = collab.lock();
let mut mutex_collab = self.collab.lock();
// Observer the document's update and broadcast it to all subscribers.
let cloned_oid = object_id.clone();
let broadcast_sink = sender.clone();
let cloned_oid = self.object_id.clone();
let broadcast_sink = self.sender.clone();
let doc_sub = mutex_collab
.get_mut_awareness()
.doc_mut()
.observe_update_v1(move |txn, event| {
trace!("broadcast doc update with len:{}", event.update.len());
let update_len = event.update.len();
let origin = CollabOrigin::from(txn);
let payload = gen_update_message(&event.update);
let msg = CollabBroadcastData::new(origin, cloned_oid.clone(), payload);
if let Err(e) = broadcast_sink.send(msg.into()) {
error!("broadcast sink fail: {}", e);
match broadcast_sink.send(msg.into()) {
Ok(_) => trace!("observe doc update with len:{}", update_len),
Err(e) => error!(
"observe doc update with len:{} - broadcast sink fail: {}",
update_len, e
),
}
})
.unwrap();
let broadcast_sink = sender.clone();
let cloned_oid = object_id.clone();
let broadcast_sink = self.sender.clone();
let cloned_oid = self.object_id.clone();
// Observer the awareness's update and broadcast it to all subscribers.
let awareness_sub = mutex_collab
@ -84,13 +96,9 @@ impl CollabBroadcast {
});
(doc_sub, awareness_sub)
};
CollabBroadcast {
object_id,
collab,
sender,
awareness_sub,
doc_sub,
}
*self.doc_sub.lock().await = Some(doc_sub);
*self.awareness_sub.lock().await = Some(awareness_sub);
}
/// Returns a reference to an underlying [MutexCollab] instance.

View File

@ -80,6 +80,7 @@ where
match self.group_by_object_id.try_write() {
Ok(mut group_by_object_id) => {
group_by_object_id.remove(object_id);
// self.storage.remove_collab_cache(object_id).await;
},
Err(err) => error!("Failed to acquire write lock to remove group: {:?}", err),
}
@ -127,12 +128,7 @@ where
let collab = Arc::new(collab.clone());
// The lifecycle of the collab is managed by the group.
let group = Arc::new(CollabGroup {
collab: collab.clone(),
broadcast,
subscribers: Default::default(),
});
let group = Arc::new(CollabGroup::new(collab.clone(), broadcast));
let plugin = CollabStoragePlugin::new(
uid,
workspace_id,
@ -148,6 +144,7 @@ where
.storage
.cache_collab(object_id, Arc::downgrade(&collab))
.await;
group.observe_collab().await;
group
}
}
@ -169,6 +166,18 @@ impl<U> CollabGroup<U>
where
U: RealtimeUser,
{
pub fn new(collab: Arc<MutexCollab>, broadcast: CollabBroadcast) -> Self {
Self {
collab,
broadcast,
subscribers: Default::default(),
}
}
pub async fn observe_collab(&self) {
self.broadcast.observe_collab_changes().await;
}
/// Mutate the [Collab] by the given closure
pub fn get_mut_collab<F>(&self, f: F)
where

View File

@ -174,9 +174,11 @@ where
};
match self.storage.get_collab_encoded_v1(&self.uid, params).await {
Ok(encoded_collab) => match init_collab_with_raw_data(&encoded_collab, doc).await {
Ok(encoded_collab_v1) => match init_collab_with_raw_data(&encoded_collab_v1, doc).await {
Ok(_) => {
// Try to create a snapshot for the collab object
// Attempt to create a snapshot for the collaboration object. When creating this snapshot, it is
// assumed that the 'encoded_collab_v1' is already in a valid format. Therefore, there is no need
// to verify the outcome of the 'encode_to_bytes' operation.
if self.storage.should_create_snapshot(object_id).await {
let cloned_workspace_id = self.workspace_id.clone();
let cloned_object_id = object_id.to_string();
@ -184,23 +186,39 @@ where
let _ = tokio::task::spawn_blocking(move || {
let params = InsertSnapshotParams {
object_id: cloned_object_id,
encoded_collab_v1: encoded_collab.encode_to_bytes().unwrap(),
encoded_collab_v1: encoded_collab_v1.encode_to_bytes().unwrap(),
workspace_id: cloned_workspace_id,
};
tokio::spawn(async move {
// FIXME(nathan): There is a potential issue when concurrently spawning tasks to create snapshots. A subsequent
// task for creating a snapshot might write to the database before a previous task completes. To address
// this, consider using `stream!` to queue these tasks, ensuring they are executed in the order they were
// spawned.
if let Err(err) = storage.create_snapshot(params).await {
error!("Create snapshot {:?}", err);
error!("create snapshot {:?}", err);
}
});
})
.await;
}
},
Err(e) => error!("🔴Init collab failed: {:?}", e),
Err(err) => {
// When initializing a collaboration object, if the 'init_collab_with_raw_data' operation fails, attempt to
// restore the collaboration object from the latest snapshot.
if let Some(encoded_collab_v1) = get_latest_snapshot(object_id, &self.storage).await {
if let Err(err) = init_collab_with_raw_data(&encoded_collab_v1, doc).await {
error!("restore collab with snapshot failed: {:?}", err);
return;
}
}
error!("init collab failed: {:?}", err)
},
},
Err(err) => match &err {
AppError::RecordNotFound(_) => {
// When attempting to retrieve collaboration data from the disk and a 'Record Not Found' error is returned,
// this indicates that the collaboration is new. Therefore, the current collaboration data should be saved to disk.
if let Err(err) = self.insert_new_collab(doc, object_id).await {
error!("Insert collab {:?}", err);
}
@ -223,7 +241,7 @@ where
self.edit_state.flush_edit();
trace!("number of updates reach flush_per_update, start flushing");
match self.group.upgrade() {
None => error!("🔴Group is dropped, skip flush collab"),
None => error!("Group is dropped, skip flush collab"),
Some(group) => group.flush_collab(),
}
}
@ -260,11 +278,21 @@ where
fn encoded_v1_from_doc(doc: &Doc) -> EncodedCollabV1 {
let txn = doc.transact();
let doc_state = txn.encode_state_as_update_v1(&StateVector::default());
let state_vector = txn.state_vector().encode_v1();
let doc_state = txn.encode_state_as_update_v1(&StateVector::default());
EncodedCollabV1::new(state_vector, doc_state)
}
async fn get_latest_snapshot<S>(object_id: &str, storage: &S) -> Option<EncodedCollabV1>
where
S: CollabStorage,
{
let metas = storage.get_collab_snapshot_list(object_id).await.ok()?;
let meta = metas.0.first()?;
let snapshot_data = storage.get_collab_snapshot(&meta.snapshot_id).await.ok()?;
EncodedCollabV1::decode_from_bytes(&snapshot_data.encoded_collab_v1).ok()
}
struct CollabEditState {
edit_count: AtomicU32,
flush_edit_count: AtomicU32,

View File

@ -20,7 +20,7 @@ use std::{
sync::{Arc, Weak},
};
use tokio::sync::RwLock;
use tracing::{event, info, instrument};
use tracing::{event, instrument};
use validator::Validate;
pub type CollabPostgresDBStorage = CollabStorageWrapper<
@ -75,7 +75,7 @@ where
}
async fn cache_collab(&self, object_id: &str, collab: Weak<MutexCollab>) {
tracing::trace!("Cache collab:{} in memory", object_id);
tracing::trace!("cache collab:{}", object_id);
self
.collab_by_object_id
.write()
@ -83,6 +83,11 @@ where
.insert(object_id.to_string(), collab);
}
async fn remove_collab_cache(&self, object_id: &str) {
tracing::trace!("remove collab:{} cache", object_id);
self.collab_by_object_id.write().await.remove(object_id);
}
async fn is_collab_exist(&self, oid: &str) -> DatabaseResult<bool> {
self.inner.is_collab_exist(oid).await
}
@ -156,9 +161,20 @@ where
.and_then(|collab| collab.upgrade());
match collab {
None => self.inner.get_collab_encoded_v1(uid, params).await,
None => {
event!(
tracing::Level::DEBUG,
"Get collab data:{} from disk",
params.object_id
);
self.inner.get_collab_encoded_v1(uid, params).await
},
Some(collab) => {
info!("Get collab data:{} from memory", params.object_id);
event!(
tracing::Level::DEBUG,
"Get collab data:{} from memory",
params.object_id
);
let data = collab.encode_collab_v1();
Ok(data)
},