-
Notifications
You must be signed in to change notification settings - Fork 102
feat(ntx-builder): deactivate accounts which crash repeatedly #1712
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: santiagopittella-ntx-builder-actor-deactivation
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -138,6 +138,13 @@ pub struct NtxBuilderConfig { | |||||||||||||||
| )] | ||||||||||||||||
| pub idle_timeout: Duration, | ||||||||||||||||
|
|
||||||||||||||||
| /// Maximum number of crashes before an account actor is blacklisted. | ||||||||||||||||
| /// | ||||||||||||||||
| /// Once an actor for a given account exceeds this crash count, no new actor will be | ||||||||||||||||
| /// spawned for that account. | ||||||||||||||||
|
Comment on lines
+141
to
+144
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||
| #[arg(long = "ntx-builder.max-actor-crashes", default_value_t = 10, value_name = "NUM")] | ||||||||||||||||
| pub max_actor_crashes: usize, | ||||||||||||||||
|
Comment on lines
+145
to
+146
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would avoid mentioning
Suggested change
|
||||||||||||||||
|
|
||||||||||||||||
| /// Directory for the ntx-builder's persistent database. | ||||||||||||||||
| /// | ||||||||||||||||
| /// If not set, defaults to the node's data directory. | ||||||||||||||||
|
|
@@ -169,6 +176,7 @@ impl NtxBuilderConfig { | |||||||||||||||
| .with_tx_prover_url(self.tx_prover_url) | ||||||||||||||||
| .with_script_cache_size(self.script_cache_size) | ||||||||||||||||
| .with_idle_timeout(self.idle_timeout) | ||||||||||||||||
| .with_max_actor_crashes(self.max_actor_crashes) | ||||||||||||||||
| } | ||||||||||||||||
| } | ||||||||||||||||
|
|
||||||||||||||||
|
|
||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -95,16 +95,29 @@ pub struct Coordinator { | |||||
|
|
||||||
| /// Database for persistent state. | ||||||
| db: Db, | ||||||
|
|
||||||
| /// Tracks the number of crashes per account actor. | ||||||
| /// | ||||||
| /// When an actor shuts down due to a DB error, its crash count is incremented. Once | ||||||
| /// the count reaches `max_actor_crashes`, the account is blacklisted and no new actor | ||||||
| /// will be spawned for it. | ||||||
| crash_counts: HashMap<NetworkAccountId, usize>, | ||||||
|
|
||||||
| /// Maximum number of crashes an account actor is allowed before being blacklisted. | ||||||
| max_actor_crashes: usize, | ||||||
| } | ||||||
|
|
||||||
| impl Coordinator { | ||||||
| /// Creates a new coordinator with the specified maximum number of inflight transactions. | ||||||
| pub fn new(max_inflight_transactions: usize, db: Db) -> Self { | ||||||
| /// Creates a new coordinator with the specified maximum number of inflight transactions | ||||||
| /// and the crash threshold for account blacklisting. | ||||||
| pub fn new(max_inflight_transactions: usize, max_actor_crashes: usize, db: Db) -> Self { | ||||||
| Self { | ||||||
| actor_registry: HashMap::new(), | ||||||
| actor_join_set: JoinSet::new(), | ||||||
| semaphore: Arc::new(Semaphore::new(max_inflight_transactions)), | ||||||
| db, | ||||||
| crash_counts: HashMap::new(), | ||||||
| max_actor_crashes, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -117,6 +130,18 @@ impl Coordinator { | |||||
| pub fn spawn_actor(&mut self, origin: AccountOrigin, actor_context: &AccountActorContext) { | ||||||
| let account_id = origin.id(); | ||||||
|
|
||||||
| // Skip spawning if the account has been blacklisted due to repeated crashes. | ||||||
| if let Some(&count) = self.crash_counts.get(&account_id) { | ||||||
| if count >= self.max_actor_crashes { | ||||||
| tracing::warn!( | ||||||
| %account_id, | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| crash_count = count, | ||||||
| "Account blacklisted due to repeated crashes, skipping actor spawn" | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: should we make this more explicit?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should already be logging the crash itself, so this is probably okay at |
||||||
| ); | ||||||
| return; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: should we first check for running actors and terminate them? |
||||||
| // If an actor already exists for this account ID, something has gone wrong. | ||||||
| if let Some(handle) = self.actor_registry.remove(&account_id) { | ||||||
| tracing::error!( | ||||||
|
|
@@ -173,7 +198,13 @@ impl Coordinator { | |||||
| }, | ||||||
| ActorShutdownReason::SemaphoreFailed(err) => Err(err).context("semaphore failed"), | ||||||
| ActorShutdownReason::DbError(account_id) => { | ||||||
| tracing::error!(account_id = %account_id, "Account actor shut down due to DB error"); | ||||||
| let count = self.crash_counts.entry(account_id).or_insert(0); | ||||||
| *count += 1; | ||||||
| tracing::error!( | ||||||
| %account_id, | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| crash_count = *count, | ||||||
| "Account actor shut down due to DB error" | ||||||
| ); | ||||||
| Ok(None) | ||||||
| }, | ||||||
| ActorShutdownReason::IdleTimeout(account_id) => { | ||||||
|
|
@@ -305,17 +336,55 @@ impl Coordinator { | |||||
|
|
||||||
| #[cfg(test)] | ||||||
| mod tests { | ||||||
| use std::num::NonZeroUsize; | ||||||
| use std::sync::Arc; | ||||||
| use std::time::Duration; | ||||||
|
|
||||||
| use miden_node_proto::domain::mempool::MempoolEvent; | ||||||
| use miden_node_proto::domain::note::NetworkNote; | ||||||
| use miden_node_utils::lru_cache::LruCache; | ||||||
| use tokio::sync::{RwLock, mpsc}; | ||||||
| use url::Url; | ||||||
|
|
||||||
| use super::*; | ||||||
| use crate::actor::{AccountActorContext, AccountOrigin}; | ||||||
| use crate::chain_state::ChainState; | ||||||
| use crate::clients::StoreClient; | ||||||
| use crate::db::Db; | ||||||
| use crate::test_utils::*; | ||||||
|
|
||||||
| /// Creates a coordinator with default settings backed by a temp DB. | ||||||
| async fn test_coordinator() -> (Coordinator, tempfile::TempDir) { | ||||||
| let (db, dir) = Db::test_setup().await; | ||||||
| (Coordinator::new(4, db), dir) | ||||||
| (Coordinator::new(4, 10, db), dir) | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could also be a method on |
||||||
| } | ||||||
|
|
||||||
| /// Creates a minimal `AccountActorContext` suitable for unit tests. | ||||||
| /// | ||||||
| /// The URLs are fake and actors spawned with this context will fail on their first gRPC call, | ||||||
| /// but this is sufficient for testing coordinator logic (registry, blacklisting, etc.). | ||||||
| fn test_actor_context(db: &Db) -> AccountActorContext { | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe have this be a method on |
||||||
| use miden_protocol::crypto::merkle::mmr::{Forest, MmrPeaks, PartialMmr}; | ||||||
|
|
||||||
| let url = Url::parse("http://127.0.0.1:1").unwrap(); | ||||||
| let block_header = mock_block_header(0_u32.into()); | ||||||
| let chain_mmr = PartialMmr::from_peaks(MmrPeaks::new(Forest::new(0), vec![]).unwrap()); | ||||||
| let chain_state = Arc::new(RwLock::new(ChainState::new(block_header, chain_mmr))); | ||||||
| let (request_tx, _request_rx) = mpsc::channel(1); | ||||||
|
|
||||||
| AccountActorContext { | ||||||
| block_producer_url: url.clone(), | ||||||
| validator_url: url.clone(), | ||||||
| tx_prover_url: None, | ||||||
| chain_state, | ||||||
| store: StoreClient::new(url), | ||||||
| script_cache: LruCache::new(NonZeroUsize::new(1).unwrap()), | ||||||
| max_notes_per_tx: NonZeroUsize::new(1).unwrap(), | ||||||
| max_note_attempts: 1, | ||||||
| idle_timeout: Duration::from_secs(60), | ||||||
| db: db.clone(), | ||||||
| request_tx, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| /// Registers a dummy actor handle (no real actor task) in the coordinator's registry. | ||||||
|
|
@@ -358,4 +427,47 @@ mod tests { | |||||
| assert_eq!(inactive_targets.len(), 1); | ||||||
| assert_eq!(inactive_targets[0], inactive_id); | ||||||
| } | ||||||
|
|
||||||
| // BLACKLIST TESTS | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: we might want to find a different term to "blacklist", i.e.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer something much shorter -- we're already overly verbose and stuttery throughout the codebase imo. Maybe |
||||||
| // ============================================================================================ | ||||||
|
|
||||||
| #[tokio::test] | ||||||
| async fn spawn_actor_skips_blacklisted_account() { | ||||||
| let (db, _dir) = Db::test_setup().await; | ||||||
| let max_crashes = 3; | ||||||
| let mut coordinator = Coordinator::new(4, max_crashes, db.clone()); | ||||||
| let actor_context = test_actor_context(&db); | ||||||
|
|
||||||
| let account_id = mock_network_account_id(); | ||||||
|
|
||||||
| // Simulate the account having reached the crash threshold. | ||||||
| coordinator.crash_counts.insert(account_id, max_crashes); | ||||||
|
|
||||||
| coordinator.spawn_actor(AccountOrigin::Store(account_id), &actor_context); | ||||||
|
|
||||||
| assert!( | ||||||
| !coordinator.actor_registry.contains_key(&account_id), | ||||||
| "Blacklisted account should not have an actor in the registry" | ||||||
| ); | ||||||
| } | ||||||
|
|
||||||
| #[tokio::test] | ||||||
| async fn spawn_actor_allows_below_threshold() { | ||||||
| let (db, _dir) = Db::test_setup().await; | ||||||
| let max_crashes = 3; | ||||||
| let mut coordinator = Coordinator::new(4, max_crashes, db.clone()); | ||||||
| let actor_context = test_actor_context(&db); | ||||||
|
|
||||||
| let account_id = mock_network_account_id(); | ||||||
|
|
||||||
| // Set crash count below the threshold. | ||||||
| coordinator.crash_counts.insert(account_id, max_crashes - 1); | ||||||
|
|
||||||
| coordinator.spawn_actor(AccountOrigin::Store(account_id), &actor_context); | ||||||
|
|
||||||
| assert!( | ||||||
| coordinator.actor_registry.contains_key(&account_id), | ||||||
| "Account below crash threshold should have an actor in the registry" | ||||||
| ); | ||||||
| } | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.