From f7a891caca33f6a43b0872caebe15d51bdb0fa72 Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Fri, 15 May 2026 05:59:28 +0100 Subject: [PATCH 01/12] feat: accept multiple URLs for checkpoint sync --- bin/ethlambda/src/main.rs | 117 ++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 42 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 04bda440..1d462aa3 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -76,14 +76,20 @@ struct CliOptions { /// The node ID to look up in annotated_validators.yaml (e.g., "ethlambda_0") #[arg(long)] node_id: String, - /// Base URL of a checkpoint-sync peer's API server (e.g., http://peer:5052). + /// Base URL(s) of checkpoint-sync peer API servers (e.g., http://peer:5052). /// When set, skips genesis initialization and fetches the finalized state - /// and block from the peer's `/lean/v0/states/finalized` and + /// and block from each peer's `/lean/v0/states/finalized` and /// `/lean/v0/blocks/finalized` endpoints. For backward compatibility, a /// URL ending in `/lean/v0/states/finalized` is accepted and the trailing /// path is stripped. - #[arg(long)] - checkpoint_sync_url: Option, + /// + /// Multiple URLs may be supplied for redundancy, either comma-separated + /// (`--checkpoint-sync-url u1,u2`) or by repeating the flag + /// (`--checkpoint-sync-url u1 --checkpoint-sync-url u2`). URLs are tried + /// in order; the first one that succeeds is used and any failures fall + /// over to the next URL. Startup only aborts if every URL fails. + #[arg(long, value_delimiter = ',')] + checkpoint_sync_url: Option>, /// Whether this node acts as a committee aggregator. /// /// Seeds the initial value of the live aggregator flag shared by the @@ -207,7 +213,7 @@ async fn main() -> eyre::Result<()> { let backend = Arc::new(RocksDBBackend::open(&data_dir).expect("Failed to open RocksDB")); let store = fetch_initial_state( - options.checkpoint_sync_url.as_deref(), + options.checkpoint_sync_url.as_deref().unwrap_or(&[]), &genesis_config, backend.clone(), ) @@ -558,12 +564,45 @@ fn read_hex_file_bytes(path: impl AsRef) -> Vec { bytes } +/// Fetch the finalized anchor from a single checkpoint URL, retrying transient +/// races where the peer advances finalization between the state and block +/// fetches. +async fn try_checkpoint_url( + url: &str, + genesis_time: u64, + validators: &[ethlambda_types::state::Validator], +) -> Result<(State, ethlambda_types::block::SignedBlock), checkpoint_sync::CheckpointSyncError> { + const MAX_ANCHOR_FETCH_ATTEMPTS: u32 = 3; + const ANCHOR_FETCH_RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(1); + + let mut attempt = 1; + loop { + match checkpoint_sync::fetch_finalized_anchor(url, genesis_time, validators).await { + Ok(pair) => return Ok(pair), + Err(checkpoint_sync::CheckpointSyncError::AnchorPairingMismatch) + if attempt < MAX_ANCHOR_FETCH_ATTEMPTS => + { + warn!( + %url, + attempt, + max = MAX_ANCHOR_FETCH_ATTEMPTS, + "Anchor state and block disagree (peer likely advanced finalization mid-fetch); retrying" + ); + tokio::time::sleep(ANCHOR_FETCH_RETRY_DELAY).await; + attempt += 1; + } + Err(err) => return Err(err), + } + } +} + /// Fetch the initial state for the node. /// -/// If `checkpoint_url` is provided, performs checkpoint sync by downloading -/// and verifying the finalized state AND signed block in parallel from a -/// remote peer. Otherwise, creates a genesis state from the local genesis -/// configuration. +/// If `checkpoint_urls` is empty, creates a genesis state from the local +/// genesis configuration. Otherwise performs checkpoint sync by downloading +/// and verifying the finalized state AND signed block from a peer. URLs are +/// tried in order: the first peer that succeeds wins, and failures fall over +/// to the next URL. Startup only aborts if every URL fails. /// /// Fetching the matching signed block lets the local store serve a valid /// anchor via the `BlocksByRoot` req-resp protocol; without it, peers @@ -572,7 +611,7 @@ fn read_hex_file_bytes(path: impl AsRef) -> Vec { /// /// # Arguments /// -/// * `checkpoint_url` - Optional base URL to a peer's API server +/// * `checkpoint_urls` - Zero or more base URLs of peer API servers /// * `genesis` - Genesis configuration (for genesis_time verification and genesis state creation) /// * `backend` - Storage backend for Store creation /// @@ -581,51 +620,45 @@ fn read_hex_file_bytes(path: impl AsRef) -> Vec { /// `Ok(Store)` on success, or `Err(CheckpointSyncError)` if checkpoint sync fails. /// Genesis path is infallible and always returns `Ok`. async fn fetch_initial_state( - checkpoint_url: Option<&str>, + checkpoint_urls: &[String], genesis: &GenesisConfig, backend: Arc, ) -> Result { let validators = genesis.validators(); - let Some(checkpoint_url) = checkpoint_url else { + let Some((first_url, rest_urls)) = checkpoint_urls.split_first() else { info!("No checkpoint sync URL provided, initializing from genesis state"); let genesis_state = State::from_genesis(genesis.genesis_time, validators); return Ok(Store::from_anchor_state(backend, genesis_state)); }; - // Checkpoint sync path - info!(%checkpoint_url, "Starting checkpoint sync"); + // Checkpoint sync path: try URLs in order, fail over to the next on error. + info!(urls = ?checkpoint_urls, "Starting checkpoint sync"); - // The state and block are fetched in parallel; if the peer advances - // finalization between the two requests the pair won't match. Retry a - // small number of times so this transient race doesn't fail node startup. - const MAX_ANCHOR_FETCH_ATTEMPTS: u32 = 3; - const ANCHOR_FETCH_RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(1); + let mut result = try_checkpoint_url(first_url, genesis.genesis_time, &validators).await; + if let Err(err) = &result { + warn!( + url = %first_url, + %err, + "Checkpoint sync failed for this peer; trying next URL" + ); + } - let mut attempt = 1; - let (state, signed_block) = loop { - match checkpoint_sync::fetch_finalized_anchor( - checkpoint_url, - genesis.genesis_time, - &validators, - ) - .await - { - Ok(pair) => break pair, - Err(checkpoint_sync::CheckpointSyncError::AnchorPairingMismatch) - if attempt < MAX_ANCHOR_FETCH_ATTEMPTS => - { - warn!( - attempt, - max = MAX_ANCHOR_FETCH_ATTEMPTS, - "Anchor state and block disagree (peer likely advanced finalization mid-fetch); retrying" - ); - tokio::time::sleep(ANCHOR_FETCH_RETRY_DELAY).await; - attempt += 1; - } - Err(err) => return Err(err), + for url in rest_urls { + if result.is_ok() { + break; } - }; + result = try_checkpoint_url(url, genesis.genesis_time, &validators).await; + if let Err(err) = &result { + warn!( + %url, + %err, + "Checkpoint sync failed for this peer; trying next URL" + ); + } + } + + let (state, signed_block) = result?; info!( slot = state.slot, From 39c681bd696f9510f0d5585f3cf15024b8637478 Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Fri, 15 May 2026 06:41:24 +0100 Subject: [PATCH 02/12] fix(checkpoint-sync): clarify failover log on last URL failure --- bin/ethlambda/src/main.rs | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 1d462aa3..98327d63 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -637,24 +637,25 @@ async fn fetch_initial_state( let mut result = try_checkpoint_url(first_url, genesis.genesis_time, &validators).await; if let Err(err) = &result { - warn!( - url = %first_url, - %err, - "Checkpoint sync failed for this peer; trying next URL" - ); + if !rest_urls.is_empty() { + warn!(%first_url, %err, "Checkpoint sync failed for this peer; trying next URL"); + } else { + warn!(%first_url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + } } - for url in rest_urls { + for (idx, url) in rest_urls.iter().enumerate() { if result.is_ok() { break; } result = try_checkpoint_url(url, genesis.genesis_time, &validators).await; if let Err(err) = &result { - warn!( - %url, - %err, - "Checkpoint sync failed for this peer; trying next URL" - ); + let has_more = idx + 1 < rest_urls.len(); + if has_more { + warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); + } else { + warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + } } } From 1427999fb40f4796ed17194baa7ea608e42db26c Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Fri, 15 May 2026 06:50:30 +0100 Subject: [PATCH 03/12] fix(checkpoint-sync): redact credentials from checkpoint URL logs --- bin/ethlambda/src/main.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 98327d63..b7144715 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -596,6 +596,21 @@ async fn try_checkpoint_url( } } +/// Strip userinfo, query, and fragment from a URL so embedded credentials +/// (basic-auth or token query params) don't leak into logs. Unparseable +/// inputs are replaced with a placeholder rather than logged raw. +fn redact_url(url: &str) -> String { + reqwest::Url::parse(url) + .map(|mut u| { + let _ = u.set_username(""); + let _ = u.set_password(None); + u.set_query(None); + u.set_fragment(None); + u.to_string() + }) + .unwrap_or_else(|_| "".to_string()) +} + /// Fetch the initial state for the node. /// /// If `checkpoint_urls` is empty, creates a genesis state from the local @@ -633,14 +648,20 @@ async fn fetch_initial_state( }; // Checkpoint sync path: try URLs in order, fail over to the next on error. - info!(urls = ?checkpoint_urls, "Starting checkpoint sync"); + // Log only the count — URLs may carry basic-auth credentials or token query + // parameters; per-URL log lines below redact those before emission. + info!( + url_count = checkpoint_urls.len(), + "Starting checkpoint sync" + ); let mut result = try_checkpoint_url(first_url, genesis.genesis_time, &validators).await; if let Err(err) = &result { + let url = redact_url(first_url); if !rest_urls.is_empty() { - warn!(%first_url, %err, "Checkpoint sync failed for this peer; trying next URL"); + warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); } else { - warn!(%first_url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); } } @@ -650,6 +671,7 @@ async fn fetch_initial_state( } result = try_checkpoint_url(url, genesis.genesis_time, &validators).await; if let Err(err) = &result { + let url = redact_url(url); let has_more = idx + 1 < rest_urls.len(); if has_more { warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); From f2b7a77c6c57a974a3cd09bafce8259e9912734b Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Fri, 22 May 2026 15:13:44 +0100 Subject: [PATCH 04/12] Fix: Implement refactors and remove url redacting --- bin/ethlambda/src/main.rs | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 371cb655..83d4b7c6 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -596,21 +596,6 @@ async fn try_checkpoint_url( } } -/// Strip userinfo, query, and fragment from a URL so embedded credentials -/// (basic-auth or token query params) don't leak into logs. Unparseable -/// inputs are replaced with a placeholder rather than logged raw. -fn redact_url(url: &str) -> String { - reqwest::Url::parse(url) - .map(|mut u| { - let _ = u.set_username(""); - let _ = u.set_password(None); - u.set_query(None); - u.set_fragment(None); - u.to_string() - }) - .unwrap_or_else(|_| "".to_string()) -} - /// Fetch the initial state for the node. /// /// If `checkpoint_urls` is empty, creates a genesis state from the local @@ -654,14 +639,12 @@ async fn fetch_initial_state( url_count = checkpoint_urls.len(), "Starting checkpoint sync" ); - let mut result = try_checkpoint_url(first_url, genesis.genesis_time, &validators).await; if let Err(err) = &result { - let url = redact_url(first_url); - if !rest_urls.is_empty() { - warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); + if rest_urls.is_empty() { + warn!(%first_url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); } else { - warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + warn!(%first_url, %err, "Checkpoint sync failed for this peer; trying next URL"); } } @@ -671,7 +654,6 @@ async fn fetch_initial_state( } result = try_checkpoint_url(url, genesis.genesis_time, &validators).await; if let Err(err) = &result { - let url = redact_url(url); let has_more = idx + 1 < rest_urls.len(); if has_more { warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); From 06a26934144b32ed6452d6aef72a46c163f258ee Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Fri, 22 May 2026 15:55:06 +0100 Subject: [PATCH 05/12] Fix: Refactor checkpoint_url sync --- bin/ethlambda/src/checkpoint_sync.rs | 2 ++ bin/ethlambda/src/main.rs | 41 ++++++++++++---------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/bin/ethlambda/src/checkpoint_sync.rs b/bin/ethlambda/src/checkpoint_sync.rs index 8a81edc2..940d4614 100644 --- a/bin/ethlambda/src/checkpoint_sync.rs +++ b/bin/ethlambda/src/checkpoint_sync.rs @@ -58,6 +58,8 @@ pub enum CheckpointSyncError { BlockHeaderJustifiedRootMismatch, #[error("anchor block does not match anchor state")] AnchorPairingMismatch, + #[error("all checkpoint peers failed")] + AllPeersFailed, } /// Build the HTTP client used for checkpoint sync fetches. diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 83d4b7c6..7b9359eb 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -626,7 +626,7 @@ async fn fetch_initial_state( ) -> Result { let validators = genesis.validators(); - let Some((first_url, rest_urls)) = checkpoint_urls.split_first() else { + if checkpoint_urls.is_empty() { info!("No checkpoint sync URL provided, initializing from genesis state"); let genesis_state = State::from_genesis(genesis.genesis_time, validators); return Ok(Store::from_anchor_state(backend, genesis_state)); @@ -639,31 +639,26 @@ async fn fetch_initial_state( url_count = checkpoint_urls.len(), "Starting checkpoint sync" ); - let mut result = try_checkpoint_url(first_url, genesis.genesis_time, &validators).await; - if let Err(err) = &result { - if rest_urls.is_empty() { - warn!(%first_url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); - } else { - warn!(%first_url, %err, "Checkpoint sync failed for this peer; trying next URL"); - } - } - for (idx, url) in rest_urls.iter().enumerate() { - if result.is_ok() { - break; - } - result = try_checkpoint_url(url, genesis.genesis_time, &validators).await; - if let Err(err) = &result { - let has_more = idx + 1 < rest_urls.len(); - if has_more { - warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); - } else { - warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + let mut iter = checkpoint_urls.iter().peekable(); + let (state, signed_block) = loop { + let Some(url) = iter.next() else { + return Err(checkpoint_sync::CheckpointSyncError::AllPeersFailed); + }; + match try_checkpoint_url(url, genesis.genesis_time, &validators).await { + Ok(pair) => { + info!(%url, "Checkpoint sync successful with this peer"); + break pair; + } + Err(err) => { + if iter.peek().is_some() { + warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); + } else { + warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + } } } - } - - let (state, signed_block) = result?; + }; info!( slot = state.slot, From e6cc02c01c9bb60b0fbe291dcb1f66d02ceb1069 Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Thu, 28 May 2026 00:28:43 +0100 Subject: [PATCH 06/12] Code refactor as recommended by @MegaRedHand --- bin/ethlambda/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 7b9359eb..4ddcdc4c 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -27,7 +27,7 @@ use ethlambda_types::{ aggregator::AggregatorController, genesis::GenesisConfig, signature::ValidatorSecretKey, - state::{State, ValidatorPubkeyBytes}, + state::{State, Validator, ValidatorPubkeyBytes}, }; use serde::Deserialize; use tracing::{error, info, warn}; @@ -570,7 +570,7 @@ fn read_hex_file_bytes(path: impl AsRef) -> Vec { async fn try_checkpoint_url( url: &str, genesis_time: u64, - validators: &[ethlambda_types::state::Validator], + validators: &[Validator], ) -> Result<(State, ethlambda_types::block::SignedBlock), checkpoint_sync::CheckpointSyncError> { const MAX_ANCHOR_FETCH_ATTEMPTS: u32 = 3; const ANCHOR_FETCH_RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(1); From a8905c7a43f14d51be05894214cfba6cb8357036 Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Thu, 28 May 2026 00:32:55 +0100 Subject: [PATCH 07/12] Code refactor as recommended by @MegaRedHand --- bin/ethlambda/src/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 4ddcdc4c..80bd7788 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -15,6 +15,7 @@ use std::{ net::{IpAddr, SocketAddr}, path::{Path, PathBuf}, sync::Arc, + time::Duration, }; use tokio_util::sync::CancellationToken; @@ -25,6 +26,7 @@ use ethlambda_p2p::{Bootnode, P2P, PeerId, SwarmConfig, build_swarm, parse_enrs} use ethlambda_types::primitives::{H256, HashTreeRoot as _}; use ethlambda_types::{ aggregator::AggregatorController, + block::SignedBlock, genesis::GenesisConfig, signature::ValidatorSecretKey, state::{State, Validator, ValidatorPubkeyBytes}, @@ -571,9 +573,9 @@ async fn try_checkpoint_url( url: &str, genesis_time: u64, validators: &[Validator], -) -> Result<(State, ethlambda_types::block::SignedBlock), checkpoint_sync::CheckpointSyncError> { +) -> Result<(State, SignedBlock), checkpoint_sync::CheckpointSyncError> { const MAX_ANCHOR_FETCH_ATTEMPTS: u32 = 3; - const ANCHOR_FETCH_RETRY_DELAY: std::time::Duration = std::time::Duration::from_secs(1); + const ANCHOR_FETCH_RETRY_DELAY: Duration = Duration::from_secs(1); let mut attempt = 1; loop { From 11796a294956a9b8dc2457b7d4ce3b490686b7bb Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Thu, 28 May 2026 00:39:42 +0100 Subject: [PATCH 08/12] Refactor: Move to --- bin/ethlambda/src/checkpoint_sync.rs | 33 ++++++++++++++++++++++++ bin/ethlambda/src/main.rs | 38 ++-------------------------- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/bin/ethlambda/src/checkpoint_sync.rs b/bin/ethlambda/src/checkpoint_sync.rs index 940d4614..02b67702 100644 --- a/bin/ethlambda/src/checkpoint_sync.rs +++ b/bin/ethlambda/src/checkpoint_sync.rs @@ -5,6 +5,7 @@ use ethlambda_types::primitives::HashTreeRoot as _; use ethlambda_types::state::{State, Validator, anchor_pair_is_consistent}; use libssz::{DecodeError, SszDecode}; use reqwest::Client; +use tracing::warn; /// Timeout for establishing the HTTP connection to the checkpoint peer. /// Fail fast if the peer is unreachable. @@ -271,6 +272,38 @@ fn verify_checkpoint_state( Ok(()) } +/// Fetch the finalized anchor from a single checkpoint URL, retrying transient +/// races where the peer advances finalization between the state and block +/// fetches. +pub async fn try_checkpoint_url( + url: &str, + genesis_time: u64, + validators: &[Validator], +) -> Result<(State, SignedBlock), CheckpointSyncError> { + const MAX_ANCHOR_FETCH_ATTEMPTS: u32 = 3; + const ANCHOR_FETCH_RETRY_DELAY: Duration = Duration::from_secs(1); + + let mut attempt = 1; + loop { + match fetch_finalized_anchor(url, genesis_time, validators).await { + Ok(pair) => return Ok(pair), + Err(CheckpointSyncError::AnchorPairingMismatch) + if attempt < MAX_ANCHOR_FETCH_ATTEMPTS => + { + warn!( + %url, + attempt, + max = MAX_ANCHOR_FETCH_ATTEMPTS, + "Anchor state and block disagree (peer likely advanced finalization mid-fetch); retrying" + ); + tokio::time::sleep(ANCHOR_FETCH_RETRY_DELAY).await; + attempt += 1; + } + Err(err) => return Err(err), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 80bd7788..a2a67e89 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -15,7 +15,6 @@ use std::{ net::{IpAddr, SocketAddr}, path::{Path, PathBuf}, sync::Arc, - time::Duration, }; use tokio_util::sync::CancellationToken; @@ -26,10 +25,9 @@ use ethlambda_p2p::{Bootnode, P2P, PeerId, SwarmConfig, build_swarm, parse_enrs} use ethlambda_types::primitives::{H256, HashTreeRoot as _}; use ethlambda_types::{ aggregator::AggregatorController, - block::SignedBlock, genesis::GenesisConfig, signature::ValidatorSecretKey, - state::{State, Validator, ValidatorPubkeyBytes}, + state::{State, ValidatorPubkeyBytes}, }; use serde::Deserialize; use tracing::{error, info, warn}; @@ -566,38 +564,6 @@ fn read_hex_file_bytes(path: impl AsRef) -> Vec { bytes } -/// Fetch the finalized anchor from a single checkpoint URL, retrying transient -/// races where the peer advances finalization between the state and block -/// fetches. -async fn try_checkpoint_url( - url: &str, - genesis_time: u64, - validators: &[Validator], -) -> Result<(State, SignedBlock), checkpoint_sync::CheckpointSyncError> { - const MAX_ANCHOR_FETCH_ATTEMPTS: u32 = 3; - const ANCHOR_FETCH_RETRY_DELAY: Duration = Duration::from_secs(1); - - let mut attempt = 1; - loop { - match checkpoint_sync::fetch_finalized_anchor(url, genesis_time, validators).await { - Ok(pair) => return Ok(pair), - Err(checkpoint_sync::CheckpointSyncError::AnchorPairingMismatch) - if attempt < MAX_ANCHOR_FETCH_ATTEMPTS => - { - warn!( - %url, - attempt, - max = MAX_ANCHOR_FETCH_ATTEMPTS, - "Anchor state and block disagree (peer likely advanced finalization mid-fetch); retrying" - ); - tokio::time::sleep(ANCHOR_FETCH_RETRY_DELAY).await; - attempt += 1; - } - Err(err) => return Err(err), - } - } -} - /// Fetch the initial state for the node. /// /// If `checkpoint_urls` is empty, creates a genesis state from the local @@ -647,7 +613,7 @@ async fn fetch_initial_state( let Some(url) = iter.next() else { return Err(checkpoint_sync::CheckpointSyncError::AllPeersFailed); }; - match try_checkpoint_url(url, genesis.genesis_time, &validators).await { + match checkpoint_sync::try_checkpoint_url(url, genesis.genesis_time, &validators).await { Ok(pair) => { info!(%url, "Checkpoint sync successful with this peer"); break pair; From 6abb30426217b74c3495d6511aee89f54d6643ec Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Thu, 28 May 2026 00:46:47 +0100 Subject: [PATCH 09/12] Refactor: Move checkpoint-sync peer iteration into checkpoint_sync.rs --- bin/ethlambda/src/checkpoint_sync.rs | 33 ++++++++++++++++++++++++++-- bin/ethlambda/src/main.rs | 25 +++++---------------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/bin/ethlambda/src/checkpoint_sync.rs b/bin/ethlambda/src/checkpoint_sync.rs index 02b67702..43871f92 100644 --- a/bin/ethlambda/src/checkpoint_sync.rs +++ b/bin/ethlambda/src/checkpoint_sync.rs @@ -5,7 +5,7 @@ use ethlambda_types::primitives::HashTreeRoot as _; use ethlambda_types::state::{State, Validator, anchor_pair_is_consistent}; use libssz::{DecodeError, SszDecode}; use reqwest::Client; -use tracing::warn; +use tracing::{info, warn}; /// Timeout for establishing the HTTP connection to the checkpoint peer. /// Fail fast if the peer is unreachable. @@ -275,7 +275,7 @@ fn verify_checkpoint_state( /// Fetch the finalized anchor from a single checkpoint URL, retrying transient /// races where the peer advances finalization between the state and block /// fetches. -pub async fn try_checkpoint_url( +async fn try_checkpoint_url( url: &str, genesis_time: u64, validators: &[Validator], @@ -304,6 +304,35 @@ pub async fn try_checkpoint_url( } } +/// Try each checkpoint URL in order, returning the first successful anchor +/// pair. Logs per-peer success/failure and returns `AllPeersFailed` only when +/// every URL has been exhausted. +pub async fn fetch_anchor_from_peers( + checkpoint_urls: &[String], + genesis_time: u64, + validators: &[Validator], +) -> Result<(State, SignedBlock), CheckpointSyncError> { + let mut iter = checkpoint_urls.iter().peekable(); + loop { + let Some(url) = iter.next() else { + return Err(CheckpointSyncError::AllPeersFailed); + }; + match try_checkpoint_url(url, genesis_time, validators).await { + Ok(pair) => { + info!(%url, "Checkpoint sync successful with this peer"); + return Ok(pair); + } + Err(err) => { + if iter.peek().is_some() { + warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); + } else { + warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); + } + } + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index a2a67e89..007a7251 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -608,25 +608,12 @@ async fn fetch_initial_state( "Starting checkpoint sync" ); - let mut iter = checkpoint_urls.iter().peekable(); - let (state, signed_block) = loop { - let Some(url) = iter.next() else { - return Err(checkpoint_sync::CheckpointSyncError::AllPeersFailed); - }; - match checkpoint_sync::try_checkpoint_url(url, genesis.genesis_time, &validators).await { - Ok(pair) => { - info!(%url, "Checkpoint sync successful with this peer"); - break pair; - } - Err(err) => { - if iter.peek().is_some() { - warn!(%url, %err, "Checkpoint sync failed for this peer; trying next URL"); - } else { - warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); - } - } - } - }; + let (state, signed_block) = checkpoint_sync::fetch_anchor_from_peers( + checkpoint_urls, + genesis.genesis_time, + &validators, + ) + .await?; info!( slot = state.slot, From 1ae8584f59b9782542f0f90a1796bb974e8d55a7 Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Thu, 28 May 2026 02:54:18 +0100 Subject: [PATCH 10/12] Refactor: Preserve underlying cause in AllPeersFailed --- bin/ethlambda/src/checkpoint_sync.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bin/ethlambda/src/checkpoint_sync.rs b/bin/ethlambda/src/checkpoint_sync.rs index 43871f92..3f6bcbb8 100644 --- a/bin/ethlambda/src/checkpoint_sync.rs +++ b/bin/ethlambda/src/checkpoint_sync.rs @@ -59,8 +59,10 @@ pub enum CheckpointSyncError { BlockHeaderJustifiedRootMismatch, #[error("anchor block does not match anchor state")] AnchorPairingMismatch, - #[error("all checkpoint peers failed")] - AllPeersFailed, + #[error("all checkpoint peers failed; last error: {0}")] + AllPeersFailed(Box), + #[error("no checkpoint peers configured")] + NoPeersConfigured, } /// Build the HTTP client used for checkpoint sync fetches. @@ -305,17 +307,22 @@ async fn try_checkpoint_url( } /// Try each checkpoint URL in order, returning the first successful anchor -/// pair. Logs per-peer success/failure and returns `AllPeersFailed` only when -/// every URL has been exhausted. +/// pair. Logs per-peer success/failure. On total failure, returns +/// `AllPeersFailed` wrapping the last peer's error so the underlying cause +/// is preserved in the propagated error (not just the per-peer `warn!` logs). pub async fn fetch_anchor_from_peers( checkpoint_urls: &[String], genesis_time: u64, validators: &[Validator], ) -> Result<(State, SignedBlock), CheckpointSyncError> { let mut iter = checkpoint_urls.iter().peekable(); + let mut last_err: Option = None; loop { let Some(url) = iter.next() else { - return Err(CheckpointSyncError::AllPeersFailed); + return Err(match last_err { + Some(err) => CheckpointSyncError::AllPeersFailed(Box::new(err)), + None => CheckpointSyncError::NoPeersConfigured, + }); }; match try_checkpoint_url(url, genesis_time, validators).await { Ok(pair) => { @@ -328,6 +335,7 @@ pub async fn fetch_anchor_from_peers( } else { warn!(%url, %err, "Checkpoint sync failed for this peer; no more URLs to try"); } + last_err = Some(err); } } } From 82a38afe5e9c91470decb19252d2e0fd41a8581d Mon Sep 17 00:00:00 2001 From: Emeka Allison Date: Thu, 28 May 2026 03:13:06 +0100 Subject: [PATCH 11/12] Refactor: Simplify checkpoint URL parsing and trim empty entries --- bin/ethlambda/src/main.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 007a7251..a6130e53 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -89,7 +89,7 @@ struct CliOptions { /// in order; the first one that succeeds is used and any failures fall /// over to the next URL. Startup only aborts if every URL fails. #[arg(long, value_delimiter = ',')] - checkpoint_sync_url: Option>, + checkpoint_sync_url: Vec, /// Whether this node acts as a committee aggregator. /// /// Seeds the initial value of the live aggregator flag shared by the @@ -212,13 +212,16 @@ async fn main() -> eyre::Result<()> { std::fs::create_dir_all(&data_dir).expect("Failed to create data directory"); let backend = Arc::new(RocksDBBackend::open(&data_dir).expect("Failed to open RocksDB")); - let store = fetch_initial_state( - options.checkpoint_sync_url.as_deref().unwrap_or(&[]), - &genesis_config, - backend.clone(), - ) - .await - .inspect_err(|err| error!(%err, "Failed to initialize state"))?; + let clean_checkpoint_urls: Vec = options + .checkpoint_sync_url + .into_iter() + .map(|url| url.trim().to_string()) + .filter(|url| !url.is_empty()) + .collect(); + + let store = fetch_initial_state(&clean_checkpoint_urls, &genesis_config, backend.clone()) + .await + .inspect_err(|err| error!(%err, "Failed to initialize state"))?; let validator_ids: Vec = validator_keys.keys().copied().collect(); From 000e56d462cff6950632ffe3743ec7d3b54dde43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Gr=C3=BCner?= <47506558+MegaRedHand@users.noreply.github.com> Date: Thu, 28 May 2026 18:54:43 -0300 Subject: [PATCH 12/12] chore: remove comment --- bin/ethlambda/src/main.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/ethlambda/src/main.rs b/bin/ethlambda/src/main.rs index 7ee32dc9..c2e022c4 100644 --- a/bin/ethlambda/src/main.rs +++ b/bin/ethlambda/src/main.rs @@ -609,8 +609,6 @@ async fn fetch_initial_state( }; // Checkpoint sync path: try URLs in order, fail over to the next on error. - // Log only the count — URLs may carry basic-auth credentials or token query - // parameters; per-URL log lines below redact those before emission. info!( url_count = checkpoint_urls.len(), "Starting checkpoint sync"