espresso_node/startup_catchup.rs
1//! Startup stake-table catchup for new-protocol nodes.
2//!
3//! Cliquenet only connects to validators in the current epoch's stake table
4//! window (`N-1`, `N`, `N+1`). On a fresh-join or cold-restart node, no
5//! consensus messages can be received until those stake tables are populated,
6//! so the existing reactive catchup (triggered by an unknown-epoch proposal)
7//! never fires.
8//!
9//! [`bootstrap_epoch_window`] drives the existing catchup machinery
10//! synchronously at startup: it walks forward one epoch at a time from the
11//! highest already-known epoch (loaded from persistence by `reload_stake`)
12//! until peers can no longer serve the next epoch root leaf — which is the
13//! point at which the live network currently is.
14
15use std::time::Duration;
16
17use anyhow::{Context, ensure};
18use espresso_types::SeqTypes;
19use hotshot_types::{
20 data::EpochNumber, epoch_membership::EpochMembershipCoordinator, traits::election::Membership,
21};
22
23/// Walk forward from the highest already-known epoch until peers can no
24/// longer serve the next epoch root leaf, populating the membership with
25/// stake tables for every epoch up through `N+1` (where `N` is the current
26/// epoch). Returns `N`.
27///
28/// Preconditions: `reload_stake` should have run before this — it populates
29/// the membership from local persistence so the walk skips epochs we
30/// already know.
31pub async fn bootstrap_epoch_window(
32 coordinator: &EpochMembershipCoordinator<SeqTypes>,
33 epoch_height: u64,
34 step_timeout: Duration,
35) -> anyhow::Result<EpochNumber> {
36 if epoch_height == 0 {
37 // Pre-epoch chain: epochs aren't enabled yet, the non-epoch
38 // committee path is what gets used.
39 return Ok(EpochNumber::genesis());
40 }
41
42 let membership = coordinator.membership();
43 let first_epoch = membership
44 .first_epoch()
45 .context("first_epoch not seeded; genesis stake table missing")?;
46
47 // Find the highest contiguous pair `(H, H-1)` already in memory. Both
48 // are needed as the starting point of the forward walk: `add_epoch_root`
49 // for epoch `K+2` requires the stake table at `K`, so to derive both
50 // `H+1` (needs `H-1`) and `H+2` (needs `H`) we need `H` and `H-1`
51 // present. If only `H` is present (e.g. `set_first_epoch` ran without a
52 // matching reload, or persistence has gaps near the tip), the walk's
53 // first iteration would otherwise fall into a deep walk-back that may
54 // be unfillable from peers and would silently terminate the bootstrap
55 // at a stale epoch.
56 //
57 // `set_first_epoch` always seeds `first_epoch` and `first_epoch + 1`,
58 // so the scan terminates at worst at `first_epoch + 1`.
59 let mut highest = {
60 let initial = membership.highest_known_epoch().unwrap_or(first_epoch + 1);
61 let mut h = initial;
62 while h > first_epoch + 1
63 && !(membership.snapshot(h).is_some() && membership.snapshot(h - 1).is_some())
64 {
65 h = h - 1;
66 }
67 h
68 };
69
70 tracing::info!(
71 %first_epoch,
72 starting_from = %highest,
73 "bootstrap_epoch_window: walking forward",
74 );
75
76 // Walk forward; each successful iteration drives `add_epoch_root` via
77 // the existing catchup machinery, persisting the new stake table. The
78 // walk terminates when the catchup chain returns an error
79 // (`Ok(Err(_))`) or a single step exceeds the hang bound (`Err(_)`).
80 loop {
81 let target = highest + 1;
82 let result =
83 tokio::time::timeout(step_timeout, coordinator.wait_for_stake_table(target)).await;
84 match result {
85 Ok(Ok(_)) => {
86 tracing::info!(%target, "bootstrap_epoch_window: derived stake table");
87 highest = target;
88 },
89 Ok(Err(err)) => {
90 tracing::info!(
91 %target,
92 %err,
93 "bootstrap_epoch_window: catchup failed; treating as live tip",
94 );
95 break;
96 },
97 Err(_) => {
98 tracing::info!(
99 %target,
100 timeout_secs = step_timeout.as_secs(),
101 "bootstrap_epoch_window: catchup timed out; treating as live tip",
102 );
103 break;
104 },
105 }
106 }
107
108 // `highest` corresponds to N+1 (the leaf at root_block_in_epoch(N-1) is
109 // the last finalized one peers can serve). So current epoch N = highest - 1.
110 let current = EpochNumber::new(highest.saturating_sub(1));
111
112 ensure!(
113 membership.snapshot(current).is_some(),
114 "missing stake table for current epoch {current} after bootstrap"
115 );
116 ensure!(
117 membership.snapshot(highest).is_some(),
118 "missing stake table for next epoch {highest} after bootstrap"
119 );
120
121 tracing::info!(%current, "bootstrap_epoch_window: complete");
122 Ok(current)
123}