Skip to main content

espresso_node/
startup_catchup.rs

1//! Startup stake-table catchup for new-protocol nodes.
2//!
3//! Cliquenet only connects to validators in the current epoch's stake table
4//! window (`N-1`, `N`, `N+1`). On a fresh-join or cold-restart node, no
5//! consensus messages can be received until those stake tables are populated,
6//! so the existing reactive catchup (triggered by an unknown-epoch proposal)
7//! never fires.
8//!
9//! [`bootstrap_epoch_window`] drives the existing catchup machinery
10//! synchronously at startup: it walks forward one epoch at a time from the
11//! highest already-known epoch (loaded from persistence by `reload_stake`)
12//! until peers can no longer serve the next epoch root leaf — which is the
13//! point at which the live network currently is.
14
15use std::time::Duration;
16
17use anyhow::{Context, ensure};
18use espresso_types::SeqTypes;
19use hotshot_types::{
20    data::EpochNumber, epoch_membership::EpochMembershipCoordinator, traits::election::Membership,
21};
22
23/// Walk forward from the highest already-known epoch until peers can no
24/// longer serve the next epoch root leaf, populating the membership with
25/// stake tables for every epoch up through `N+1` (where `N` is the current
26/// epoch). Returns `N`.
27///
28/// Preconditions: `reload_stake` should have run before this — it populates
29/// the membership from local persistence so the walk skips epochs we
30/// already know.
31pub async fn bootstrap_epoch_window(
32    coordinator: &EpochMembershipCoordinator<SeqTypes>,
33    epoch_height: u64,
34    step_timeout: Duration,
35) -> anyhow::Result<EpochNumber> {
36    if epoch_height == 0 {
37        // Pre-epoch chain: epochs aren't enabled yet, the non-epoch
38        // committee path is what gets used.
39        return Ok(EpochNumber::genesis());
40    }
41
42    let membership = coordinator.membership();
43    let first_epoch = membership
44        .first_epoch()
45        .context("first_epoch not seeded; genesis stake table missing")?;
46
47    // Find the highest contiguous pair `(H, H-1)` already in memory. Both
48    // are needed as the starting point of the forward walk: `add_epoch_root`
49    // for epoch `K+2` requires the stake table at `K`, so to derive both
50    // `H+1` (needs `H-1`) and `H+2` (needs `H`) we need `H` and `H-1`
51    // present. If only `H` is present (e.g. `set_first_epoch` ran without a
52    // matching reload, or persistence has gaps near the tip), the walk's
53    // first iteration would otherwise fall into a deep walk-back that may
54    // be unfillable from peers and would silently terminate the bootstrap
55    // at a stale epoch.
56    //
57    // `set_first_epoch` always seeds `first_epoch` and `first_epoch + 1`,
58    // so the scan terminates at worst at `first_epoch + 1`.
59    let mut highest = {
60        let initial = membership.highest_known_epoch().unwrap_or(first_epoch + 1);
61        let mut h = initial;
62        while h > first_epoch + 1
63            && !(membership.snapshot(h).is_some() && membership.snapshot(h - 1).is_some())
64        {
65            h = h - 1;
66        }
67        h
68    };
69
70    tracing::info!(
71        %first_epoch,
72        starting_from = %highest,
73        "bootstrap_epoch_window: walking forward",
74    );
75
76    // Walk forward; each successful iteration drives `add_epoch_root` via
77    // the existing catchup machinery, persisting the new stake table. The
78    // walk terminates when the catchup chain returns an error
79    // (`Ok(Err(_))`) or a single step exceeds the hang bound (`Err(_)`).
80    loop {
81        let target = highest + 1;
82        let result =
83            tokio::time::timeout(step_timeout, coordinator.wait_for_stake_table(target)).await;
84        match result {
85            Ok(Ok(_)) => {
86                tracing::info!(%target, "bootstrap_epoch_window: derived stake table");
87                highest = target;
88            },
89            Ok(Err(err)) => {
90                tracing::info!(
91                    %target,
92                    %err,
93                    "bootstrap_epoch_window: catchup failed; treating as live tip",
94                );
95                break;
96            },
97            Err(_) => {
98                tracing::info!(
99                    %target,
100                    timeout_secs = step_timeout.as_secs(),
101                    "bootstrap_epoch_window: catchup timed out; treating as live tip",
102                );
103                break;
104            },
105        }
106    }
107
108    // `highest` corresponds to N+1 (the leaf at root_block_in_epoch(N-1) is
109    // the last finalized one peers can serve). So current epoch N = highest - 1.
110    let current = EpochNumber::new(highest.saturating_sub(1));
111
112    ensure!(
113        membership.snapshot(current).is_some(),
114        "missing stake table for current epoch {current} after bootstrap"
115    );
116    ensure!(
117        membership.snapshot(highest).is_some(),
118        "missing stake table for next epoch {highest} after bootstrap"
119    );
120
121    tracing::info!(%current, "bootstrap_epoch_window: complete");
122    Ok(current)
123}