hotshot_query_service/data_source/storage/
sql.rs

1// Copyright (c) 2022 Espresso Systems (espressosys.com)
2// This file is part of the HotShot Query Service library.
3//
4// This program is free software: you can redistribute it and/or modify it under the terms of the GNU
5// General Public License as published by the Free Software Foundation, either version 3 of the
6// License, or (at your option) any later version.
7// This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
8// even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9// General Public License for more details.
10// You should have received a copy of the GNU General Public License along with this program. If not,
11// see <https://www.gnu.org/licenses/>.
12
13#![cfg(feature = "sql-data-source")]
14use std::{cmp::min, fmt::Debug, str::FromStr, time::Duration};
15
16use async_trait::async_trait;
17use chrono::Utc;
18#[cfg(not(feature = "embedded-db"))]
19use futures::future::FutureExt;
20use hotshot_types::{
21    data::VidShare,
22    traits::{metrics::Metrics, node_implementation::NodeType},
23};
24use itertools::Itertools;
25use log::LevelFilter;
26#[cfg(not(feature = "embedded-db"))]
27use sqlx::postgres::{PgConnectOptions, PgSslMode};
28#[cfg(feature = "embedded-db")]
29use sqlx::sqlite::SqliteConnectOptions;
30use sqlx::{
31    ConnectOptions, Row,
32    pool::{Pool, PoolOptions},
33};
34
35use crate::{
36    Header, QueryError, QueryResult,
37    availability::{QueryableHeader, QueryablePayload, VidCommonMetadata, VidCommonQueryData},
38    data_source::{
39        VersionedDataSource,
40        storage::pruning::{PruneStorage, PrunerCfg, PrunerConfig},
41        update::Transaction as _,
42    },
43    metrics::PrometheusMetrics,
44    node::BlockId,
45    status::HasMetrics,
46};
47pub extern crate sqlx;
48pub use sqlx::{Database, Sqlite};
49
50mod db;
51mod migrate;
52mod queries;
53mod transaction;
54
55pub use anyhow::Error;
56pub use db::*;
57pub use include_dir::include_dir;
58pub use queries::QueryBuilder;
59pub use refinery::Migration;
60pub use transaction::*;
61
62use self::{migrate::Migrator, transaction::PoolMetrics};
63use super::{AvailabilityStorage, NodeStorage};
64// This needs to be reexported so that we can reference it by absolute path relative to this crate
65// in the expansion of `include_migrations`, even when `include_migrations` is invoked from another
66// crate which doesn't have `include_dir` as a dependency.
67pub use crate::include_migrations;
68
/// Embed migrations from the given directory into the current binary for PostgreSQL or SQLite.
///
/// The macro invocation `include_migrations!(path)` evaluates to an expression of type `impl
/// Iterator<Item = Migration>`. Each migration must be a text file which is an immediate child of
/// `path`, and there must be no non-migration files in `path`. The migration files must have names
/// of the form `V${version}__${name}.sql`, where `version` is a positive integer indicating how the
/// migration is to be ordered relative to other migrations, and `name` is a descriptive name for
/// the migration.
///
/// `path` should be an absolute path. It is possible to give a path relative to the root of the
/// invoking crate by using environment variable expansions and the `CARGO_MANIFEST_DIR` environment
/// variable.
///
/// As an example, this is the invocation used to load the default migrations from the
/// `hotshot-query-service` crate. The migrations live under the crate-level `migrations` directory:
/// - PostgreSQL migrations are in `/migrations/postgres`.
/// - SQLite migrations are in `/migrations/sqlite`.
///
/// ```
/// # use hotshot_query_service::data_source::sql::{include_migrations, Migration};
/// // For PostgreSQL
/// #[cfg(not(feature = "embedded-db"))]
/// let mut migrations: Vec<Migration> =
///     include_migrations!("$CARGO_MANIFEST_DIR/migrations/postgres").collect();
/// // For SQLite
/// #[cfg(feature = "embedded-db")]
/// let mut migrations: Vec<Migration> =
///     include_migrations!("$CARGO_MANIFEST_DIR/migrations/sqlite").collect();
///
/// migrations.sort();
/// assert_eq!(migrations[0].version(), 10);
/// assert_eq!(migrations[0].name(), "init_schema");
/// ```
///
/// Note that a similar macro is available from Refinery:
/// [embed_migrations](https://docs.rs/refinery/0.8.11/refinery/macro.embed_migrations.html). This
/// macro differs in that it evaluates to an iterator of [migrations](Migration), making it an
/// expression macro, while `embed_migrations` is a statement macro that defines a module which
/// provides access to the embedded migrations only indirectly via a
/// [`Runner`](https://docs.rs/refinery/0.8.11/refinery/struct.Runner.html). The direct access to
/// migrations provided by [`include_migrations`] makes this macro easier to use with
/// [`Config::migrations`], for combining custom migrations with [`default_migrations`].
#[macro_export]
macro_rules! include_migrations {
    ($dir:tt) => {
        // Embed the directory's contents at compile time, then map each file to a Refinery
        // `Migration`. Both the file name and the contents must be valid UTF-8; violations are
        // compile-environment bugs, so we panic rather than return an error.
        $crate::data_source::storage::sql::include_dir!($dir)
            .files()
            .map(|file| {
                let path = file.path();
                let name = path
                    .file_name()
                    .and_then(std::ffi::OsStr::to_str)
                    .unwrap_or_else(|| {
                        panic!(
                            "migration file {} must have a non-empty UTF-8 name",
                            path.display()
                        )
                    });
                let sql = file
                    .contents_utf8()
                    .unwrap_or_else(|| panic!("migration file {name} must use UTF-8 encoding"));
                // `unapplied` parses the `V${version}__${name}.sql` convention; an invalid file
                // name is a packaging bug, hence `expect`.
                $crate::data_source::storage::sql::Migration::unapplied(name, sql)
                    .expect("invalid migration")
            })
    };
}
135
136/// The migrations required to build the default schema for this version of [`SqlStorage`].
137pub fn default_migrations() -> Vec<Migration> {
138    #[cfg(not(feature = "embedded-db"))]
139    let mut migrations =
140        include_migrations!("$CARGO_MANIFEST_DIR/migrations/postgres").collect::<Vec<_>>();
141
142    #[cfg(feature = "embedded-db")]
143    let mut migrations =
144        include_migrations!("$CARGO_MANIFEST_DIR/migrations/sqlite").collect::<Vec<_>>();
145
146    // Check version uniqueness and sort by version.
147    validate_migrations(&mut migrations).expect("default migrations are invalid");
148
149    // Check that all migration versions are multiples of 100, so that custom migrations can be
150    // inserted in between.
151    for m in &migrations {
152        if m.version() <= 30 {
153            // An older version of this software used intervals of 10 instead of 100. This was
154            // changed to allow more custom migrations between each default migration, but we must
155            // still accept older migrations that followed the older rule.
156            assert!(
157                m.version() > 0 && m.version() % 10 == 0,
158                "legacy default migration version {} is not a positive multiple of 10",
159                m.version()
160            );
161        } else {
162            assert!(
163                m.version() % 100 == 0,
164                "default migration version {} is not a multiple of 100",
165                m.version()
166            );
167        }
168    }
169
170    migrations
171}
172
173/// Validate and preprocess a sequence of migrations.
174///
175/// * Ensure all migrations have distinct versions
176/// * Ensure migrations are sorted by increasing version
177fn validate_migrations(migrations: &mut [Migration]) -> Result<(), Error> {
178    migrations.sort_by_key(|m| m.version());
179
180    // Check version uniqueness.
181    for (prev, next) in migrations.iter().zip(migrations.iter().skip(1)) {
182        if next <= prev {
183            return Err(Error::msg(format!(
184                "migration versions are not strictly increasing ({prev}->{next})"
185            )));
186        }
187    }
188
189    Ok(())
190}
191
192/// Add custom migrations to a default migration sequence.
193///
194/// Migrations in `custom` replace migrations in `default` with the same version. Otherwise, the two
195/// sequences `default` and `custom` are merged so that the resulting sequence is sorted by
196/// ascending version number. Each of `default` and `custom` is assumed to be the output of
197/// [`validate_migrations`]; that is, each is sorted by version and contains no duplicate versions.
198fn add_custom_migrations(
199    default: impl IntoIterator<Item = Migration>,
200    custom: impl IntoIterator<Item = Migration>,
201) -> impl Iterator<Item = Migration> {
202    default
203        .into_iter()
204        // Merge sorted lists, joining pairs of equal version into `EitherOrBoth::Both`.
205        .merge_join_by(custom, |l, r| l.version().cmp(&r.version()))
206        // Prefer the custom migration for a given version when both default and custom versions
207        // are present.
208        .map(|pair| pair.reduce(|_, custom| custom))
209}
210
/// Configuration for connecting a [`SqlStorage`] instance to its database.
///
/// Build a `Config` from connection options (via `From`/`FromStr`/`Default`) and refine it
/// with the builder-style methods below before calling [`SqlStorage::connect`].
#[derive(Clone)]
pub struct Config {
    /// Low-level SQLite connection options (file path, journal mode, etc.).
    #[cfg(feature = "embedded-db")]
    db_opt: SqliteConnectOptions,

    /// Low-level Postgres connection options (host, port, credentials, TLS, etc.).
    #[cfg(not(feature = "embedded-db"))]
    db_opt: PgConnectOptions,

    /// Options for the primary (sequencer) connection pool.
    pool_opt: PoolOptions<Db>,

    /// Extra pool_opt to allow separately configuring the connection pool for query service
    #[cfg(not(feature = "embedded-db"))]
    pool_opt_query: PoolOptions<Db>,

    /// Postgres schema that queries run against; defaults to `hotshot`.
    #[cfg(not(feature = "embedded-db"))]
    schema: String,
    /// If set, drop and recreate the schema (Postgres) or delete the DB file (SQLite) on connect.
    reset: bool,
    /// Custom migrations to merge with [`default_migrations`] when connecting.
    migrations: Vec<Migration>,
    /// If set, do not run migrations; fail instead if the DB is not already up to date.
    no_migrations: bool,
    /// Pruning policy; `None` disables pruning.
    pruner_cfg: Option<PrunerCfg>,
    /// If set, clear the stored pruned height on connect so pruned data is re-fetched.
    archive: bool,
    /// Pre-existing pool to reuse instead of opening new connections, when permitted.
    pool: Option<Pool<Db>>,
}
234
235#[cfg(not(feature = "embedded-db"))]
236impl Default for Config {
237    fn default() -> Self {
238        PgConnectOptions::default()
239            .username("postgres")
240            .password("password")
241            .host("localhost")
242            .port(5432)
243            .into()
244    }
245}
246
247#[cfg(feature = "embedded-db")]
248impl Default for Config {
249    fn default() -> Self {
250        SqliteConnectOptions::default()
251            .journal_mode(sqlx::sqlite::SqliteJournalMode::Wal)
252            .busy_timeout(Duration::from_secs(30))
253            .auto_vacuum(sqlx::sqlite::SqliteAutoVacuum::Incremental)
254            .create_if_missing(true)
255            .into()
256    }
257}
258
#[cfg(feature = "embedded-db")]
impl From<SqliteConnectOptions> for Config {
    /// Wrap SQLite connection options in a [`Config`] with all other settings neutral:
    /// default pool, no reset, no custom migrations, pruning disabled.
    fn from(db_opt: SqliteConnectOptions) -> Self {
        Config {
            db_opt,
            pool_opt: PoolOptions::default(),
            pool: None,
            reset: false,
            no_migrations: false,
            migrations: Vec::new(),
            pruner_cfg: None,
            archive: false,
        }
    }
}
274
275#[cfg(not(feature = "embedded-db"))]
276impl From<PgConnectOptions> for Config {
277    fn from(db_opt: PgConnectOptions) -> Self {
278        Self {
279            db_opt,
280            pool_opt: PoolOptions::default(),
281            pool_opt_query: PoolOptions::default(),
282            schema: "hotshot".into(),
283            reset: false,
284            migrations: vec![],
285            no_migrations: false,
286            pruner_cfg: None,
287            archive: false,
288            pool: None,
289        }
290    }
291}
292
293#[cfg(not(feature = "embedded-db"))]
294impl FromStr for Config {
295    type Err = <PgConnectOptions as FromStr>::Err;
296
297    fn from_str(s: &str) -> Result<Self, Self::Err> {
298        Ok(PgConnectOptions::from_str(s)?.into())
299    }
300}
301
#[cfg(feature = "embedded-db")]
impl FromStr for Config {
    type Err = <SqliteConnectOptions as FromStr>::Err;

    /// Parse an SQLite connection string (e.g. `sqlite://path/to/db`) into a [`Config`].
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        SqliteConnectOptions::from_str(s).map(Self::from)
    }
}
310
#[cfg(feature = "embedded-db")]
impl Config {
    /// Set how long SQLite waits on a locked database before a statement fails.
    pub fn busy_timeout(mut self, timeout: Duration) -> Self {
        let opts = self.db_opt.busy_timeout(timeout);
        self.db_opt = opts;
        self
    }

    /// Point the configuration at the SQLite database file at `path`.
    pub fn db_path(mut self, path: std::path::PathBuf) -> Self {
        let opts = self.db_opt.filename(path);
        self.db_opt = opts;
        self
    }
}
323
324#[cfg(not(feature = "embedded-db"))]
325impl Config {
326    /// Set the hostname of the database server.
327    ///
328    /// The default is `localhost`.
329    pub fn host(mut self, host: impl Into<String>) -> Self {
330        self.db_opt = self.db_opt.host(&host.into());
331        self
332    }
333
334    /// Set the port on which to connect to the database.
335    ///
336    /// The default is 5432, the default Postgres port.
337    pub fn port(mut self, port: u16) -> Self {
338        self.db_opt = self.db_opt.port(port);
339        self
340    }
341
342    /// Set the DB user to connect as.
343    pub fn user(mut self, user: &str) -> Self {
344        self.db_opt = self.db_opt.username(user);
345        self
346    }
347
348    /// Set a password for connecting to the database.
349    pub fn password(mut self, password: &str) -> Self {
350        self.db_opt = self.db_opt.password(password);
351        self
352    }
353
354    /// Set the name of the database to connect to.
355    pub fn database(mut self, database: &str) -> Self {
356        self.db_opt = self.db_opt.database(database);
357        self
358    }
359
360    /// Use TLS for an encrypted connection to the database.
361    ///
362    /// Note that an encrypted connection may be established even if this option is not set, as long
363    /// as both the client and server support it. This option merely causes connection to fail if an
364    /// encrypted stream cannot be established.
365    pub fn tls(mut self) -> Self {
366        self.db_opt = self.db_opt.ssl_mode(PgSslMode::Require);
367        self
368    }
369
370    /// Set the name of the schema to use for queries.
371    ///
372    /// The default schema is named `hotshot` and is created via the default migrations.
373    pub fn schema(mut self, schema: impl Into<String>) -> Self {
374        self.schema = schema.into();
375        self
376    }
377}
378
379impl Config {
380    /// Sets the database connection pool
381    /// This allows reusing an existing connection pool when building a new `SqlStorage` instance.
382    pub fn pool(mut self, pool: Pool<Db>) -> Self {
383        self.pool = Some(pool);
384        self
385    }
386
387    /// Reset the schema on connection.
388    ///
389    /// When this [`Config`] is used to [`connect`](Self::connect) a
390    /// [`SqlDataSource`](crate::data_source::SqlDataSource), if this option is set, the relevant
391    /// [`schema`](Self::schema) will first be dropped and then recreated, yielding a completely
392    /// fresh instance of the query service.
393    ///
394    /// This is a particularly useful capability for development and staging environments. Still, it
395    /// must be used with extreme caution, as using this will irrevocably delete any data pertaining
396    /// to the query service in the database.
397    pub fn reset_schema(mut self) -> Self {
398        self.reset = true;
399        self
400    }
401
402    /// Add custom migrations to run when connecting to the database.
403    pub fn migrations(mut self, migrations: impl IntoIterator<Item = Migration>) -> Self {
404        self.migrations.extend(migrations);
405        self
406    }
407
408    /// Skip all migrations when connecting to the database.
409    pub fn no_migrations(mut self) -> Self {
410        self.no_migrations = true;
411        self
412    }
413
414    /// Enable pruning with a given configuration.
415    ///
416    /// If [`archive`](Self::archive) was previously specified, this will override it.
417    pub fn pruner_cfg(mut self, cfg: PrunerCfg) -> Result<Self, Error> {
418        cfg.validate()?;
419        self.pruner_cfg = Some(cfg);
420        self.archive = false;
421        Ok(self)
422    }
423
424    /// Disable pruning and reconstruct previously pruned data.
425    ///
426    /// While running without pruning is the default behavior, the default will not try to
427    /// reconstruct data that was pruned in a previous run where pruning was enabled. This option
428    /// instructs the service to run without pruning _and_ reconstruct all previously pruned data by
429    /// fetching from peers.
430    ///
431    /// If [`pruner_cfg`](Self::pruner_cfg) was previously specified, this will override it.
432    pub fn archive(mut self) -> Self {
433        self.pruner_cfg = None;
434        self.archive = true;
435        self
436    }
437
438    /// Set the maximum idle time of a connection.
439    ///
440    /// Any connection which has been open and unused longer than this duration will be
441    /// automatically closed to reduce load on the server.
442    pub fn idle_connection_timeout(mut self, timeout: Duration) -> Self {
443        self.pool_opt = self.pool_opt.idle_timeout(Some(timeout));
444
445        #[cfg(not(feature = "embedded-db"))]
446        {
447            self.pool_opt_query = self.pool_opt_query.idle_timeout(Some(timeout));
448        }
449
450        self
451    }
452
453    /// Set the maximum lifetime of a connection.
454    ///
455    /// Any connection which has been open longer than this duration will be automatically closed
456    /// (and, if needed, replaced), even if it is otherwise healthy. It is good practice to refresh
457    /// even healthy connections once in a while (e.g. daily) in case of resource leaks in the
458    /// server implementation.
459    pub fn connection_timeout(mut self, timeout: Duration) -> Self {
460        self.pool_opt = self.pool_opt.max_lifetime(Some(timeout));
461
462        #[cfg(not(feature = "embedded-db"))]
463        {
464            self.pool_opt = self.pool_opt.max_lifetime(Some(timeout));
465        }
466
467        self
468    }
469
470    /// Set the minimum number of connections to maintain at any time.
471    ///
472    /// The data source will, to the best of its ability, maintain at least `min` open connections
473    /// at all times. This can be used to reduce the latency hit of opening new connections when at
474    /// least this many simultaneous connections are frequently needed.
475    pub fn min_connections(mut self, min: u32) -> Self {
476        self.pool_opt = self.pool_opt.min_connections(min);
477        self
478    }
479
480    #[cfg(not(feature = "embedded-db"))]
481    pub fn query_min_connections(mut self, min: u32) -> Self {
482        self.pool_opt_query = self.pool_opt_query.min_connections(min);
483        self
484    }
485
486    /// Set the maximum number of connections to maintain at any time.
487    ///
488    /// Once `max` connections are in use simultaneously, further attempts to acquire a connection
489    /// (or begin a transaction) will block until one of the existing connections is released.
490    pub fn max_connections(mut self, max: u32) -> Self {
491        self.pool_opt = self.pool_opt.max_connections(max);
492        self
493    }
494
495    #[cfg(not(feature = "embedded-db"))]
496    pub fn query_max_connections(mut self, max: u32) -> Self {
497        self.pool_opt_query = self.pool_opt_query.max_connections(max);
498        self
499    }
500
501    /// Log at WARN level any time a SQL statement takes longer than `threshold`.
502    ///
503    /// The default threshold is 1s.
504    pub fn slow_statement_threshold(mut self, threshold: Duration) -> Self {
505        self.db_opt = self
506            .db_opt
507            .log_slow_statements(LevelFilter::Warn, threshold);
508        self
509    }
510
511    /// Set the maximum time a single SQL statement is allowed to run before being canceled.
512    ///
513    /// This helps prevent queries from running indefinitely even when the client is dropped
514    #[cfg(not(feature = "embedded-db"))]
515    pub fn statement_timeout(mut self, timeout: Duration) -> Self {
516        // Format duration as milliseconds
517        // PostgreSQL interprets values without units as milliseconds
518        let timeout_ms = timeout.as_millis();
519        self.db_opt = self
520            .db_opt
521            .options([("statement_timeout", timeout_ms.to_string())]);
522        self
523    }
524
525    /// not supported for SQLite.
526    #[cfg(feature = "embedded-db")]
527    pub fn statement_timeout(self, _timeout: Duration) -> Self {
528        self
529    }
530}
531
/// Storage for the APIs provided in this crate, backed by a remote PostgreSQL database.
#[derive(Clone, Debug)]
pub struct SqlStorage {
    /// Connection pool shared by all transactions created from this storage.
    pool: Pool<Db>,
    /// Prometheus registry for this storage's metrics; exposed via [`HasMetrics`].
    metrics: PrometheusMetrics,
    /// Counters tracking connection-pool usage (see `transaction::PoolMetrics`).
    pool_metrics: PoolMetrics,
    /// Pruning policy; `None` means pruning is disabled.
    pruner_cfg: Option<PrunerCfg>,
}
540
/// Resumable state for an in-progress pruner run.
///
/// Each field caches a height computed during an earlier batch so a subsequent call to
/// [`PruneStorage::prune`] can resume from the exact point it left off instead of
/// recomputing from the database.
#[derive(Debug, Default)]
pub struct Pruner {
    /// Height up to which data has already been pruned, if known.
    pruned_height: Option<u64>,
    /// Cutoff height for the target-retention pruning pass, if already computed.
    target_height: Option<u64>,
    /// Cutoff height for the minimum-retention pruning pass, if already computed.
    minimum_retention_height: Option<u64>,
}
547
/// Selects which pool configuration [`SqlStorage::connect`] uses.
///
/// `Sequencer` connections use the primary pool options ([`Config::min_connections`] etc.),
/// while `Query` connections use the separately configured query-service pool options
/// (Postgres only; SQLite uses a single pool for both).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum StorageConnectionType {
    Sequencer,
    Query,
}
553
impl SqlStorage {
    /// Get a clone of the underlying connection pool.
    ///
    /// Useful for sharing one pool between storage instances via [`Config::pool`].
    pub fn pool(&self) -> Pool<Db> {
        self.pool.clone()
    }

    /// Connect to a remote database.
    ///
    /// In order, this: selects pool options by `connection_type` (Postgres only; SQLite has a
    /// single pool), reuses an existing pool from the config when allowed, installs a Postgres
    /// `after_connect` hook to set the search path, optionally resets the schema/database file,
    /// runs (or verifies) migrations, and clears the pruned height in archive mode.
    #[allow(unused_variables)]
    pub async fn connect(
        mut config: Config,
        connection_type: StorageConnectionType,
    ) -> Result<Self, Error> {
        let metrics = PrometheusMetrics::default();
        let pool_metrics = PoolMetrics::new(&*metrics.subgroup("sql".into()));

        // Pick the pool options: SQLite always uses the single configured pool; Postgres
        // chooses between the sequencer pool and the separately tuned query-service pool.
        #[cfg(feature = "embedded-db")]
        let pool = config.pool_opt.clone();
        #[cfg(not(feature = "embedded-db"))]
        let pool = match connection_type {
            StorageConnectionType::Sequencer => config.pool_opt.clone(),
            StorageConnectionType::Query => config.pool_opt_query.clone(),
        };

        let pruner_cfg = config.pruner_cfg;

        // Only reuse the same pool if we're using sqlite
        if cfg!(feature = "embedded-db") || connection_type == StorageConnectionType::Sequencer {
            // re-use the same pool if present and return early
            if let Some(pool) = config.pool {
                return Ok(Self {
                    metrics,
                    pool_metrics,
                    pool,
                    pruner_cfg,
                });
            }
        } else if config.pool.is_some() {
            tracing::info!("not reusing existing pool for query connection");
        }

        #[cfg(not(feature = "embedded-db"))]
        let schema = config.schema.clone();
        // Ensure every new pooled connection operates in the configured schema. The closure
        // only needs `config.schema` (edition-2021 disjoint capture), so `config` stays usable
        // below — NOTE(review): relies on disjoint closure capture; confirm edition >= 2021.
        #[cfg(not(feature = "embedded-db"))]
        let pool = pool.after_connect(move |conn, _| {
            let schema = config.schema.clone();
            async move {
                query(&format!("SET search_path TO {schema}"))
                    .execute(conn)
                    .await?;
                Ok(())
            }
            .boxed()
        });

        // For SQLite, a reset means deleting the database file itself.
        #[cfg(feature = "embedded-db")]
        if config.reset {
            std::fs::remove_file(config.db_opt.get_filename())?;
        }

        let pool = pool.connect_with(config.db_opt).await?;

        // Create or connect to the schema for this query service.
        let mut conn = pool.acquire().await?;

        // Disable statement timeout for migrations, as they can take a long time
        #[cfg(not(feature = "embedded-db"))]
        query("SET statement_timeout = 0")
            .execute(conn.as_mut())
            .await?;

        // For Postgres, a reset means dropping and recreating the schema.
        #[cfg(not(feature = "embedded-db"))]
        if config.reset {
            query(&format!("DROP SCHEMA IF EXISTS {schema} CASCADE"))
                .execute(conn.as_mut())
                .await?;
        }

        #[cfg(not(feature = "embedded-db"))]
        query(&format!("CREATE SCHEMA IF NOT EXISTS {schema}"))
            .execute(conn.as_mut())
            .await?;

        // Get migrations and interleave with custom migrations, sorting by version number.
        validate_migrations(&mut config.migrations)?;
        let migrations =
            add_custom_migrations(default_migrations(), config.migrations).collect::<Vec<_>>();

        // Get a migration runner. Depending on the config, we can either use this to actually run
        // the migrations or just check if the database is up to date.
        let runner = refinery::Runner::new(&migrations).set_grouped(true);

        if config.no_migrations {
            // We've been asked not to run any migrations. Abort if the DB is not already up to
            // date.
            let last_applied = runner
                .get_last_applied_migration_async(&mut Migrator::from(&mut conn))
                .await?;
            let last_expected = migrations.last();
            if last_applied.as_ref() != last_expected {
                return Err(Error::msg(format!(
                    "DB is out of date: last applied migration is {last_applied:?}, but expected \
                     {last_expected:?}"
                )));
            }
        } else {
            // Run migrations using `refinery`.
            match runner.run_async(&mut Migrator::from(&mut conn)).await {
                Ok(report) => {
                    tracing::info!("ran DB migrations: {report:?}");
                },
                Err(err) => {
                    tracing::error!("DB migrations failed: {:?}", err.report());
                    Err(err)?;
                },
            }
        }

        if config.archive {
            // If running in archive mode, ensure the pruned height is set to 0, so the fetcher will
            // reconstruct previously pruned data.
            query("DELETE FROM pruned_height WHERE id = 1")
                .execute(conn.as_mut())
                .await?;
        }

        conn.close().await?;

        Ok(Self {
            pool,
            pool_metrics,
            metrics,
            pruner_cfg,
        })
    }
}
688
689impl PrunerConfig for SqlStorage {
690    fn set_pruning_config(&mut self, cfg: PrunerCfg) {
691        self.pruner_cfg = Some(cfg);
692    }
693
694    fn get_pruning_config(&self) -> Option<PrunerCfg> {
695        self.pruner_cfg.clone()
696    }
697}
698
impl HasMetrics for SqlStorage {
    /// Expose the Prometheus metrics registry backing this storage instance.
    fn metrics(&self) -> &PrometheusMetrics {
        &self.metrics
    }
}
704
impl SqlStorage {
    /// Return the minimum block height present in the `header` table, or `None` if the
    /// table is empty.
    async fn get_minimum_height(&self) -> QueryResult<Option<u64>> {
        let mut tx = self.read().await.map_err(|err| QueryError::Error {
            message: err.to_string(),
        })?;
        // `MIN(height)` always yields exactly one row, whose column is NULL when the table
        // is empty — hence decoding as `Option<i64>`.
        let (Some(height),) =
            query_as::<(Option<i64>,)>("SELECT MIN(height) as height FROM header")
                .fetch_one(tx.as_mut())
                .await?
        else {
            return Ok(None);
        };
        Ok(Some(height as u64))
    }

    /// Return the height of the highest block whose timestamp is at most `timestamp`, or
    /// `None` if no block is that old.
    async fn get_height_by_timestamp(&self, timestamp: i64) -> QueryResult<Option<u64>> {
        let mut tx = self.read().await.map_err(|err| QueryError::Error {
            message: err.to_string(),
        })?;

        // We order by timestamp and then height, even though logically this is no different than
        // just ordering by height, since timestamps are monotonic. The reason is that this order
        // allows the query planner to efficiently solve the where clause and presort the results
        // based on the timestamp index. The remaining sort on height, which guarantees a unique
        // block if multiple blocks have the same timestamp, is very efficient, because there are
        // never more than a handful of blocks with the same timestamp.
        let Some((height,)) = query_as::<(i64,)>(
            "SELECT height FROM header
              WHERE timestamp <= $1
              ORDER BY timestamp DESC, height DESC
              LIMIT 1",
        )
        .bind(timestamp)
        .fetch_optional(tx.as_mut())
        .await?
        else {
            return Ok(None);
        };
        Ok(Some(height as u64))
    }

    /// Get the stored VID share for a given block, if one exists.
    pub async fn get_vid_share<Types>(&self, block_id: BlockId<Types>) -> QueryResult<VidShare>
    where
        Types: NodeType,
        Header<Types>: QueryableHeader<Types>,
    {
        // Read-only transaction; lookup is delegated to `AvailabilityStorage::vid_share`.
        let mut tx = self.read().await.map_err(|err| QueryError::Error {
            message: err.to_string(),
        })?;
        let share = tx.vid_share(block_id).await?;
        Ok(share)
    }

    /// Get the stored VID common data for a given block, if one exists.
    pub async fn get_vid_common<Types: NodeType>(
        &self,
        block_id: BlockId<Types>,
    ) -> QueryResult<VidCommonQueryData<Types>>
    where
        <Types as NodeType>::BlockPayload: QueryablePayload<Types>,
        <Types as NodeType>::BlockHeader: QueryableHeader<Types>,
    {
        // Read-only transaction; lookup is delegated to the transaction's `get_vid_common`.
        let mut tx = self.read().await.map_err(|err| QueryError::Error {
            message: err.to_string(),
        })?;
        let common = tx.get_vid_common(block_id).await?;
        Ok(common)
    }

    /// Get the stored VID common metadata for a given block, if one exists.
    pub async fn get_vid_common_metadata<Types: NodeType>(
        &self,
        block_id: BlockId<Types>,
    ) -> QueryResult<VidCommonMetadata<Types>>
    where
        <Types as NodeType>::BlockPayload: QueryablePayload<Types>,
        <Types as NodeType>::BlockHeader: QueryableHeader<Types>,
    {
        // Read-only transaction; lookup is delegated to the transaction's
        // `get_vid_common_metadata`.
        let mut tx = self.read().await.map_err(|err| QueryError::Error {
            message: err.to_string(),
        })?;
        let common = tx.get_vid_common_metadata(block_id).await?;
        Ok(common)
    }
}
791
#[async_trait]
impl PruneStorage for SqlStorage {
    type Pruner = Pruner;

    /// Return the total on-disk size of the database, in bytes.
    ///
    /// Postgres: asks the server via `pg_database_size(current_database())`.
    /// SQLite: computes `page_count * page_size` from the PRAGMA pseudo-tables.
    async fn get_disk_usage(&self) -> anyhow::Result<u64> {
        let mut tx = self.read().await?;

        #[cfg(not(feature = "embedded-db"))]
        let query = "SELECT pg_database_size(current_database())";

        #[cfg(feature = "embedded-db")]
        let query = "
            SELECT( (SELECT page_count FROM pragma_page_count) * (SELECT * FROM pragma_page_size)) \
                     AS total_bytes";

        let row = tx.fetch_one(query).await?;
        // Both backends report the size as a signed integer; a real database
        // size is never negative, so the cast to u64 below is safe in practice.
        let size: i64 = row.get(0);

        Ok(size as u64)
    }

    /// Trigger incremental vacuum to free up space in the SQLite database.
    /// Note: We don't vacuum the Postgres database,
    /// as there is no manual trigger for incremental vacuum,
    /// and a full vacuum can take a lot of time.
    #[cfg(feature = "embedded-db")]
    async fn vacuum(&self) -> anyhow::Result<()> {
        let config = self.get_pruning_config().ok_or(QueryError::Error {
            message: "Pruning config not found".to_string(),
        })?;
        let mut conn = self.pool().acquire().await?;
        // Frees up to `incremental_vacuum_pages()` pages from the SQLite
        // freelist per pruner run, bounding the cost of each vacuum.
        query(&format!(
            "PRAGMA incremental_vacuum({})",
            config.incremental_vacuum_pages()
        ))
        .execute(conn.as_mut())
        .await?;
        // NOTE(review): this closes the connection outright rather than
        // returning it to the pool — presumably intentional; confirm.
        conn.close().await?;
        Ok(())
    }

    /// Delete at most one batch of old data, if pruning is warranted.
    ///
    /// Progress (pruned/target/minimum-retention heights) is recorded in
    /// `pruner`, so repeated calls resume where the previous one left off.
    /// Data older than the target retention period is always eligible for
    /// deletion; data older than the minimum retention period is additionally
    /// deleted when disk usage exceeds the configured pruning threshold.
    ///
    /// Returns the new pruned height if a batch was deleted, or `None` when
    /// there was nothing to prune this round.
    ///
    /// Note: The prune operation may not immediately free up space even after rows are deleted.
    /// This is because a vacuum operation may be necessary to reclaim more space.
    /// PostgreSQL already performs auto vacuuming, so we are not including it here
    /// as running a vacuum operation can be resource-intensive.
    async fn prune(&self, pruner: &mut Pruner) -> anyhow::Result<Option<u64>> {
        let cfg = self.get_pruning_config().ok_or(QueryError::Error {
            message: "Pruning config not found".to_string(),
        })?;
        let batch_size = cfg.batch_size();
        let max_usage = cfg.max_usage();
        let state_tables = cfg.state_tables();

        // If a pruner run was already in progress, some variables may already be set,
        // depending on whether a batch was deleted and which batch it was (target or minimum retention).
        // This enables us to resume the pruner run from the exact heights.
        // If any of these values are not set, they can be loaded from the database if necessary.
        let mut minimum_retention_height = pruner.minimum_retention_height;
        let mut target_height = pruner.target_height;
        let pruned_height = match pruner.pruned_height {
            Some(h) => Some(h),
            None => {
                // No recorded progress: derive the effective pruned height
                // from the lowest block still present in the database.
                let Some(height) = self.get_minimum_height().await? else {
                    tracing::info!("database is empty, nothing to prune");
                    return Ok(None);
                };

                // Everything below the minimum stored height has already been
                // pruned (or never existed).
                if height > 0 { Some(height - 1) } else { None }
            },
        };

        // Prune data exceeding target retention in batches
        if pruner.target_height.is_none() {
            // Highest block older than the target retention window.
            let th = self
                .get_height_by_timestamp(
                    Utc::now().timestamp() - (cfg.target_retention().as_secs()) as i64,
                )
                .await?;
            target_height = th;
            pruner.target_height = target_height;
        };

        if let Some(th) = target_height
            && pruned_height < Some(th)
        {
            // Next batch covers `batch_size` heights past the pruned height
            // (starting from height 0 on the first run), capped at the target.
            let batch_end = match pruned_height {
                None => batch_size - 1,
                Some(h) => h + batch_size,
            };
            let to = min(batch_end, th);

            // Update pruned height first so the fetcher does not
            // try to fetch data that we are about to delete.
            let mut tx = self.write().await?;
            tx.save_pruned_height(to).await?;
            tx.commit().await.map_err(|e| QueryError::Error {
                message: format!("failed to commit save_pruned_height {e}"),
            })?;

            let mut tx = self.write().await?;
            tx.delete_batch(to).await?;
            tx.commit().await.map_err(|e| QueryError::Error {
                message: format!("failed to commit delete_batch {e}"),
            })?;

            // Prune state tables in a separate transaction.
            let mut tx = self.write().await?;
            tx.delete_state_batch(state_tables, to).await?;
            tx.commit().await.map_err(|e| QueryError::Error {
                message: format!("failed to commit {e}"),
            })?;

            pruner.pruned_height = Some(to);
            return Ok(Some(to));
        }

        // If threshold is set, prune data exceeding minimum retention in batches
        // This parameter is needed for SQL storage as there is no direct way to get free space.
        if let Some(threshold) = cfg.pruning_threshold() {
            let usage = self.get_disk_usage().await?;

            // Prune data exceeding minimum retention in batches starting from minimum height
            // until usage is below threshold
            if usage > threshold {
                tracing::warn!(
                    "Disk usage {usage} exceeds pruning threshold {:?}",
                    cfg.pruning_threshold()
                );

                if minimum_retention_height.is_none() {
                    // Highest block older than the minimum retention window.
                    minimum_retention_height = self
                        .get_height_by_timestamp(
                            Utc::now().timestamp() - (cfg.minimum_retention().as_secs()) as i64,
                        )
                        .await?;

                    pruner.minimum_retention_height = minimum_retention_height;
                }

                // NOTE(review): `max_usage` appears to be expressed in basis
                // points (1/100 of a percent), given the division by 10000 —
                // confirm against `PrunerCfg::max_usage` documentation.
                if let Some(min_retention_height) = minimum_retention_height
                    && (usage as f64 / threshold as f64) > (f64::from(max_usage) / 10000.0)
                    && pruned_height < Some(min_retention_height)
                {
                    let batch_end = match pruned_height {
                        None => batch_size - 1,
                        Some(h) => h + batch_size,
                    };
                    let to = min(batch_end, min_retention_height);
                    // Update pruned height first so the fetcher does not
                    // try to fetch data that we are about to delete.
                    let mut tx = self.write().await?;
                    tx.save_pruned_height(to).await?;
                    tx.commit().await.map_err(|e| QueryError::Error {
                        message: format!("failed to commit save_pruned_height {e}"),
                    })?;

                    let mut tx = self.write().await?;
                    tx.delete_batch(to).await?;
                    tx.commit().await.map_err(|e| QueryError::Error {
                        message: format!("failed to commit delete_batch {e}"),
                    })?;

                    // Prune state tables in a separate transaction.
                    let mut tx = self.write().await?;
                    tx.delete_state_batch(state_tables, to).await?;
                    tx.commit().await.map_err(|e| QueryError::Error {
                        message: format!("failed to commit {e}"),
                    })?;

                    // Reclaim freed pages right away since this branch only
                    // runs when disk usage is over the threshold. (The
                    // override above exists only for SQLite; presumably the
                    // trait provides a no-op default for Postgres — confirm.)
                    self.vacuum().await?;
                    pruner.pruned_height = Some(to);
                    return Ok(Some(to));
                }
            }
        }

        Ok(None)
    }
}
971
972impl VersionedDataSource for SqlStorage {
973    type Transaction<'a>
974        = Transaction<Write>
975    where
976        Self: 'a;
977    type ReadOnly<'a>
978        = Transaction<Read>
979    where
980        Self: 'a;
981
982    async fn write(&self) -> anyhow::Result<Transaction<Write>> {
983        Transaction::new(&self.pool, self.pool_metrics.clone()).await
984    }
985
986    async fn read(&self) -> anyhow::Result<Transaction<Read>> {
987        Transaction::new(&self.pool, self.pool_metrics.clone()).await
988    }
989}
990
991// These tests run the `postgres` Docker image, which doesn't work on Windows.
992#[cfg(all(any(test, feature = "testing"), not(target_os = "windows")))]
993pub mod testing {
994    #![allow(unused_imports)]
995    use std::{
996        env,
997        process::{Command, Stdio},
998        str::{self, FromStr},
999        time::Duration,
1000    };
1001
1002    use refinery::Migration;
1003    use test_utils::reserve_tcp_port;
1004    use tokio::{net::TcpStream, time::timeout};
1005
1006    use super::Config;
1007    use crate::testing::sleep;
    /// A temporary database for tests: a Dockerized Postgres instance by
    /// default, or a throwaway SQLite file when the `embedded-db` feature is
    /// enabled. Cleanup happens in `Drop` (see the `Drop` impls below).
    #[derive(Debug)]
    pub struct TmpDb {
        // Hostname where the Postgres container is reachable (defaults to
        // localhost, overridable via DOCKER_HOSTNAME).
        #[cfg(not(feature = "embedded-db"))]
        host: String,
        // Host port mapped to the container's Postgres port 5432.
        #[cfg(not(feature = "embedded-db"))]
        port: u16,
        // Docker container ID, used to stop/start the instance.
        #[cfg(not(feature = "embedded-db"))]
        container_id: String,
        // Path of the temporary SQLite database file.
        #[cfg(feature = "embedded-db")]
        db_path: std::path::PathBuf,
        // If true, the underlying database is kept when this handle is dropped
        // (SQLite file not deleted; Postgres container started without --rm).
        #[allow(dead_code)]
        persistent: bool,
    }
    impl TmpDb {
        /// Create a fresh SQLite database file in the system temp directory.
        #[cfg(feature = "embedded-db")]
        fn init_sqlite_db(persistent: bool) -> Self {
            let file = tempfile::Builder::new()
                .prefix("sqlite-")
                .suffix(".db")
                .tempfile()
                .unwrap();

            // Persist the temp file so it survives this scope; removal is
            // handled by `Drop` (unless `persistent` is set).
            let (_, db_path) = file.keep().unwrap();

            Self {
                db_path,
                persistent,
            }
        }
        /// Spin up a non-persistent temporary database (cleaned up on drop).
        pub async fn init() -> Self {
            #[cfg(feature = "embedded-db")]
            return Self::init_sqlite_db(false);

            #[cfg(not(feature = "embedded-db"))]
            Self::init_postgres(false).await
        }

        /// Spin up a persistent temporary database: the SQLite file (or the
        /// stopped Postgres container) is kept after this handle is dropped.
        pub async fn persistent() -> Self {
            #[cfg(feature = "embedded-db")]
            return Self::init_sqlite_db(true);

            #[cfg(not(feature = "embedded-db"))]
            Self::init_postgres(true).await
        }

        /// Launch a Postgres Docker container on a free host port and wait for
        /// it to accept connections.
        #[cfg(not(feature = "embedded-db"))]
        async fn init_postgres(persistent: bool) -> Self {
            let docker_hostname = env::var("DOCKER_HOSTNAME");
            // This picks an unused port on the current system.  If docker is
            // configured to run on a different host then this may not find a
            // "free" port on that system.
            // We *might* be able to get away with this as any remote docker
            // host should hopefully be pretty open with its port space.
            let port = reserve_tcp_port().unwrap();
            let host = docker_hostname.unwrap_or("localhost".to_string());

            let mut cmd = Command::new("docker");
            cmd.arg("run")
                .arg("-d")
                .args(["-p", &format!("{port}:5432")])
                .args(["-e", "POSTGRES_PASSWORD=password"]);

            // --rm makes Docker delete the container once it stops; omit it
            // for persistent databases so the container can be restarted.
            if !persistent {
                cmd.arg("--rm");
            }

            let output = cmd.arg("postgres").output().unwrap();
            let stdout = str::from_utf8(&output.stdout).unwrap();
            let stderr = str::from_utf8(&output.stderr).unwrap();
            if !output.status.success() {
                panic!("failed to start postgres docker: {stderr}");
            }

            // Create the TmpDb object immediately after starting the Docker container, so if
            // anything panics after this `drop` will be called and we will clean up.
            let container_id = stdout.trim().to_owned();
            tracing::info!("launched postgres docker {container_id}");
            let db = Self {
                host,
                port,
                container_id: container_id.clone(),
                persistent,
            };

            db.wait_for_ready().await;
            db
        }

        /// Hostname where the Postgres container is reachable.
        #[cfg(not(feature = "embedded-db"))]
        pub fn host(&self) -> String {
            self.host.clone()
        }

        /// Host port mapped to the container's Postgres port.
        #[cfg(not(feature = "embedded-db"))]
        pub fn port(&self) -> u16 {
            self.port
        }

        /// Path of the temporary SQLite database file.
        #[cfg(feature = "embedded-db")]
        pub fn path(&self) -> std::path::PathBuf {
            self.db_path.clone()
        }

        /// Build a connection `Config` for this database, including the test
        /// merkle tree migration on top of the standard migrations.
        pub fn config(&self) -> Config {
            #[cfg(feature = "embedded-db")]
            let mut cfg = Config::default().db_path(self.db_path.clone());

            #[cfg(not(feature = "embedded-db"))]
            let mut cfg = Config::default()
                .user("postgres")
                .password("password")
                .host(self.host())
                .port(self.port());

            cfg = cfg.migrations(vec![
                Migration::unapplied(
                    "V101__create_test_merkle_tree_table.sql",
                    &TestMerkleTreeMigration::create("test_tree"),
                )
                .unwrap(),
            ]);

            cfg
        }

        /// Stop the Postgres container (used by tests that simulate outages).
        #[cfg(not(feature = "embedded-db"))]
        pub fn stop_postgres(&mut self) {
            tracing::info!(container = self.container_id, "stopping postgres");
            let output = Command::new("docker")
                .args(["stop", self.container_id.as_str()])
                .output()
                .unwrap();
            assert!(
                output.status.success(),
                "error killing postgres docker {}: {}",
                self.container_id,
                str::from_utf8(&output.stderr).unwrap()
            );
        }

        /// Restart a previously stopped Postgres container and wait for it to
        /// accept connections again.
        #[cfg(not(feature = "embedded-db"))]
        pub async fn start_postgres(&mut self) {
            tracing::info!(container = self.container_id, "resuming postgres");
            let output = Command::new("docker")
                .args(["start", self.container_id.as_str()])
                .output()
                .unwrap();
            assert!(
                output.status.success(),
                "error starting postgres docker {}: {}",
                self.container_id,
                str::from_utf8(&output.stderr).unwrap()
            );

            self.wait_for_ready().await;
        }

        /// Poll `pg_isready` inside the container (then the host port mapping)
        /// until the database accepts connections, panicking after the timeout
        /// configured by SQL_TMP_DB_CONNECT_TIMEOUT (default 60s).
        #[cfg(not(feature = "embedded-db"))]
        async fn wait_for_ready(&self) {
            let timeout_duration = Duration::from_secs(
                env::var("SQL_TMP_DB_CONNECT_TIMEOUT")
                    .unwrap_or("60".to_string())
                    .parse()
                    .expect("SQL_TMP_DB_CONNECT_TIMEOUT must be an integer number of seconds"),
            );

            if let Err(err) = timeout(timeout_duration, async {
                while Command::new("docker")
                    .args([
                        "exec",
                        &self.container_id,
                        "pg_isready",
                        "-h",
                        "localhost",
                        "-U",
                        "postgres",
                    ])
                    .env("PGPASSWORD", "password")
                    // Null input so the command terminates as soon as it manages to connect.
                    .stdin(Stdio::null())
                    // Discard command output.
                    .stdout(Stdio::null())
                    .stderr(Stdio::null())
                    .status()
                    // We should ensure the exit status. A simple `unwrap`
                    // would panic on unrelated errors (such as network
                    // connection failures)
                    .and_then(|status| {
                        status
                            .success()
                            .then_some(true)
                            // Any ol' Error will do
                            .ok_or(std::io::Error::from_raw_os_error(666))
                    })
                    .is_err()
                {
                    tracing::warn!("database is not ready");
                    sleep(Duration::from_secs(1)).await;
                }

                // The above command ensures the database is ready inside the Docker container.
                // However, on some systems, there is a slight delay before the port is exposed via
                // host networking. We don't need to check again that the database is ready on the
                // host (and maybe can't, because the host might not have pg_isready installed), but
                // we can ensure the port is open by just establishing a TCP connection.
                while let Err(err) =
                    TcpStream::connect(format!("{}:{}", self.host, self.port)).await
                {
                    tracing::warn!("database is ready, but port is not available to host: {err:#}");
                    sleep(Duration::from_millis(100)).await;
                }
            })
            .await
            {
                panic!(
                    "failed to connect to TmpDb within configured timeout {timeout_duration:?}: \
                     {err:#}\n{}",
                    "Consider increasing the timeout by setting SQL_TMP_DB_CONNECT_TIMEOUT"
                );
            }
        }
    }
1230
    /// Stop the Postgres container when the handle is dropped. Containers
    /// started without `--rm` (persistent databases) are stopped but remain
    /// available to be restarted.
    #[cfg(not(feature = "embedded-db"))]
    impl Drop for TmpDb {
        fn drop(&mut self) {
            self.stop_postgres();
        }
    }
1237
1238    #[cfg(feature = "embedded-db")]
1239    impl Drop for TmpDb {
1240        fn drop(&mut self) {
1241            if !self.persistent {
1242                std::fs::remove_file(self.db_path.clone()).unwrap();
1243            }
1244        }
1245    }
1246
1247    pub struct TestMerkleTreeMigration;
1248
1249    impl TestMerkleTreeMigration {
1250        fn create(name: &str) -> String {
1251            let (bit_vec, binary, hash_pk, root_stored_column) = if cfg!(feature = "embedded-db") {
1252                (
1253                    "TEXT",
1254                    "BLOB",
1255                    "INTEGER PRIMARY KEY AUTOINCREMENT",
1256                    " (json_extract(data, '$.test_merkle_tree_root'))",
1257                )
1258            } else {
1259                (
1260                    "BIT(8)",
1261                    "BYTEA",
1262                    "SERIAL PRIMARY KEY",
1263                    "(data->>'test_merkle_tree_root')",
1264                )
1265            };
1266
1267            format!(
1268                "CREATE TABLE IF NOT EXISTS hash
1269            (
1270                id {hash_pk},
1271                value {binary}  NOT NULL UNIQUE
1272            );
1273
1274            ALTER TABLE header
1275            ADD column test_merkle_tree_root text
1276            GENERATED ALWAYS as {root_stored_column} STORED;
1277
1278            CREATE TABLE {name}
1279            (
1280                path JSONB NOT NULL,
1281                created BIGINT NOT NULL,
1282                hash_id INT NOT NULL,
1283                children JSONB,
1284                children_bitvec {bit_vec},
1285                idx JSONB,
1286                entry JSONB,
1287                PRIMARY KEY (path, created)
1288            );
1289            CREATE INDEX {name}_created ON {name} (created);"
1290            )
1291        }
1292    }
1293}
1294
1295// These tests run the `postgres` Docker image, which doesn't work on Windows.
1296#[cfg(all(test, not(target_os = "windows")))]
1297mod test {
1298    use std::time::Duration;
1299
1300    use hotshot_example_types::{
1301        node_types::TEST_VERSIONS,
1302        state_types::{TestInstanceState, TestValidatedState},
1303    };
1304    use jf_merkle_tree_compat::{
1305        MerkleTreeScheme, ToTraversalPath, UniversalMerkleTreeScheme, prelude::UniversalMerkleTree,
1306    };
1307    use tokio::time::sleep;
1308
1309    use super::{testing::TmpDb, *};
1310    use crate::{
1311        availability::{BlockQueryData, LeafQueryData},
1312        data_source::storage::{
1313            MerklizedStateStorage, UpdateAvailabilityStorage, pruning::PrunedHeightStorage,
1314        },
1315        merklized_state::{MerklizedState, Snapshot, UpdateStateData},
1316        testing::mocks::{MockMerkleTree, MockTypes},
1317    };
1318
    /// End-to-end check of the migration machinery: connecting with migrations
    /// disabled fails on an out-of-date schema, succeeds when up to date, and
    /// custom migrations are applied in version order regardless of the order
    /// they are supplied in.
    #[test_log::test(tokio::test(flavor = "multi_thread"))]
    async fn test_migrations() {
        let db = TmpDb::init().await;
        let cfg = db.config();

        // Helper: connect with or without running migrations, optionally
        // adding custom migrations on top of the defaults.
        let connect = |migrations: bool, custom_migrations| {
            let cfg = cfg.clone();
            async move {
                let mut cfg = cfg.migrations(custom_migrations);
                if !migrations {
                    cfg = cfg.no_migrations();
                }
                let client = SqlStorage::connect(cfg, StorageConnectionType::Query).await?;
                Ok::<_, Error>(client)
            }
        };

        // Connecting with migrations disabled should fail if the database is not already up to date
        // (since we've just created a fresh database, it isn't).
        let err = connect(false, vec![]).await.unwrap_err();
        tracing::info!("connecting without running migrations failed as expected: {err}");

        // Now connect and run migrations to bring the database up to date.
        connect(true, vec![]).await.unwrap();
        // Now connecting without migrations should work.
        connect(false, vec![]).await.unwrap();

        // Connect with some custom migrations, to advance the schema even further. Pass in the
        // custom migrations out of order; they should still execute in order of version number.
        // The SQL commands used here will fail if not run in order.
        let migrations = vec![
            Migration::unapplied(
                "V9999__create_test_table.sql",
                "ALTER TABLE test ADD COLUMN data INTEGER;",
            )
            .unwrap(),
            Migration::unapplied(
                "V9998__create_test_table.sql",
                "CREATE TABLE test (x bigint);",
            )
            .unwrap(),
        ];
        connect(true, migrations.clone()).await.unwrap();

        // Connect using the default schema (no custom migrations) and not running migrations. This
        // should fail because the database is _ahead_ of the client in terms of schema.
        let err = connect(false, vec![]).await.unwrap_err();
        tracing::info!("connecting without running migrations failed as expected: {err}");

        // Connecting with the customized schema should work even without running migrations.
        connect(true, migrations).await.unwrap();
    }
1371
    /// Parsing a Postgres connection URL should populate user, host, and port.
    #[test]
    #[cfg(not(feature = "embedded-db"))]
    fn test_config_from_str() {
        let cfg = Config::from_str("postgresql://user:password@host:8080").unwrap();
        assert_eq!(cfg.db_opt.get_username(), "user");
        assert_eq!(cfg.db_opt.get_host(), "host");
        assert_eq!(cfg.db_opt.get_port(), 8080);
    }
1380
    /// Parsing a SQLite connection URL should populate the database filename.
    #[test]
    #[cfg(feature = "embedded-db")]
    fn test_config_from_str() {
        let cfg = Config::from_str("sqlite://data.db").unwrap();
        assert_eq!(cfg.db_opt.get_filename().to_string_lossy(), "data.db");
    }
1387
1388    async fn vacuum(storage: &SqlStorage) {
1389        #[cfg(feature = "embedded-db")]
1390        let query = "PRAGMA incremental_vacuum(16000)";
1391        #[cfg(not(feature = "embedded-db"))]
1392        let query = "VACUUM";
1393        storage
1394            .pool
1395            .acquire()
1396            .await
1397            .unwrap()
1398            .execute(query)
1399            .await
1400            .unwrap();
1401    }
1402
    /// The pruner should delete nothing while data is within the retention
    /// period, and delete everything (cascading from the header table) once
    /// all data is older than a short target retention.
    #[test_log::test(tokio::test(flavor = "multi_thread"))]
    async fn test_target_period_pruning() {
        let db = TmpDb::init().await;
        let cfg = db.config();

        let mut storage = SqlStorage::connect(cfg, StorageConnectionType::Query)
            .await
            .unwrap();
        let mut leaf = LeafQueryData::<MockTypes>::genesis(
            &TestValidatedState::default(),
            &TestInstanceState::default(),
            TEST_VERSIONS.test,
        )
        .await;
        // insert some mock data
        for i in 0..20 {
            leaf.leaf.block_header_mut().block_number = i;
            leaf.leaf.block_header_mut().timestamp = Utc::now().timestamp() as u64;
            let mut tx = storage.write().await.unwrap();
            tx.insert_leaf(&leaf).await.unwrap();
            tx.commit().await.unwrap();
        }

        let height_before_pruning = storage.get_minimum_height().await.unwrap().unwrap();

        // Set pruner config to default which has minimum retention set to 1 day
        storage.set_pruning_config(PrunerCfg::new());
        // No data will be pruned
        let pruned_height = storage.prune(&mut Default::default()).await.unwrap();

        // Vacuum the database to reclaim space.
        // This is necessary to ensure the test passes.
        // Note: We don't perform a vacuum after each pruner run in production because the auto vacuum job handles it automatically.
        vacuum(&storage).await;
        // Pruned height should be none
        assert!(pruned_height.is_none());

        let height_after_pruning = storage.get_minimum_height().await.unwrap().unwrap();

        assert_eq!(
            height_after_pruning, height_before_pruning,
            "some data has been pruned"
        );

        // Set pruner config to target retention set to 1s
        storage.set_pruning_config(PrunerCfg::new().with_target_retention(Duration::from_secs(1)));
        sleep(Duration::from_secs(2)).await;
        let usage_before_pruning = storage.get_disk_usage().await.unwrap();
        // All of the data is now older than 1s.
        // This would prune all the data as the target retention is set to 1s
        let pruned_height = storage.prune(&mut Default::default()).await.unwrap();
        // Vacuum the database to reclaim space.
        // This is necessary to ensure the test passes.
        // Note: We don't perform a vacuum after each pruner run in production because the auto vacuum job handles it automatically.
        vacuum(&storage).await;

        // Pruned height should be some
        assert!(pruned_height.is_some());
        let usage_after_pruning = storage.get_disk_usage().await.unwrap();
        // All the tables should be empty
        // counting rows in header table
        let header_rows = storage
            .read()
            .await
            .unwrap()
            .fetch_one("select count(*) as count from header")
            .await
            .unwrap()
            .get::<i64, _>("count");
        // the table should be empty
        assert_eq!(header_rows, 0);

        // counting rows in leaf table.
        // Deleting rows from header table would delete rows in all the tables
        // as each of table implement "ON DELETE CASCADE" fk constraint with the header table.
        let leaf_rows = storage
            .read()
            .await
            .unwrap()
            .fetch_one("select count(*) as count from leaf")
            .await
            .unwrap()
            .get::<i64, _>("count");
        // the table should be empty
        assert_eq!(leaf_rows, 0);

        assert!(
            usage_before_pruning > usage_after_pruning,
            " disk usage should decrease after pruning"
        )
    }
1494
    /// Pruning state tables should collapse old node versions (keeping only
    /// the newest version of each path at or below the prune height) while
    /// leaving merkle proofs for the latest snapshot intact and verifiable.
    #[test_log::test(tokio::test(flavor = "multi_thread"))]
    async fn test_merklized_state_pruning() {
        let db = TmpDb::init().await;
        let storage = SqlStorage::connect(db.config(), StorageConnectionType::Query)
            .await
            .unwrap();

        let num_blocks = 10_000u64;
        let mut test_tree: UniversalMerkleTree<_, _, _, 8, _> =
            MockMerkleTree::new(MockMerkleTree::tree_height());

        // Insert entries and merkle nodes for each block height.
        let mut tx = storage.write().await.unwrap();
        for height in 0..num_blocks {
            test_tree.update(height as usize, height as usize).unwrap();

            // Each header records the tree's root commitment at this height.
            let test_data = serde_json::json!({
                MockMerkleTree::header_state_commitment_field():
                    serde_json::to_value(test_tree.commitment()).unwrap()
            });
            tx.upsert(
                "header",
                [
                    "height",
                    "hash",
                    "payload_hash",
                    "timestamp",
                    "data",
                    "ns_table",
                ],
                ["height"],
                [(
                    height as i64,
                    format!("hash{height}"),
                    "ph".to_string(),
                    0,
                    test_data,
                    "ns".to_string(),
                )],
            )
            .await
            .unwrap();

            // Store the merkle nodes along the path of the newly updated key.
            let (_, proof) = test_tree.lookup(height as usize).expect_ok().unwrap();
            let traversal_path = <usize as ToTraversalPath<8>>::to_traversal_path(
                &(height as usize),
                test_tree.height(),
            );
            UpdateStateData::<_, MockMerkleTree, 8>::insert_merkle_nodes(
                &mut tx,
                proof.clone(),
                traversal_path,
                height,
            )
            .await
            .unwrap();
        }
        UpdateStateData::<_, MockMerkleTree, 8>::set_last_state_height(
            &mut tx,
            num_blocks as usize,
        )
        .await
        .unwrap();
        tx.commit().await.unwrap();

        // Prune up to `prune_height`, keeping only the newest version of each node.
        let prune_height = 5678u64;
        let mut tx = storage.write().await.unwrap();
        tx.delete_state_batch(vec!["test_tree".to_string()], prune_height)
            .await
            .unwrap();
        tx.commit().await.unwrap();

        // Verify no paths have multiple versions at or below the prune height.
        let mut tx = storage.read().await.unwrap();
        let (duplicates,) = query_as::<(i64,)>(
            "SELECT count(*) FROM (SELECT count(*) FROM test_tree WHERE created <= $1 GROUP BY \
             path HAVING count(*) > 1) AS s",
        )
        .bind(prune_height as i64)
        .fetch_one(tx.as_mut())
        .await
        .unwrap();
        assert_eq!(
            duplicates, 0,
            "found {duplicates} paths with duplicate versions at or below prune height"
        );

        // Verify get_path still works for the latest snapshot and returns correct proofs.
        let commitment = test_tree.commitment();
        let mut tx = storage.read().await.unwrap();
        for key in 0..num_blocks as usize {
            let proof = MerklizedStateStorage::<MockTypes, MockMerkleTree, 8>::get_path(
                &mut tx,
                Snapshot::Index(num_blocks - 1),
                key,
            )
            .await
            .unwrap_or_else(|e| panic!("get_path failed for key {key} after pruning: {e:#}"));
            assert_eq!(
                proof.elem(),
                Some(&key),
                "proof for key {key} has wrong element: {:?}",
                proof.elem()
            );
            // Proofs must still verify against the latest root commitment.
            MockMerkleTree::verify(commitment, key, &proof)
                .unwrap()
                .unwrap();
        }
    }
1605
1606    #[test_log::test(tokio::test(flavor = "multi_thread"))]
1607    async fn test_minimum_retention_pruning() {
1608        let db = TmpDb::init().await;
1609
1610        let mut storage = SqlStorage::connect(db.config(), StorageConnectionType::Query)
1611            .await
1612            .unwrap();
1613        let mut leaf = LeafQueryData::<MockTypes>::genesis(
1614            &TestValidatedState::default(),
1615            &TestInstanceState::default(),
1616            TEST_VERSIONS.test,
1617        )
1618        .await;
1619        // insert some mock data
1620        for i in 0..20 {
1621            leaf.leaf.block_header_mut().block_number = i;
1622            leaf.leaf.block_header_mut().timestamp = Utc::now().timestamp() as u64;
1623            let mut tx = storage.write().await.unwrap();
1624            tx.insert_leaf(&leaf).await.unwrap();
1625            tx.commit().await.unwrap();
1626        }
1627
1628        let height_before_pruning = storage.get_minimum_height().await.unwrap().unwrap();
1629        let cfg = PrunerCfg::new();
1630        // Set pruning_threshold to 1
1631        // SQL storage size is more than 1000 bytes even without any data indexed
1632        // This would mean that the threshold would always be greater than the disk usage
1633        // However, minimum retention is set to 24 hours by default so the data would not be pruned
1634        storage.set_pruning_config(cfg.clone().with_pruning_threshold(1));
1635        println!("{:?}", storage.get_pruning_config().unwrap());
1636        // Pruning would not delete any data
1637        // All the data is younger than minimum retention period even though the usage > threshold
1638        let pruned_height = storage.prune(&mut Default::default()).await.unwrap();
1639        // Vacuum the database to reclaim space.
1640        // This is necessary to ensure the test passes.
1641        // Note: We don't perform a vacuum after each pruner run in production because the auto vacuum job handles it automatically.
1642        vacuum(&storage).await;
1643
1644        // Pruned height should be none
1645        assert!(pruned_height.is_none());
1646
1647        let height_after_pruning = storage.get_minimum_height().await.unwrap().unwrap();
1648
1649        assert_eq!(
1650            height_after_pruning, height_before_pruning,
1651            "some data has been pruned"
1652        );
1653
1654        // Change minimum retention to 1s
1655        storage.set_pruning_config(
1656            cfg.with_minimum_retention(Duration::from_secs(1))
1657                .with_pruning_threshold(1),
1658        );
1659        // sleep for 2s to make sure the data is older than minimum retention
1660        sleep(Duration::from_secs(2)).await;
1661        // This would prune all the data
1662        let pruned_height = storage.prune(&mut Default::default()).await.unwrap();
1663        // Vacuum the database to reclaim space.
1664        // This is necessary to ensure the test passes.
1665        // Note: We don't perform a vacuum after each pruner run in production because the auto vacuum job handles it automatically.
1666        vacuum(&storage).await;
1667
1668        // Pruned height should be some
1669        assert!(pruned_height.is_some());
1670        // All the tables should be empty
1671        // counting rows in header table
1672        let header_rows = storage
1673            .read()
1674            .await
1675            .unwrap()
1676            .fetch_one("select count(*) as count from header")
1677            .await
1678            .unwrap()
1679            .get::<i64, _>("count");
1680        // the table should be empty
1681        assert_eq!(header_rows, 0);
1682    }
1683
1684    #[tokio::test]
1685    #[test_log::test]
1686    async fn test_payload_pruning() {
1687        let db = TmpDb::init().await;
1688        let mut storage = SqlStorage::connect(db.config(), StorageConnectionType::Query)
1689            .await
1690            .unwrap();
1691        storage.set_pruning_config(Default::default());
1692
1693        // Insert some mock data.
1694        let mut leaf = LeafQueryData::<MockTypes>::genesis(
1695            &TestValidatedState::default(),
1696            &TestInstanceState::default(),
1697            TEST_VERSIONS.test,
1698        )
1699        .await;
1700        let block = BlockQueryData::<MockTypes>::genesis(
1701            &Default::default(),
1702            &Default::default(),
1703            TEST_VERSIONS.test.base,
1704        )
1705        .await;
1706        let vid = VidCommonQueryData::<MockTypes>::genesis(
1707            &Default::default(),
1708            &Default::default(),
1709            TEST_VERSIONS.test.base,
1710        )
1711        .await;
1712        {
1713            let mut tx = storage.write().await.unwrap();
1714            tx.insert_leaf(&leaf).await.unwrap();
1715            tx.insert_block(&block).await.unwrap();
1716            tx.insert_vid(&vid, None).await.unwrap();
1717            tx.commit().await.unwrap();
1718        }
1719
1720        // Insert a second leaf sharing the same payload.
1721        leaf.leaf.block_header_mut().block_number += 1;
1722        {
1723            let mut tx = storage.write().await.unwrap();
1724            tx.insert_leaf(&leaf).await.unwrap();
1725            tx.commit().await.unwrap();
1726        }
1727        {
1728            let mut tx = storage.read().await.unwrap();
1729            let (num_payloads,): (i64,) = query_as("SELECT count(*) FROM payload")
1730                .fetch_one(tx.as_mut())
1731                .await
1732                .unwrap();
1733            assert_eq!(num_payloads, 1);
1734            let (num_vid,): (i64,) = query_as("SELECT count(*) FROM vid_common")
1735                .fetch_one(tx.as_mut())
1736                .await
1737                .unwrap();
1738            assert_eq!(num_vid, 1);
1739        }
1740
1741        // Prune the first leaf but not the second (and thus not the payload or VID).
1742        let pruned_height = storage
1743            .prune(&mut Pruner {
1744                pruned_height: None,
1745                target_height: Some(0),
1746                minimum_retention_height: None,
1747            })
1748            .await
1749            .unwrap();
1750        tracing::info!(?pruned_height, "first pruning run complete");
1751        {
1752            let mut tx = storage.read().await.unwrap();
1753
1754            // First block is pruned.
1755            let err = tx
1756                .get_block(BlockId::<MockTypes>::Number(0))
1757                .await
1758                .unwrap_err();
1759            assert!(matches!(err, QueryError::NotFound), "{err:#}");
1760            let err = tx
1761                .get_vid_common(BlockId::<MockTypes>::Number(0))
1762                .await
1763                .unwrap_err();
1764            assert!(matches!(err, QueryError::NotFound), "{err:#}");
1765
1766            // Second block is still available.
1767            assert_eq!(
1768                tx.get_block(BlockId::<MockTypes>::Number(1)).await.unwrap(),
1769                BlockQueryData::new(leaf.header().clone(), block.payload)
1770            );
1771            assert_eq!(
1772                tx.get_vid_common(BlockId::<MockTypes>::Number(1))
1773                    .await
1774                    .unwrap(),
1775                VidCommonQueryData::new(leaf.header().clone(), vid.common)
1776            );
1777
1778            let (num_payloads,): (i64,) = query_as("SELECT count(*) FROM payload")
1779                .fetch_one(tx.as_mut())
1780                .await
1781                .unwrap();
1782            assert_eq!(num_payloads, 1);
1783
1784            let (num_vid,): (i64,) = query_as("SELECT count(*) FROM vid_common")
1785                .fetch_one(tx.as_mut())
1786                .await
1787                .unwrap();
1788            assert_eq!(num_vid, 1);
1789        }
1790
1791        // Now prune the second leaf, ensuring the payload and VID get deleted as well.
1792        let pruned_height = storage
1793            .prune(&mut Pruner {
1794                pruned_height,
1795                target_height: Some(1),
1796                minimum_retention_height: None,
1797            })
1798            .await
1799            .unwrap();
1800        tracing::info!(?pruned_height, "second pruning run complete");
1801
1802        let mut tx = storage.read().await.unwrap();
1803        for i in 0..2 {
1804            let err = tx
1805                .get_block(BlockId::<MockTypes>::Number(i))
1806                .await
1807                .unwrap_err();
1808            assert!(matches!(err, QueryError::NotFound), "{err:#}");
1809
1810            let err = tx
1811                .get_vid_common(BlockId::<MockTypes>::Number(i))
1812                .await
1813                .unwrap_err();
1814            assert!(matches!(err, QueryError::NotFound), "{err:#}");
1815        }
1816        let (num_payloads,): (i64,) = query_as("SELECT count(*) FROM payload")
1817            .fetch_one(tx.as_mut())
1818            .await
1819            .unwrap();
1820        assert_eq!(num_payloads, 0);
1821
1822        let (num_vid,): (i64,) = query_as("SELECT count(*) FROM vid_common")
1823            .fetch_one(tx.as_mut())
1824            .await
1825            .unwrap();
1826        assert_eq!(num_vid, 0);
1827    }
1828
1829    #[test_log::test(tokio::test(flavor = "multi_thread"))]
1830    async fn test_pruned_height_storage() {
1831        let db = TmpDb::init().await;
1832        let cfg = db.config();
1833
1834        let storage = SqlStorage::connect(cfg, StorageConnectionType::Query)
1835            .await
1836            .unwrap();
1837        assert!(
1838            storage
1839                .read()
1840                .await
1841                .unwrap()
1842                .load_pruned_height()
1843                .await
1844                .unwrap()
1845                .is_none()
1846        );
1847        for height in [10, 20, 30] {
1848            let mut tx = storage.write().await.unwrap();
1849            tx.save_pruned_height(height).await.unwrap();
1850            tx.commit().await.unwrap();
1851            assert_eq!(
1852                storage
1853                    .read()
1854                    .await
1855                    .unwrap()
1856                    .load_pruned_height()
1857                    .await
1858                    .unwrap(),
1859                Some(height)
1860            );
1861        }
1862    }
1863
1864    #[test_log::test(tokio::test(flavor = "multi_thread"))]
1865    async fn test_transaction_upsert_retries() {
1866        let db = TmpDb::init().await;
1867        let config = db.config();
1868
1869        let storage = SqlStorage::connect(config, StorageConnectionType::Query)
1870            .await
1871            .unwrap();
1872
1873        let mut tx = storage.write().await.unwrap();
1874
1875        // Try to upsert into a table that does not exist.
1876        // This will fail, so our `upsert` function will enter the retry loop.
1877        // Since the table does not exist, all retries will eventually
1878        // fail and we expect an error to be returned.
1879        //
1880        // Previously, this case would cause  a panic because we were calling
1881        // methods on `QueryBuilder` after `.build()` without first
1882        // calling `.reset()`and according to the sqlx docs, that always panics.
1883        // Now, since we are properly calling `.reset()` inside `upsert()` for
1884        // the query builder, the function returns an error instead of panicking.
1885        tx.upsert("does_not_exist", ["test"], ["test"], [(1_i64,)])
1886            .await
1887            .unwrap_err();
1888    }
1889}