diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 425dbbd3..0a5a2779 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -26,3 +26,29 @@ jobs: - uses: dsherret/rust-toolchain-file@v1 - uses: Swatinem/rust-cache@v2 - run: cargo clippy --all-targets --all-features -- -D clippy::correctness + + fmt: + name: rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + # rustfmt.toml uses nightly-only options (imports_granularity), so check with nightly. + - uses: dtolnay/rust-toolchain@nightly + with: + components: rustfmt + - run: cargo +nightly fmt --all -- --check + + test: + name: Test + runs-on: ubuntu-latest + if: github.event_name == 'push' || !github.event.pull_request.draft + steps: + - uses: actions/checkout@v4 + with: + lfs: true + - uses: dsherret/rust-toolchain-file@v1 + - uses: Swatinem/rust-cache@v2 + - name: Fetch LFS files + run: git lfs fetch --all && git lfs checkout + - name: Run tests + run: cargo test --all-features --jobs 2 diff --git a/crates/archive/src/archive.rs b/crates/archive/src/archive.rs index 524ae6a3..f17dc26b 100644 --- a/crates/archive/src/archive.rs +++ b/crates/archive/src/archive.rs @@ -1,22 +1,25 @@ -use crate::cli::{Cli, NetworkKind}; -use crate::fs::create_fs; -use crate::ingest::ingest_from_service; -use crate::layout::Layout; -use crate::metrics; -use crate::proc::Proc; -use crate::server::run_server; -use crate::writer::Writer; +use std::time::Duration; + use anyhow::{ensure, Context}; use prometheus_client::registry::Registry; -use sqd_data::bitcoin::tables::BitcoinChunkBuilder; -use sqd_data::evm::tables::EvmChunkBuilder; -use sqd_data::hyperliquid_fills::tables::HyperliquidFillsChunkBuilder; -use sqd_data::hyperliquid_replica_cmds::tables::HyperliquidReplicaCmdsChunkBuilder; -use sqd_data::solana::tables::SolanaChunkBuilder; -use sqd_data::tron::tables::TronChunkBuilder; +use sqd_data::{ + bitcoin::tables::BitcoinChunkBuilder, evm::tables::EvmChunkBuilder, + hyperliquid_fills::tables::HyperliquidFillsChunkBuilder, + hyperliquid_replica_cmds::tables::HyperliquidReplicaCmdsChunkBuilder, solana::tables::SolanaChunkBuilder, + tron::tables::TronChunkBuilder +}; use sqd_primitives::BlockNumber; -use std::time::Duration; +use crate::{ + cli::{Cli, NetworkKind}, + fs::create_fs, + ingest::ingest_from_service, + layout::Layout, + metrics, + proc::Proc, + server::run_server, + writer::Writer +}; pub async fn run(args: Cli) -> anyhow::Result<()> { ensure!( @@ -27,12 +30,9 @@ pub async fn run(args: Cli) -> anyhow::Result<()> { let fs = create_fs(&args.dest).await?; let layout = Layout::new(fs.clone()); - let chunk_tracker = layout.create_chunk_tracker( - &chunk_check, - args.top_dir_size, - args.first_block, - args.last_block - ).await?; + let chunk_tracker = layout + .create_chunk_tracker(&chunk_check, args.top_dir_size, args.first_block, args.last_block) + .await?; if let Some(last_block) = args.last_block { if chunk_tracker.next_block() > last_block { @@ -78,27 +78,26 @@ pub async fn run(args: Cli) -> anyhow::Result<()> { let attach_idx_field = args.attach_idx_field; let write_task = tokio::spawn(async move { let mut writer = Writer::new(fs, chunk_receiver, attach_idx_field); - writer.start().await + writer.start().await }); - + match write_task.await.context("write task panicked") { Ok(Ok(_)) => { proc_task.await.context("processing task panicked")??; - }, + } Ok(Err(err)) => { proc_task.abort(); - return Err(err) - }, + return Err(err); + } Err(err) => { proc_task.abort(); - return Err(err) + return Err(err); } } Ok(()) } - fn chunk_check(filelist: &[String]) -> bool { for file in filelist { if file.starts_with("blocks.parquet") { @@ -106,4 +105,4 @@ fn chunk_check(filelist: &[String]) -> bool { } } false -} \ No newline at end of file +} diff --git a/crates/archive/src/chunk_writer.rs b/crates/archive/src/chunk_writer.rs index 0b8d622e..6aa54787 100644 --- a/crates/archive/src/chunk_writer.rs +++ b/crates/archive/src/chunk_writer.rs @@ -1,20 +1,18 @@ use sqd_data_core::{BlockChunkBuilder, ChunkProcessor, PreparedChunk}; use sqd_dataset::DatasetDescriptionRef; - pub struct ChunkWriter { chunk_builder: B, processor: ChunkProcessor, - memory_threshold: usize, + memory_threshold: usize } - impl ChunkWriter { pub fn new(chunk_builder: B) -> anyhow::Result { Ok(Self { processor: chunk_builder.new_chunk_processor()?, chunk_builder, - memory_threshold: 40 * 1024 * 1024, + memory_threshold: 40 * 1024 * 1024 }) } @@ -52,4 +50,4 @@ impl ChunkWriter { let new_processor = self.chunk_builder.new_chunk_processor()?; std::mem::replace(&mut self.processor, new_processor).finish() } -} \ No newline at end of file +} diff --git a/crates/archive/src/cli.rs b/crates/archive/src/cli.rs index b52d61a8..dc193419 100644 --- a/crates/archive/src/cli.rs +++ b/crates/archive/src/cli.rs @@ -2,7 +2,6 @@ use clap::{value_parser, Parser, ValueEnum}; use sqd_primitives::BlockNumber; use url::Url; - #[derive(ValueEnum, Clone, Debug)] pub enum NetworkKind { Bitcoin, @@ -13,7 +12,6 @@ pub enum NetworkKind { Tron } - #[derive(Parser, Debug)] #[command(version, about, long_about = None)] pub struct Cli { @@ -67,5 +65,5 @@ pub struct Cli { /// Whether to attach an index field to each record #[arg(long)] - pub attach_idx_field: bool, + pub attach_idx_field: bool } diff --git a/crates/archive/src/fs/local.rs b/crates/archive/src/fs/local.rs index 8aa9d038..89016277 100644 --- a/crates/archive/src/fs/local.rs +++ b/crates/archive/src/fs/local.rs @@ -1,27 +1,26 @@ -use crate::fs::{FSRef, Fs}; +use std::{ + path::{Path, PathBuf}, + sync::Arc +}; + use async_trait::async_trait; -use std::path::{Path, PathBuf}; -use std::sync::Arc; +use crate::fs::{FSRef, Fs}; pub struct LocalFs { - root: PathBuf, + root: PathBuf } - impl LocalFs { pub fn new(root: impl Into) -> LocalFs { Self { root: root.into() } } } - #[async_trait] impl Fs for LocalFs { fn cd(&self, path: &str) -> FSRef { - Arc::new(Self::new( - self.root.join(path) - )) + Arc::new(Self::new(self.root.join(path))) } async fn ls(&self) -> anyhow::Result> { @@ -57,4 +56,4 @@ impl Fs for LocalFs { } Ok(()) } -} \ No newline at end of file +} diff --git a/crates/archive/src/fs/mod.rs b/crates/archive/src/fs/mod.rs index cc6ff42d..500c4478 100644 --- a/crates/archive/src/fs/mod.rs +++ b/crates/archive/src/fs/mod.rs @@ -1,40 +1,34 @@ -use crate::fs::local::LocalFs; -use crate::fs::s3::S3Fs; +use std::{path::Path, sync::Arc}; + use anyhow::{anyhow, bail, ensure}; use async_trait::async_trait; -use std::path::Path; -use std::sync::Arc; use url::Url; +use crate::fs::{local::LocalFs, s3::S3Fs}; pub mod local; pub mod s3; - pub type FSRef = Arc; - #[async_trait] pub trait Fs { fn cd(&self, path: &str) -> FSRef; - + async fn ls(&self) -> anyhow::Result>; async fn move_local(&self, local_src: &Path, dest: &str) -> anyhow::Result<()>; - + async fn delete(&self, path: &str) -> anyhow::Result<()>; } - pub async fn create_fs(url: &str) -> anyhow::Result { match Url::parse(url) { Ok(u) => { if u.scheme() == "s3" { ensure!(!u.cannot_be_a_base(), "invalid s3 url - {}", url); - let bucket = u.host_str().ok_or_else(|| { - anyhow!("bucket is missing in {}", url) - })?; + let bucket = u.host_str().ok_or_else(|| anyhow!("bucket is missing in {}", url))?; let mut config_loader = aws_config::from_env(); if let Ok(s3_endpoint) = std::env::var("AWS_S3_ENDPOINT") { @@ -42,7 +36,7 @@ pub async fn create_fs(url: &str) -> anyhow::Result { } let config = config_loader.load().await; let s3_client = aws_sdk_s3::Client::new(&config); - + let fs = S3Fs::new(s3_client, bucket.to_string()).cd(u.path()); Ok(fs) } else { diff --git a/crates/archive/src/fs/s3.rs b/crates/archive/src/fs/s3.rs index daa25234..e86f0382 100644 --- a/crates/archive/src/fs/s3.rs +++ b/crates/archive/src/fs/s3.rs @@ -1,10 +1,10 @@ -use crate::fs::{FSRef, Fs}; +use std::{path::Path, sync::Arc}; + use anyhow::{ensure, Context}; use async_trait::async_trait; use aws_sdk_s3::primitives::ByteStream; -use std::path::Path; -use std::sync::Arc; +use crate::fs::{FSRef, Fs}; pub struct S3Fs { client: aws_sdk_s3::Client, @@ -12,12 +12,8 @@ pub struct S3Fs { path: Vec } - impl S3Fs { - pub fn new( - s3_client: aws_sdk_s3::Client, - bucket: String - ) -> Self { + pub fn new(s3_client: aws_sdk_s3::Client, bucket: String) -> Self { Self { client: s3_client, bucket, @@ -27,22 +23,22 @@ impl S3Fs { fn resolve(&self, mut path: &str) -> Vec { let mut result = self.path.clone(); - + if path.starts_with('/') { result.clear(); path = &path[1..]; } - + if path.ends_with('/') { path = &path[0..path.len() - 1]; } for seg in path.split('/') { match seg { - "." | "" => {}, + "." | "" => {} ".." => { result.pop(); - }, + } s => { result.push(s.to_string()); } @@ -51,36 +47,31 @@ impl S3Fs { result } - + fn resolve_key(&self, path: &str) -> String { let key_segments = self.resolve(path); - let mut key = String::with_capacity( - key_segments.len() + key_segments.iter().map(|s| s.len()).sum::() + 1 - ); + let mut key = + String::with_capacity(key_segments.len() + key_segments.iter().map(|s| s.len()).sum::() + 1); for seg in key_segments { key.push_str(&seg); key.push('/'); } key.pop(); - + key } fn resolve_item_key(&self, path: &str) -> anyhow::Result { let key = self.resolve_key(path); - ensure!( - !key.is_empty(), - "'{}' resolves to a root path, not to an item", - path - ); + ensure!(!key.is_empty(), "'{}' resolves to a root path, not to an item", path); Ok(key) } async fn upload_directory(&self, local_src: &Path, dest: &str) -> anyhow::Result<()> { let mut dir = tokio::fs::read_dir(local_src).await?; - + while let Some(entry) = dir.next_entry().await? { let path = entry.path(); let file_name = entry.file_name().into_string().unwrap(); @@ -108,7 +99,6 @@ impl S3Fs { } } - #[async_trait] impl Fs for S3Fs { fn cd(&self, path: &str) -> FSRef { @@ -121,7 +111,7 @@ impl Fs for S3Fs { async fn ls(&self) -> anyhow::Result> { let mut prefix = self.resolve_key("."); - + if !self.path.is_empty() { prefix.push('/'); } @@ -155,10 +145,7 @@ impl Fs for S3Fs { for object in output.contents() { if let Some(key) = object.key() { - let file = key - .strip_prefix(&prefix) - .context("unexpected file name")? - .to_string(); + let file = key.strip_prefix(&prefix).context("unexpected file name")?.to_string(); items.push(file); } } @@ -203,7 +190,7 @@ impl Fs for S3Fs { .prefix(&prefix) .send() .await?; - + ensure!( !output.is_truncated.unwrap_or(false), "too many items under '{}', can't delete that much", @@ -213,12 +200,7 @@ impl Fs for S3Fs { if let Some(contents) = output.contents { for object in contents { if let Some(key) = object.key() { - self.client - .delete_object() - .bucket(&self.bucket) - .key(key) - .send() - .await?; + self.client.delete_object().bucket(&self.bucket).key(key).send().await?; } } } else { @@ -233,4 +215,4 @@ impl Fs for S3Fs { Ok(()) } -} \ No newline at end of file +} diff --git a/crates/archive/src/ingest.rs b/crates/archive/src/ingest.rs index b0880e30..a55a9bab 100644 --- a/crates/archive/src/ingest.rs +++ b/crates/archive/src/ingest.rs @@ -1,31 +1,30 @@ +use std::{ + io::{Error, ErrorKind}, + pin::pin, + time::Duration +}; + use async_stream::try_stream; use futures::{Stream, StreamExt, TryStreamExt}; -use serde::de::DeserializeOwned; -use serde::Serialize; +use serde::{de::DeserializeOwned, Serialize}; use sqd_primitives::{Block, BlockNumber}; -use std::io::{Error, ErrorKind}; -use std::pin::pin; -use std::time::Duration; use tokio::io::AsyncBufReadExt; use tokio_util::io::StreamReader; use tracing::{error, info}; use url::Url; - #[derive(Serialize)] struct BlockRange { from: BlockNumber, - to: Option, + to: Option } - pub fn ingest_from_service( url: Url, from: BlockNumber, to: Option, block_stream_interval: Duration -) -> impl Stream> -{ +) -> impl Stream> { try_stream! { let mut first_block = from; @@ -65,13 +64,11 @@ pub fn ingest_from_service( } } - fn fetch( url: Url, from: BlockNumber, to: Option -) -> impl Stream> -{ +) -> impl Stream> { try_stream! { let byte_stream = reqwest::Client::new() .post(url) @@ -81,9 +78,9 @@ fn fetch( .error_for_status()? .bytes_stream() .map_err(|err| Error::new(ErrorKind::Other, err)); - + let mut line_stream = StreamReader::new(byte_stream).lines(); - + while let Some(line) = line_stream.next_line().await? { let block = serde_json::from_str(&line)?; yield block; diff --git a/crates/archive/src/layout.rs b/crates/archive/src/layout.rs index f1d84be8..bf234f77 100644 --- a/crates/archive/src/layout.rs +++ b/crates/archive/src/layout.rs @@ -1,44 +1,36 @@ -use crate::fs::FSRef; +use std::{pin::pin, sync::LazyLock}; + use anyhow::ensure; use async_stream::try_stream; use futures::{Stream, StreamExt, TryStreamExt}; use regex::Regex; use sqd_primitives::BlockNumber; -use std::pin::pin; -use std::sync::LazyLock; +use crate::fs::FSRef; #[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct DataChunk { pub first_block: BlockNumber, pub last_block: BlockNumber, pub last_hash: String, - pub top: BlockNumber, + pub top: BlockNumber } - impl DataChunk { pub fn path(&self) -> String { format!( "{:010}/{:010}-{:010}-{}", - self.top, - self.first_block, - self.last_block, - self.last_hash + self.top, self.first_block, self.last_block, self.last_hash ) } } - fn format_block_number(block_number: BlockNumber) -> String { format!("{:010}", block_number) } - fn parse_range(dirname: &str) -> Option<(BlockNumber, BlockNumber, String)> { - static RE: LazyLock = LazyLock::new(|| { - Regex::new(r"^(\d+)-(\d+)-(\w+)$").unwrap() - }); + static RE: LazyLock = LazyLock::new(|| Regex::new(r"^(\d+)-(\d+)-(\w+)$").unwrap()); RE.captures(dirname).map(|caps| { let beg = caps[1].parse::().unwrap(); @@ -48,19 +40,19 @@ fn parse_range(dirname: &str) -> Option<(BlockNumber, BlockNumber, String)> { }) } - pub struct Layout { fs: FSRef } - impl Layout { pub fn new(fs: FSRef) -> Self { Self { fs } } pub async fn get_tops(&self) -> anyhow::Result> { - let mut tops: Vec = self.fs.ls() + let mut tops: Vec = self + .fs + .ls() .await? .into_iter() .filter(|s| s.parse::().is_ok()) @@ -79,18 +71,22 @@ impl Layout { } async fn get_top_chunks(&self, top: u64) -> anyhow::Result> { - self.fs.cd(&format_block_number(top)) + self.fs + .cd(&format_block_number(top)) .ls() .await? .into_iter() .filter_map(|s| { parse_range(&s).map(|(first_block, last_block, last_hash)| { - (s, DataChunk { - first_block, - last_block, - last_hash, - top - }) + ( + s, + DataChunk { + first_block, + last_block, + last_hash, + top + } + ) }) }) .map(|(s, chunk)| { @@ -114,8 +110,7 @@ impl Layout { &self, first_block: BlockNumber, last_block: Option - ) -> impl Stream> + '_ - { + ) -> impl Stream> + '_ { try_stream! { let last_block = last_block.unwrap_or(u64::MAX); if first_block > last_block { @@ -149,8 +144,7 @@ impl Layout { &self, first_block: BlockNumber, last_block: Option - ) -> impl Stream> + '_ - { + ) -> impl Stream> + '_ { try_stream! { let last_block = last_block.unwrap_or(u64::MAX); if first_block > last_block { @@ -179,9 +173,8 @@ impl Layout { chunk_check: &dyn Fn(&[String]) -> bool, top_dir_size: usize, first_block: BlockNumber, - last_block: Option, - ) -> anyhow::Result - { + last_block: Option + ) -> anyhow::Result { ensure!(first_block <= last_block.unwrap_or(BlockNumber::MAX)); let mut chunks = pin!(self.get_chunks(first_block, last_block)); @@ -237,40 +230,27 @@ impl Layout { } } - pub struct ChunkTracker { top_dir_size: usize, base_chunk_hash: Option, last_block_limit: BlockNumber, top: BlockNumber, - chunks: Vec, + chunks: Vec } - impl ChunkTracker { pub fn prev_chunk_hash(&self) -> Option<&str> { self.chunks .last() .map(|c| c.last_hash.as_ref()) - .or_else(|| { - self.base_chunk_hash.as_ref().map(|s| s.as_ref()) - }) + .or_else(|| self.base_chunk_hash.as_ref().map(|s| s.as_ref())) } pub fn next_block(&self) -> BlockNumber { - self.chunks - .last() - .map(|c| c.last_block + 1) - .unwrap_or(self.top) + self.chunks.last().map(|c| c.last_block + 1).unwrap_or(self.top) } - pub fn next_chunk( - &mut self, - first_block: BlockNumber, - last_block: BlockNumber, - last_hash: String - ) -> DataChunk - { + pub fn next_chunk(&mut self, first_block: BlockNumber, last_block: BlockNumber, last_hash: String) -> DataChunk { assert_eq!(self.next_block(), first_block); assert!(first_block <= last_block); assert!(last_block <= self.last_block_limit); @@ -284,7 +264,7 @@ impl ChunkTracker { first_block, last_block, last_hash, - top: self.top, + top: self.top }; self.chunks.push(chunk.clone()); diff --git a/crates/archive/src/main.rs b/crates/archive/src/main.rs index cca50457..1e55987b 100644 --- a/crates/archive/src/main.rs +++ b/crates/archive/src/main.rs @@ -10,7 +10,6 @@ mod progress; mod server; mod writer; - fn main() -> anyhow::Result<()> { let args = ::parse(); @@ -22,11 +21,10 @@ fn main() -> anyhow::Result<()> { .block_on(archive::run(args)) } - fn init_logging(json: bool) { let env_filter = tracing_subscriber::EnvFilter::builder().parse_lossy( std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV) - .unwrap_or(format!("{}=info", env!("CARGO_CRATE_NAME"))), + .unwrap_or(format!("{}=info", env!("CARGO_CRATE_NAME"))) ); if json { diff --git a/crates/archive/src/metrics.rs b/crates/archive/src/metrics.rs index ae8cc747..962ad3a4 100644 --- a/crates/archive/src/metrics.rs +++ b/crates/archive/src/metrics.rs @@ -1,9 +1,12 @@ -use prometheus_client::metrics::counter::Counter; -use prometheus_client::metrics::gauge::Gauge; -use prometheus_client::registry::Registry; -use std::sync::atomic::{AtomicI64, AtomicU64}; -use std::sync::LazyLock; +use std::sync::{ + atomic::{AtomicI64, AtomicU64}, + LazyLock +}; +use prometheus_client::{ + metrics::{counter::Counter, gauge::Gauge}, + registry::Registry +}; pub static PROGRESS: LazyLock> = LazyLock::new(Gauge::default); pub static LATEST_BLOCK_TIMESTAMP: LazyLock> = LazyLock::new(Gauge::default); @@ -12,7 +15,6 @@ pub static LATEST_SAVED_BLOCK: LazyLock> = LazyLock::new(G pub static LAST_BLOCK: LazyLock = LazyLock::new(Counter::default); pub static LAST_SAVED_BLOCK: LazyLock = LazyLock::new(Counter::default); - pub fn register_metrics(registry: &mut Registry) { registry.register( "sqd_progress_blocks_per_second", @@ -36,14 +38,6 @@ pub fn register_metrics(registry: &mut Registry) { ); // kept for compatibility with the old metrics - registry.register( - "sqd_last_block", - "Last ingested block", - LAST_BLOCK.clone() - ); - registry.register( - "sqd_last_saved_block", - "Last saved block", - LAST_SAVED_BLOCK.clone() - ); + registry.register("sqd_last_block", "Last ingested block", LAST_BLOCK.clone()); + registry.register("sqd_last_saved_block", "Last saved block", LAST_SAVED_BLOCK.clone()); } diff --git a/crates/archive/src/proc.rs b/crates/archive/src/proc.rs index f6d4b116..3b9eb788 100644 --- a/crates/archive/src/proc.rs +++ b/crates/archive/src/proc.rs @@ -1,18 +1,17 @@ -use crate::chunk_writer::ChunkWriter; -use crate::layout::ChunkTracker; -use crate::metrics; -use crate::progress::Progress; -use crate::writer::WriterItem; +use std::{ + num::NonZeroUsize, + pin::pin, + time::{Duration, Instant} +}; + use anyhow::ensure; use futures::{Stream, TryStreamExt}; use prometheus_client::metrics::gauge::Atomic; use sqd_data_core::BlockChunkBuilder; use sqd_primitives::{Block, BlockNumber, DataMask}; -use std::num::NonZeroUsize; -use std::pin::pin; -use std::time::{Duration, Instant}; use tracing::{enabled, info, Level}; +use crate::{chunk_writer::ChunkWriter, layout::ChunkTracker, metrics, progress::Progress, writer::WriterItem}; pub struct Proc { chunk_writer: ChunkWriter, @@ -31,14 +30,12 @@ pub struct Proc { last_progress_report: Instant } - impl> Proc { pub fn new( chunk_builder: B, chunk_tracker: ChunkTracker, chunk_sender: tokio::sync::mpsc::Sender - ) -> anyhow::Result - { + ) -> anyhow::Result { let chunk_writer = ChunkWriter::new(chunk_builder)?; let first_block = chunk_tracker.next_block(); Ok(Self { @@ -80,12 +77,12 @@ impl> Proc { start = false; if let Some(hash) = self.chunk_tracker.prev_chunk_hash() { ensure!( - hash == short_hash(block.parent_hash()) || hash == fallback_short_hash(block.parent_hash()), - "previous chunk hash {} does not match parent hash {} of block {}", - hash, - block.parent_hash(), - block.number() - ); + hash == short_hash(block.parent_hash()) || hash == fallback_short_hash(block.parent_hash()), + "previous chunk hash {} does not match parent hash {} of block {}", + hash, + block.parent_hash(), + block.number() + ); } } else { if self.validate_chain_continuity { @@ -133,7 +130,7 @@ impl> Proc { async fn submit_chunk(&mut self) -> anyhow::Result<()> { if self.blocks_buffered == 0 { - return Ok(()) + return Ok(()); } let mut data = self.chunk_writer.finish()?; @@ -145,13 +142,13 @@ impl> Proc { short_hash(&self.last_block_hash).to_string() ); - self.chunk_sender.send( - WriterItem { + self.chunk_sender + .send(WriterItem { data, chunk, description: self.chunk_writer.dataset_description() - } - ).await?; + }) + .await?; self.blocks_buffered = 0; self.first_block = self.last_block + 1; @@ -179,13 +176,11 @@ impl> Proc { } } - fn short_hash(value: &str) -> &str { let offset = value.len().saturating_sub(8); value.get(offset..).unwrap_or_default() } - fn fallback_short_hash(value: &str) -> &str { value.get(2..10).unwrap_or_default() -} \ No newline at end of file +} diff --git a/crates/archive/src/progress.rs b/crates/archive/src/progress.rs index 6dd5d065..64603b21 100644 --- a/crates/archive/src/progress.rs +++ b/crates/archive/src/progress.rs @@ -1,20 +1,20 @@ -use std::time::{Instant, Duration}; -use std::num::NonZeroUsize; - +use std::{ + num::NonZeroUsize, + time::{Duration, Instant} +}; #[derive(Clone, Debug)] struct ProgressUnit { value: u64, - time: Instant, + time: Instant } - pub struct Progress { window: Vec, tail: usize, size: usize, granularity: Duration, - has_news: bool, + has_news: bool } impl Progress { @@ -25,7 +25,7 @@ impl Progress { tail: 0, size: window_size.get() + 1, granularity: window_granularity, - has_news: false, + has_news: false } } diff --git a/crates/archive/src/server.rs b/crates/archive/src/server.rs index c037eaeb..5a64fef8 100644 --- a/crates/archive/src/server.rs +++ b/crates/archive/src/server.rs @@ -1,12 +1,15 @@ -use axum::http::header::CONTENT_TYPE; -use axum::http::HeaderMap; -use axum::response::IntoResponse; -use axum::routing::get; -use axum::{Extension, Router}; +use std::{ + net::SocketAddr, + sync::{Arc, LazyLock} +}; + +use axum::{ + http::{header::CONTENT_TYPE, HeaderMap}, + response::IntoResponse, + routing::get, + Extension, Router +}; use prometheus_client::registry::Registry; -use std::net::SocketAddr; -use std::sync::{Arc, LazyLock}; - async fn get_metrics(Extension(registry): Extension>) -> impl IntoResponse { static HEADERS: LazyLock = LazyLock::new(|| { @@ -15,7 +18,7 @@ async fn get_metrics(Extension(registry): Extension>) -> impl Into CONTENT_TYPE, "application/openmetrics-text; version=1.0.0; charset=utf-8" .parse() - .unwrap(), + .unwrap() ); headers }); @@ -25,7 +28,6 @@ async fn get_metrics(Extension(registry): Extension>) -> impl Into (HEADERS.clone(), buffer) } - pub async fn run_server(registry: Registry, port: u16) -> anyhow::Result<()> { let app = Router::new() .route("/metrics", get(get_metrics)) diff --git a/crates/archive/src/writer.rs b/crates/archive/src/writer.rs index 22d5b5b3..4e896171 100644 --- a/crates/archive/src/writer.rs +++ b/crates/archive/src/writer.rs @@ -1,38 +1,41 @@ -use crate::fs::FSRef; -use crate::layout::DataChunk; -use crate::metrics; -use arrow::array::{ArrayRef, Int32Array}; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::record_batch::RecordBatch; -use parquet::arrow::ArrowWriter; -use parquet::basic::{Compression, ZstdLevel}; -use parquet::file::properties::{EnabledStatistics, WriterProperties}; +use std::{fs::File, path::Path, sync::Arc}; + +use arrow::{ + array::{ArrayRef, Int32Array}, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch +}; +use parquet::{ + arrow::ArrowWriter, + basic::{Compression, ZstdLevel}, + file::properties::{EnabledStatistics, WriterProperties} +}; use prometheus_client::metrics::gauge::Atomic; use rayon::prelude::*; use sqd_data_core::PreparedChunk; use sqd_dataset::{DatasetDescriptionRef, TableDescription}; -use std::fs::File; -use std::path::Path; -use std::sync::Arc; +use crate::{fs::FSRef, layout::DataChunk, metrics}; pub struct WriterItem { pub chunk: DataChunk, pub data: PreparedChunk, - pub description: DatasetDescriptionRef, + pub description: DatasetDescriptionRef } - pub struct Writer { fs: FSRef, chunk_receiver: tokio::sync::mpsc::Receiver, - attach_index_field: bool, + attach_index_field: bool } - impl Writer { pub fn new(fs: FSRef, chunk_receiver: tokio::sync::mpsc::Receiver, attach_index_field: bool) -> Writer { - Writer { fs, chunk_receiver, attach_index_field } + Writer { + fs, + chunk_receiver, + attach_index_field + } } pub async fn start(&mut self) -> anyhow::Result<()> { @@ -68,7 +71,6 @@ impl Writer { } } - fn add_index_column(batch: &mut RecordBatch, offset: usize) -> anyhow::Result<()> { let num_rows = batch.num_rows(); let iter = offset as i32..(offset + num_rows) as i32; @@ -86,12 +88,11 @@ fn add_index_column(batch: &mut RecordBatch, offset: usize) -> anyhow::Result<() Ok(()) } - fn write_chunk( prepared_chunk: &mut PreparedChunk, dataset_description: &DatasetDescriptionRef, target_dir: &Path, - attach_index_field: bool, + attach_index_field: bool ) -> anyhow::Result> { let default_desc = TableDescription::default(); prepared_chunk @@ -103,10 +104,7 @@ fn write_chunk( new_fields.extend_from_slice(schema.fields()); schema = Arc::new(Schema::new(new_fields)); } - let desc = dataset_description - .tables - .get(name) - .unwrap_or(&default_desc); + let desc = dataset_description.tables.get(name).unwrap_or(&default_desc); let mut builder = WriterProperties::builder() .set_compression(Compression::ZSTD(ZstdLevel::try_new(6)?)) diff --git a/crates/array/benches/main.rs b/crates/array/benches/main.rs index 92ec6438..abf76e6e 100644 --- a/crates/array/benches/main.rs +++ b/crates/array/benches/main.rs @@ -2,41 +2,32 @@ use arrow::array::RecordBatch; use criterion::{criterion_group, criterion_main, Criterion}; use sqd_array::sort::sort_record_batch; - fn sort_setup(c: &mut Criterion) { let records = read_parquet("fixtures/solana-transactions.parquet").unwrap(); - + c.bench_function("sort solana transactions by idx: SQD", |bench| { - bench.iter(|| { - sort_record_batch(&records, ["_idx"]).expect("sorting failed") - }) + bench.iter(|| sort_record_batch(&records, ["_idx"]).expect("sorting failed")) }); - + c.bench_function("sort solana transactions by idx: ARROW", |bench| { bench.iter(|| { use arrow::compute::*; - - let indexes = sort_to_indices( - records.column_by_name("_idx").unwrap(), - None, - None - ).unwrap(); - + + let indexes = sort_to_indices(records.column_by_name("_idx").unwrap(), None, None).unwrap(); + take_record_batch(&records, &indexes).unwrap() }) }); } - criterion_group!(sorting, sort_setup); criterion_main!(sorting); - fn read_parquet(path: &str) -> anyhow::Result { - use std::fs::File; - use std::path::Path; + use std::{fs::File, path::Path}; + use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; - + let path = Path::new(env!("CARGO_MANIFEST_DIR")).join(path); let file = File::open(path)?; @@ -46,4 +37,4 @@ fn read_parquet(path: &str) -> anyhow::Result { let record_batch = reader.next().expect("no record batches")?; Ok(record_batch) -} \ No newline at end of file +} diff --git a/crates/array/src/access.rs b/crates/array/src/access.rs index c9eb601f..c816babd 100644 --- a/crates/array/src/access.rs +++ b/crates/array/src/access.rs @@ -1,8 +1,9 @@ -use arrow::array::{Array, ArrowPrimitiveType, BooleanArray, GenericByteArray, PrimitiveArray}; -use arrow::datatypes::ByteArrayType; +use arrow::{ + array::{Array, ArrowPrimitiveType, BooleanArray, GenericByteArray, PrimitiveArray}, + datatypes::ByteArrayType +}; use arrow_buffer::ArrowNativeType; - pub trait Access { type Value; @@ -18,8 +19,7 @@ pub trait Access { fn has_nulls(&self) -> bool; } - -impl <'a, T: Access> Access for &'a T { +impl<'a, T: Access> Access for &'a T { type Value = T::Value; #[inline] @@ -38,7 +38,6 @@ impl <'a, T: Access> Access for &'a T { } } - impl Access for BooleanArray { type Value = bool; @@ -57,8 +56,7 @@ impl Access for BooleanArray { } } - -impl Access for PrimitiveArray { +impl Access for PrimitiveArray { type Value = T::Native; #[inline] @@ -76,8 +74,7 @@ impl Access for PrimitiveArray { } } - -impl <'a, T: ByteArrayType> Access for &'a GenericByteArray { +impl<'a, T: ByteArrayType> Access for &'a GenericByteArray { type Value = &'a [u8]; fn get(&self, i: usize) -> Self::Value { diff --git a/crates/array/src/builder/aliases.rs b/crates/array/src/builder/aliases.rs index 1ed58bcf..0c8869f9 100644 --- a/crates/array/src/builder/aliases.rs +++ b/crates/array/src/builder/aliases.rs @@ -1,6 +1,9 @@ -use crate::builder::PrimitiveBuilder; -use arrow::datatypes::{Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMillisecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type}; +use arrow::datatypes::{ + Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMillisecondType, + TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type +}; +use crate::builder::PrimitiveBuilder; /// A signed 8-bit integer array builder. pub type Int8Builder = PrimitiveBuilder; @@ -28,4 +31,4 @@ pub type Float64Builder = PrimitiveBuilder; /// A timestamp second array builder. pub type TimestampSecondBuilder = PrimitiveBuilder; /// A timestamp millisecond array builder. -pub type TimestampMillisecondBuilder = PrimitiveBuilder; \ No newline at end of file +pub type TimestampMillisecondBuilder = PrimitiveBuilder; diff --git a/crates/array/src/builder/any.rs b/crates/array/src/builder/any.rs index eb44ffea..bc5d0328 100644 --- a/crates/array/src/builder/any.rs +++ b/crates/array/src/builder/any.rs @@ -1,11 +1,19 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::r#struct::AnyStructBuilder; -use crate::builder::{ArrayBuilder, BinaryBuilder, BooleanBuilder, ListBuilder, PrimitiveBuilder, StringBuilder, FixedSizeBinaryBuilder}; -use crate::slice::{AnySlice, AsSlice}; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::ArrayRef; -use arrow::datatypes::{DataType, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, TimeUnit, TimestampMillisecondType, TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type}; +use arrow::{ + array::ArrayRef, + datatypes::{ + DataType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, + TimestampMillisecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type + } +}; +use crate::{ + builder::{ + memory_writer::MemoryWriter, r#struct::AnyStructBuilder, ArrayBuilder, BinaryBuilder, BooleanBuilder, + FixedSizeBinaryBuilder, ListBuilder, PrimitiveBuilder, StringBuilder + }, + slice::{AnySlice, AsSlice}, + writer::{ArrayWriter, Writer} +}; pub enum AnyBuilder { Boolean(BooleanBuilder), @@ -28,7 +36,6 @@ pub enum AnyBuilder { Struct(AnyStructBuilder) } - impl ArrayWriter for AnyBuilder { type Writer = MemoryWriter; @@ -51,7 +58,7 @@ impl ArrayWriter for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.bitmask(buf), AnyBuilder::String(b) => b.bitmask(buf), AnyBuilder::List(b) => b.bitmask(buf), - AnyBuilder::Struct(b) => b.bitmask(buf), + AnyBuilder::Struct(b) => b.bitmask(buf) } } @@ -74,7 +81,7 @@ impl ArrayWriter for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.nullmask(buf), AnyBuilder::String(b) => b.nullmask(buf), AnyBuilder::List(b) => b.nullmask(buf), - AnyBuilder::Struct(b) => b.nullmask(buf), + AnyBuilder::Struct(b) => b.nullmask(buf) } } @@ -97,7 +104,7 @@ impl ArrayWriter for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.native(buf), AnyBuilder::String(b) => b.native(buf), AnyBuilder::List(b) => b.native(buf), - AnyBuilder::Struct(b) => b.native(buf), + AnyBuilder::Struct(b) => b.native(buf) } } @@ -120,12 +127,11 @@ impl ArrayWriter for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.offset(buf), AnyBuilder::String(b) => b.offset(buf), AnyBuilder::List(b) => b.offset(buf), - AnyBuilder::Struct(b) => b.offset(buf), + AnyBuilder::Struct(b) => b.offset(buf) } } } - impl AsSlice for AnyBuilder { type Slice<'a> = AnySlice<'a>; @@ -148,12 +154,11 @@ impl AsSlice for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.as_slice().into(), AnyBuilder::String(b) => b.as_slice().into(), AnyBuilder::List(b) => b.as_slice().into(), - AnyBuilder::Struct(b) => b.as_slice().into(), + AnyBuilder::Struct(b) => b.as_slice().into() } } } - impl ArrayBuilder for AnyBuilder { fn data_type(&self) -> DataType { match self { @@ -174,7 +179,7 @@ impl ArrayBuilder for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.data_type(), AnyBuilder::String(b) => b.data_type(), AnyBuilder::List(b) => b.data_type(), - AnyBuilder::Struct(b) => b.data_type(), + AnyBuilder::Struct(b) => b.data_type() } } @@ -197,7 +202,7 @@ impl ArrayBuilder for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.len(), AnyBuilder::String(b) => b.len(), AnyBuilder::List(b) => b.len(), - AnyBuilder::Struct(b) => b.len(), + AnyBuilder::Struct(b) => b.len() } } @@ -220,7 +225,7 @@ impl ArrayBuilder for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.byte_size(), AnyBuilder::String(b) => b.byte_size(), AnyBuilder::List(b) => b.byte_size(), - AnyBuilder::Struct(b) => b.byte_size(), + AnyBuilder::Struct(b) => b.byte_size() } } @@ -243,7 +248,7 @@ impl ArrayBuilder for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => b.clear(), AnyBuilder::String(b) => b.clear(), AnyBuilder::List(b) => b.clear(), - AnyBuilder::Struct(b) => b.clear(), + AnyBuilder::Struct(b) => b.clear() } } @@ -266,7 +271,7 @@ impl ArrayBuilder for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => ArrayBuilder::finish(b), AnyBuilder::String(b) => ArrayBuilder::finish(b), AnyBuilder::List(b) => ArrayBuilder::finish(*b), - AnyBuilder::Struct(b) => ArrayBuilder::finish(b), + AnyBuilder::Struct(b) => ArrayBuilder::finish(b) } } @@ -289,19 +294,17 @@ impl ArrayBuilder for AnyBuilder { AnyBuilder::FixedSizeBinary(b) => ArrayBuilder::finish_unchecked(b), AnyBuilder::String(b) => ArrayBuilder::finish_unchecked(b), AnyBuilder::List(b) => ArrayBuilder::finish_unchecked(*b), - AnyBuilder::Struct(b) => ArrayBuilder::finish_unchecked(b), + AnyBuilder::Struct(b) => ArrayBuilder::finish_unchecked(b) } } } - impl From for AnyBuilder { fn from(value: BooleanBuilder) -> Self { AnyBuilder::Boolean(value) } } - macro_rules! impl_from_primitive { ($kind:ident, $ty:ident) => { impl From> for AnyBuilder { @@ -324,42 +327,36 @@ impl_from_primitive!(Float64, Float64Type); impl_from_primitive!(TimestampSecond, TimestampSecondType); impl_from_primitive!(TimestampMillisecond, TimestampMillisecondType); - impl From for AnyBuilder { fn from(value: BinaryBuilder) -> Self { AnyBuilder::Binary(value) } } - impl From for AnyBuilder { fn from(value: StringBuilder) -> Self { AnyBuilder::String(value) } } - impl From for AnyBuilder { fn from(value: FixedSizeBinaryBuilder) -> Self { AnyBuilder::FixedSizeBinary(value) } } - impl From> for AnyBuilder { fn from(value: ListBuilder) -> Self { AnyBuilder::List(Box::new(value)) } } - impl From for AnyBuilder { fn from(value: AnyStructBuilder) -> Self { AnyBuilder::Struct(value) } } - impl AnyBuilder { pub fn new(data_type: &DataType) -> Self { match data_type { @@ -374,24 +371,16 @@ impl AnyBuilder { DataType::UInt64 => PrimitiveBuilder::::new(0).into(), DataType::Float32 => PrimitiveBuilder::::new(0).into(), DataType::Float64 => PrimitiveBuilder::::new(0).into(), - DataType::Timestamp(TimeUnit::Second, _) => { - PrimitiveBuilder::::new(0).into() - }, + DataType::Timestamp(TimeUnit::Second, _) => PrimitiveBuilder::::new(0).into(), DataType::Timestamp(TimeUnit::Millisecond, _) => { PrimitiveBuilder::::new(0).into() - }, + } DataType::Binary => BinaryBuilder::new(0, 0).into(), DataType::FixedSizeBinary(size) => FixedSizeBinaryBuilder::new(*size as usize, 0).into(), DataType::Utf8 => StringBuilder::new(0, 0).into(), - DataType::List(f) => { - ListBuilder::new( - 0, - Self::new(f.data_type()), - Some(f.name().to_string()) - ).into() - }, + DataType::List(f) => ListBuilder::new(0, Self::new(f.data_type()), Some(f.name().to_string())).into(), DataType::Struct(fields) => AnyStructBuilder::new(fields.iter().cloned().collect()).into(), ty => panic!("unsupported arrow type - {}", ty) } } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/binary.rs b/crates/array/src/builder/binary.rs index 824588cf..a44a53f4 100644 --- a/crates/array/src/builder/binary.rs +++ b/crates/array/src/builder/binary.rs @@ -1,15 +1,17 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::nullmask::NullmaskBuilder; -use crate::builder::offsets::OffsetsBuilder; -use crate::builder::ArrayBuilder; -use crate::slice::{AsSlice, ListSlice}; -use crate::util::invalid_buffer_access; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::{ArrayRef, BinaryArray, StringArray}; -use arrow::datatypes::DataType; -use arrow_buffer::MutableBuffer; use std::sync::Arc; +use arrow::{ + array::{ArrayRef, BinaryArray, StringArray}, + datatypes::DataType +}; +use arrow_buffer::MutableBuffer; + +use crate::{ + builder::{memory_writer::MemoryWriter, nullmask::NullmaskBuilder, offsets::OffsetsBuilder, ArrayBuilder}, + slice::{AsSlice, ListSlice}, + util::invalid_buffer_access, + writer::{ArrayWriter, Writer} +}; pub struct BinaryBuilder { nulls: NullmaskBuilder, @@ -17,7 +19,6 @@ pub struct BinaryBuilder { values: MutableBuffer } - impl BinaryBuilder { pub fn new(item_capacity: usize, content_capacity: usize) -> Self { Self { @@ -32,7 +33,7 @@ impl BinaryBuilder { self.nulls.append(true); self.offsets.append(self.values.len() as i32); } - + pub fn append_option(&mut self, val: Option<&[u8]>) { if let Some(val) = val { self.values.extend_from_slice(val); @@ -47,17 +48,12 @@ impl BinaryBuilder { self.nulls.append(false); self.offsets.append(self.values.len() as i32); } - + pub fn finish(self) -> BinaryArray { - BinaryArray::new( - self.offsets.finish(), - self.values.into(), - self.nulls.finish() - ) + BinaryArray::new(self.offsets.finish(), self.values.into(), self.nulls.finish()) } } - impl ArrayBuilder for BinaryBuilder { fn data_type(&self) -> DataType { DataType::Binary @@ -82,7 +78,6 @@ impl ArrayBuilder for BinaryBuilder { } } - impl ArrayWriter for BinaryBuilder { type Writer = MemoryWriter; @@ -118,7 +113,6 @@ impl ArrayWriter for BinaryBuilder { } } - impl AsSlice for BinaryBuilder { type Slice<'a> = ListSlice<'a, &'a [u8]>; @@ -131,14 +125,12 @@ impl AsSlice for BinaryBuilder { } } - impl Default for BinaryBuilder { fn default() -> Self { Self::new(0, 0) } } - pub struct StringBuilder { nulls: NullmaskBuilder, offsets: OffsetsBuilder, @@ -146,7 +138,6 @@ pub struct StringBuilder { validity: Option } - impl StringBuilder { pub fn new(item_capacity: usize, content_capacity: usize) -> Self { Self { @@ -177,7 +168,7 @@ impl StringBuilder { self.nulls.append(false); self.offsets.append(self.values.len() as i32); } - + fn mark_maybe_invalid(&mut self) { if self.validity.is_none() { self.validity = Some(self.offsets.as_slice().len()) @@ -185,23 +176,14 @@ impl StringBuilder { } pub fn finish(self) -> StringArray { - StringArray::new( - self.offsets.finish(), - self.values.into(), - self.nulls.finish() - ) + StringArray::new(self.offsets.finish(), self.values.into(), self.nulls.finish()) } - + pub unsafe fn finish_unchecked(self) -> StringArray { - StringArray::new_unchecked( - self.offsets.finish(), - self.values.into(), - self.nulls.finish() - ) + StringArray::new_unchecked(self.offsets.finish(), self.values.into(), self.nulls.finish()) } } - impl ArrayBuilder for StringBuilder { fn data_type(&self) -> DataType { DataType::Utf8 @@ -231,7 +213,6 @@ impl ArrayBuilder for StringBuilder { } } - impl ArrayWriter for StringBuilder { type Writer = MemoryWriter; @@ -269,7 +250,6 @@ impl ArrayWriter for StringBuilder { } } - impl AsSlice for StringBuilder { type Slice<'a> = ListSlice<'a, &'a [u8]>; @@ -282,18 +262,16 @@ impl AsSlice for StringBuilder { } } - impl Default for StringBuilder { fn default() -> Self { Self::new(0, 0) } } - impl std::fmt::Write for StringBuilder { #[inline] fn write_str(&mut self, s: &str) -> std::fmt::Result { self.values.extend_from_slice(s.as_bytes()); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/bitmask.rs b/crates/array/src/builder/bitmask.rs index 4e057445..7e493ecb 100644 --- a/crates/array/src/builder/bitmask.rs +++ b/crates/array/src/builder/bitmask.rs @@ -1,15 +1,12 @@ -use crate::index::RangeList; -use crate::slice::bitmask::BitmaskSlice; -use crate::writer::BitmaskWriter; use arrow_buffer::{bit_mask, bit_util, BooleanBuffer, MutableBuffer}; +use crate::{index::RangeList, slice::bitmask::BitmaskSlice, writer::BitmaskWriter}; pub struct BitmaskBuilder { buffer: MutableBuffer, - len: usize, + len: usize } - impl BitmaskBuilder { pub fn new(capacity: usize) -> Self { Self { @@ -26,12 +23,12 @@ impl BitmaskBuilder { self.buffer.clear(); self.len = 0 } - + pub fn reserve(&mut self, additional: usize) { let new_byte_len = bit_util::ceil(self.len + additional, 8); self.buffer.reserve(new_byte_len - self.buffer.len()) } - + #[inline] fn resize(&mut self, additional: usize) { let new_byte_len = bit_util::ceil(self.len + additional, 8); @@ -41,18 +38,12 @@ impl BitmaskBuilder { pub fn append_slice(&mut self, data: &[u8], offset: usize, len: usize) { self.resize(len); - bit_mask::set_bits( - self.buffer.as_slice_mut(), - data, - self.len, - offset, - len - ); + bit_mask::set_bits(self.buffer.as_slice_mut(), data, self.len, offset, len); self.len += len } - pub fn append_slice_indexes(&mut self, data: &[u8], mut indexes: impl Iterator) { + pub fn append_slice_indexes(&mut self, data: &[u8], mut indexes: impl Iterator) { let (min_bit_len, _) = indexes.size_hint(); self.resize(min_bit_len); @@ -71,18 +62,12 @@ impl BitmaskBuilder { self.append(bit_util::get_bit(data, i)) } } - + pub fn append_slice_ranges(&mut self, data: &[u8], ranges: &mut impl RangeList) { self.resize(ranges.span()); - + for r in ranges.iter() { - bit_mask::set_bits( - self.buffer.as_slice_mut(), - data, - self.len, - r.start, - r.len() - ); + bit_mask::set_bits(self.buffer.as_slice_mut(), data, self.len, r.start, r.len()); self.len += r.len(); } } @@ -98,10 +83,10 @@ impl BitmaskBuilder { // Pad last byte with 1s *self.buffer.as_slice_mut().last_mut().unwrap() |= !((1 << cur_remainder) - 1) } - + self.buffer.truncate(bit_util::ceil(self.len, 8)); self.buffer.resize(new_len_bytes, 0xFF); - + if new_remainder != 0 { // Clear remaining bits *self.buffer.as_slice_mut().last_mut().unwrap() &= (1 << new_remainder) - 1 @@ -123,7 +108,7 @@ impl BitmaskBuilder { pub fn data(&self) -> &[u8] { &self.buffer } - + pub fn as_slice(&self) -> BitmaskSlice<'_> { BitmaskSlice::new(self.data(), 0, self.len) } @@ -152,7 +137,6 @@ impl BitmaskBuilder { } } - impl BitmaskWriter for BitmaskBuilder { #[inline] fn write_slice(&mut self, data: &[u8], offset: usize, len: usize) -> anyhow::Result<()> { @@ -161,7 +145,7 @@ impl BitmaskWriter for BitmaskBuilder { } #[inline] - fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator) -> anyhow::Result<()> { + fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator) -> anyhow::Result<()> { self.append_slice_indexes(data, indexes); Ok(()) } @@ -177,4 +161,4 @@ impl BitmaskWriter for BitmaskBuilder { self.append_many(val, count); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/boolean.rs b/crates/array/src/builder/boolean.rs index 31c9b9e9..766104d1 100644 --- a/crates/array/src/builder/boolean.rs +++ b/crates/array/src/builder/boolean.rs @@ -1,21 +1,22 @@ -use crate::builder::bitmask::BitmaskBuilder; -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::nullmask::NullmaskBuilder; -use crate::builder::ArrayBuilder; -use crate::slice::{AsSlice, BooleanSlice}; -use crate::util::invalid_buffer_access; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::{ArrayRef, BooleanArray}; -use arrow::datatypes::DataType; use std::sync::Arc; +use arrow::{ + array::{ArrayRef, BooleanArray}, + datatypes::DataType +}; + +use crate::{ + builder::{bitmask::BitmaskBuilder, memory_writer::MemoryWriter, nullmask::NullmaskBuilder, ArrayBuilder}, + slice::{AsSlice, BooleanSlice}, + util::invalid_buffer_access, + writer::{ArrayWriter, Writer} +}; pub struct BooleanBuilder { nulls: NullmaskBuilder, values: BitmaskBuilder } - impl BooleanBuilder { pub fn new(capacity: usize) -> Self { Self { @@ -23,12 +24,12 @@ impl BooleanBuilder { values: BitmaskBuilder::new(capacity) } } - + pub fn append(&mut self, val: bool) { self.nulls.append(true); self.values.append(val) } - + pub fn append_option(&mut self, val: Option) { if let Some(val) = val { self.append(val) @@ -37,13 +38,12 @@ impl BooleanBuilder { self.values.append(false) } } - + pub fn finish(self) -> BooleanArray { BooleanArray::new(self.values.finish(), self.nulls.finish()) } } - impl ArrayBuilder for BooleanBuilder { fn data_type(&self) -> DataType { DataType::Boolean @@ -67,7 +67,6 @@ impl ArrayBuilder for BooleanBuilder { } } - impl ArrayWriter for BooleanBuilder { type Writer = MemoryWriter; @@ -98,7 +97,6 @@ impl ArrayWriter for BooleanBuilder { } } - impl AsSlice for BooleanBuilder { type Slice<'a> = BooleanSlice<'a>; @@ -107,9 +105,8 @@ impl AsSlice for BooleanBuilder { } } - impl Default for BooleanBuilder { fn default() -> Self { Self::new(0) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/fixed_size_binary.rs b/crates/array/src/builder/fixed_size_binary.rs index 3f302622..a36385f9 100644 --- a/crates/array/src/builder/fixed_size_binary.rs +++ b/crates/array/src/builder/fixed_size_binary.rs @@ -1,18 +1,22 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::nullmask::NullmaskBuilder; -use crate::builder::ArrayBuilder; -use crate::slice::{AsSlice, FixedSizeListSlice}; -use crate::util::invalid_buffer_access; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::{ArrayRef, FixedSizeBinaryArray}; -use arrow::datatypes::DataType; -use arrow_buffer::MutableBuffer; use std::sync::Arc; +use arrow::{ + array::{ArrayRef, FixedSizeBinaryArray}, + datatypes::DataType +}; +use arrow_buffer::MutableBuffer; + +use crate::{ + builder::{memory_writer::MemoryWriter, nullmask::NullmaskBuilder, ArrayBuilder}, + slice::{AsSlice, FixedSizeListSlice}, + util::invalid_buffer_access, + writer::{ArrayWriter, Writer} +}; + pub struct FixedSizeBinaryBuilder { size: usize, nulls: NullmaskBuilder, - values: MutableBuffer, + values: MutableBuffer } impl FixedSizeBinaryBuilder { @@ -20,7 +24,7 @@ impl FixedSizeBinaryBuilder { Self { size, nulls: NullmaskBuilder::new(item_capacity), - values: MutableBuffer::new(item_capacity * size), + values: MutableBuffer::new(item_capacity * size) } } @@ -106,11 +110,7 @@ impl AsSlice for FixedSizeBinaryBuilder { type Slice<'a> = FixedSizeListSlice<'a, &'a [u8]>; fn as_slice(&self) -> Self::Slice<'_> { - FixedSizeListSlice::new( - self.size, - self.values.as_slice(), - self.nulls.as_slice().bitmask(), - ) + FixedSizeListSlice::new(self.size, self.values.as_slice(), self.nulls.as_slice().bitmask()) } } diff --git a/crates/array/src/builder/list.rs b/crates/array/src/builder/list.rs index 3ba9fe7b..b25e929d 100644 --- a/crates/array/src/builder/list.rs +++ b/crates/array/src/builder/list.rs @@ -1,14 +1,16 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::nullmask::NullmaskBuilder; -use crate::builder::offsets::OffsetsBuilder; -use crate::builder::ArrayBuilder; -use crate::slice::{AsSlice, ListSlice}; -use crate::util::invalid_buffer_access; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::{ArrayRef, ListArray}; -use arrow::datatypes::{DataType, Field, FieldRef}; use std::sync::Arc; +use arrow::{ + array::{ArrayRef, ListArray}, + datatypes::{DataType, Field, FieldRef} +}; + +use crate::{ + builder::{memory_writer::MemoryWriter, nullmask::NullmaskBuilder, offsets::OffsetsBuilder, ArrayBuilder}, + slice::{AsSlice, ListSlice}, + util::invalid_buffer_access, + writer::{ArrayWriter, Writer} +}; pub struct ListBuilder { nulls: NullmaskBuilder, @@ -17,8 +19,7 @@ pub struct ListBuilder { field: FieldRef } - -impl ListBuilder { +impl ListBuilder { pub fn new(capacity: usize, values: T, field_name: Option) -> Self { let field = Field::new( field_name.unwrap_or_else(|| "item".to_string()), @@ -32,33 +33,32 @@ impl ListBuilder { field: Arc::new(field) } } - + pub fn append(&mut self) { self.nulls.append(true); self.offsets.append(self.values.len() as i32); } - + pub fn append_null(&mut self) { self.nulls.append(false); self.offsets.append(self.values.len() as i32); } - + pub fn values(&mut self) -> &mut T { &mut self.values } - + pub fn finish(self) -> ListArray { ListArray::new( - self.field, - self.offsets.finish(), - self.values.finish(), + self.field, + self.offsets.finish(), + self.values.finish(), self.nulls.finish() ) } } - -impl ArrayBuilder for ListBuilder { +impl ArrayBuilder for ListBuilder { fn data_type(&self) -> DataType { DataType::List(self.field.clone()) } @@ -82,8 +82,7 @@ impl ArrayBuilder for ListBuilder { } } - -impl > ArrayWriter for ListBuilder { +impl> ArrayWriter for ListBuilder { type Writer = MemoryWriter; #[inline] @@ -97,7 +96,7 @@ impl > ArrayWriter for ListBuilder { #[inline] fn nullmask(&mut self, buf: usize) -> &mut ::Nullmask { - match buf { + match buf { 0 => &mut self.nulls, 1 => invalid_buffer_access!(), i => self.values.nullmask(i - 2) @@ -115,7 +114,7 @@ impl > ArrayWriter for ListBuilder { #[inline] fn offset(&mut self, buf: usize) -> &mut ::Offset { - match buf { + match buf { 0 => invalid_buffer_access!(), 1 => &mut self.offsets, i => self.values.offset(i - 2) @@ -123,8 +122,7 @@ impl > ArrayWriter for ListBuilder { } } - -impl AsSlice for ListBuilder { +impl AsSlice for ListBuilder { type Slice<'a> = ListSlice<'a, T::Slice<'a>>; fn as_slice(&self) -> Self::Slice<'_> { @@ -136,9 +134,8 @@ impl AsSlice for ListBuilder { } } - -impl Default for ListBuilder { +impl Default for ListBuilder { fn default() -> Self { Self::new(0, T::default(), None) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/memory_writer.rs b/crates/array/src/builder/memory_writer.rs index 844b0f07..308883c2 100644 --- a/crates/array/src/builder/memory_writer.rs +++ b/crates/array/src/builder/memory_writer.rs @@ -1,16 +1,15 @@ use arrow_buffer::MutableBuffer; -use crate::builder::bitmask::BitmaskBuilder; -use crate::builder::nullmask::NullmaskBuilder; -use crate::builder::offsets::OffsetsBuilder; -use crate::writer::Writer; +use crate::{ + builder::{bitmask::BitmaskBuilder, nullmask::NullmaskBuilder, offsets::OffsetsBuilder}, + writer::Writer +}; pub struct MemoryWriter; - impl Writer for MemoryWriter { type Bitmask = BitmaskBuilder; type Nullmask = NullmaskBuilder; type Native = MutableBuffer; type Offset = OffsetsBuilder; -} \ No newline at end of file +} diff --git a/crates/array/src/builder/mod.rs b/crates/array/src/builder/mod.rs index 84e4c8a3..fe59cc2a 100644 --- a/crates/array/src/builder/mod.rs +++ b/crates/array/src/builder/mod.rs @@ -1,8 +1,7 @@ -use arrow::array::ArrayRef; -use arrow::datatypes::DataType; - +use arrow::{array::ArrayRef, datatypes::DataType}; mod aliases; +mod any; mod binary; pub mod bitmask; mod boolean; @@ -13,11 +12,9 @@ pub mod native; pub mod nullmask; pub mod offsets; mod primitive; -mod any; mod r#struct; mod table; - pub use aliases::*; pub use any::*; pub use binary::*; @@ -28,19 +25,18 @@ pub use primitive::*; pub use r#struct::*; pub use table::*; - pub trait ArrayBuilder: Sized { fn data_type(&self) -> DataType; - + fn len(&self) -> usize; - + fn byte_size(&self) -> usize; - + fn clear(&mut self); fn finish(self) -> ArrayRef; - + unsafe fn finish_unchecked(self) -> ArrayRef { self.finish() } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/native.rs b/crates/array/src/builder/native.rs index aab1e3c2..851f9602 100644 --- a/crates/array/src/builder/native.rs +++ b/crates/array/src/builder/native.rs @@ -1,7 +1,6 @@ -use crate::index::RangeList; -use crate::writer::NativeWriter; use arrow_buffer::{ArrowNativeType, MutableBuffer, ToByteSlice}; +use crate::{index::RangeList, writer::NativeWriter}; impl NativeWriter for MutableBuffer { #[inline] @@ -11,7 +10,7 @@ impl NativeWriter for MutableBuffer { } #[inline] - fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()> { + fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()> { self.extend(values); Ok(()) } @@ -25,9 +24,8 @@ impl NativeWriter for MutableBuffer { fn write_slice_indexes( &mut self, values: &[T], - mut indexes: impl Iterator - ) -> anyhow::Result<()> - { + mut indexes: impl Iterator + ) -> anyhow::Result<()> { let value_size = size_of::(); let min_byte_len = indexes.size_hint().0 * value_size; @@ -44,16 +42,12 @@ impl NativeWriter for MutableBuffer { byte_len += value_size; } } else { - unsafe { - self.set_len(byte_len) - } - return Ok(()) + unsafe { self.set_len(byte_len) } + return Ok(()); } } - unsafe { - self.set_len(byte_len) - } + unsafe { self.set_len(byte_len) } for i in indexes { self.push(values[i]) @@ -66,14 +60,13 @@ impl NativeWriter for MutableBuffer { &mut self, values: &[T], ranges: &mut impl RangeList - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { self.reserve(ranges.span() * size_of::()); - + for r in ranges.iter() { - self.extend_from_slice(&values[r]) + self.extend_from_slice(&values[r]) } - + Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/nullmask.rs b/crates/array/src/builder/nullmask.rs index a596205a..a1624cf6 100644 --- a/crates/array/src/builder/nullmask.rs +++ b/crates/array/src/builder/nullmask.rs @@ -1,10 +1,9 @@ -use crate::builder::bitmask::BitmaskBuilder; -use crate::index::RangeList; -use crate::slice::nullmask::NullmaskSlice; -use crate::util::bit_tools; -use crate::writer::BitmaskWriter; use arrow_buffer::NullBuffer; +use crate::{ + builder::bitmask::BitmaskBuilder, index::RangeList, slice::nullmask::NullmaskSlice, util::bit_tools, + writer::BitmaskWriter +}; pub struct NullmaskBuilder { nulls: BitmaskBuilder, @@ -13,21 +12,20 @@ pub struct NullmaskBuilder { capacity: usize } - impl NullmaskBuilder { pub fn new(capacity: usize) -> Self { Self { nulls: BitmaskBuilder::new(0), has_nulls: false, len: 0, - capacity, + capacity } } pub fn byte_size(&self) -> usize { self.nulls.bytes_size() } - + pub fn len(&self) -> usize { if self.has_nulls { self.nulls.len() @@ -41,7 +39,7 @@ impl NullmaskBuilder { self.len = 0; self.has_nulls = false } - + pub fn append_slice(&mut self, data: &[u8], offset: usize, len: usize) { if self.has_nulls { self.nulls.append_slice(data, offset, len); @@ -53,7 +51,7 @@ impl NullmaskBuilder { } } - pub fn append_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) { + pub fn append_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) { if self.has_nulls { self.nulls.append_slice_indexes(data, indexes); } else if let Some(len) = bit_tools::all_indexes_valid(data, indexes.clone()) { @@ -74,48 +72,46 @@ impl NullmaskBuilder { self.nulls.append_slice_ranges(data, ranges); } } - + pub fn append_many(&mut self, val: bool, count: usize) { if count == 0 { return; } match (self.has_nulls, val) { (true, val) => self.nulls.append_many(val, count), - (false, true) => { - self.len += count - }, + (false, true) => self.len += count, (false, false) => { self.init_nulls(count); self.nulls.append_many(false, count) } } } - + #[inline] pub fn append(&mut self, val: bool) { - match (self.has_nulls, val) { + match (self.has_nulls, val) { (true, val) => self.nulls.append(val), (false, true) => { self.len += 1; - }, + } (false, false) => { self.init_nulls(1); self.nulls.append(false) } } } - + fn init_nulls(&mut self, additional: usize) { let cap = std::cmp::max(self.capacity, self.len + additional); self.nulls.reserve(cap); self.nulls.append_many(true, self.len); self.has_nulls = true } - + pub fn finish(self) -> Option { self.has_nulls.then(|| NullBuffer::new(self.nulls.finish())) } - + pub fn as_slice(&self) -> NullmaskSlice<'_> { if self.has_nulls { NullmaskSlice::new(self.nulls.len(), Some(self.nulls.as_slice())) @@ -125,7 +121,6 @@ impl NullmaskBuilder { } } - impl BitmaskWriter for NullmaskBuilder { #[inline] fn write_slice(&mut self, data: &[u8], offset: usize, len: usize) -> anyhow::Result<()> { @@ -134,12 +129,7 @@ impl BitmaskWriter for NullmaskBuilder { } #[inline] - fn write_slice_indexes( - &mut self, - data: &[u8], - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()> { self.append_slice_indexes(data, indexes); Ok(()) } @@ -155,4 +145,4 @@ impl BitmaskWriter for NullmaskBuilder { self.append_many(val, count); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/offsets.rs b/crates/array/src/builder/offsets.rs index fade7fe6..e6f2ec41 100644 --- a/crates/array/src/builder/offsets.rs +++ b/crates/array/src/builder/offsets.rs @@ -1,33 +1,27 @@ -use crate::index::RangeList; -use crate::offsets::Offsets; -use crate::writer::OffsetsWriter; use arrow_buffer::{MutableBuffer, OffsetBuffer, ScalarBuffer}; +use crate::{index::RangeList, offsets::Offsets, writer::OffsetsWriter}; pub struct OffsetsBuilder { buffer: MutableBuffer, last_offset: i32 } - impl OffsetsBuilder { pub fn new(capacity: usize) -> Self { let mut buffer = MutableBuffer::new(capacity + 1); buffer.push(0i32); - Self { - buffer, - last_offset: 0 - } + Self { buffer, last_offset: 0 } } - + pub fn byte_size(&self) -> usize { self.buffer.len() } - + pub fn len(&self) -> usize { self.buffer.len() / size_of::() - 1 } - + pub fn shift(&mut self, len: usize) { if len == 0 { return; @@ -38,7 +32,7 @@ impl OffsetsBuilder { let new_byte_len = self.buffer.len() - byte_len; self.buffer.truncate(new_byte_len) } - + pub fn clear(&mut self) { self.buffer.truncate(size_of::()); self.buffer.typed_data_mut::()[0] = 0; @@ -48,26 +42,25 @@ impl OffsetsBuilder { pub fn append_slice(&mut self, offsets: Offsets<'_>) { let beg = offsets.first_offset(); - self.buffer.extend(offsets.values()[1..].iter().map(|o| { - *o - beg + self.last_offset - })); + self.buffer + .extend(offsets.values()[1..].iter().map(|o| *o - beg + self.last_offset)); self.last_offset += offsets.last_offset() - beg; } - - pub fn append_slice_indexes(&mut self, offsets: Offsets<'_>, indexes: impl Iterator) { + + pub fn append_slice_indexes(&mut self, offsets: Offsets<'_>, indexes: impl Iterator) { self.buffer.reserve(indexes.size_hint().0 * size_of::()); - + for i in indexes { let len = offsets.values()[i + 1] - offsets.values()[i]; self.last_offset += len; self.buffer.push(self.last_offset) } } - + pub fn append_slice_ranges(&mut self, offsets: Offsets<'_>, ranges: &mut impl RangeList) { self.buffer.reserve(ranges.span() * size_of::()); - + for r in ranges.iter() { self.append_slice(offsets.slice(r.start, r.len())) } @@ -93,7 +86,7 @@ impl OffsetsBuilder { OffsetBuffer::new_unchecked(scalar) } } - + pub fn as_slice(&self) -> Offsets<'_> { unsafe { // SAFETY: monotonicity and non-emptiness are guaranteed by construction @@ -102,7 +95,6 @@ impl OffsetsBuilder { } } - impl OffsetsWriter for OffsetsBuilder { #[inline] fn write_slice(&mut self, offsets: Offsets<'_>) -> anyhow::Result<()> { @@ -111,7 +103,11 @@ impl OffsetsWriter for OffsetsBuilder { } #[inline] - fn write_slice_indexes(&mut self, offsets: Offsets<'_>, indexes: impl Iterator) -> anyhow::Result<()> { + fn write_slice_indexes( + &mut self, + offsets: Offsets<'_>, + indexes: impl Iterator + ) -> anyhow::Result<()> { self.append_slice_indexes(offsets, indexes); Ok(()) } @@ -127,4 +123,4 @@ impl OffsetsWriter for OffsetsBuilder { self.append_len(len); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/primitive.rs b/crates/array/src/builder/primitive.rs index d6562311..3ef0b337 100644 --- a/crates/array/src/builder/primitive.rs +++ b/crates/array/src/builder/primitive.rs @@ -1,15 +1,17 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::nullmask::NullmaskBuilder; -use crate::util::invalid_buffer_access; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}; +use std::{marker::PhantomData, sync::Arc}; + +use arrow::{ + array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray}, + datatypes::DataType +}; use arrow_buffer::{ArrowNativeType, MutableBuffer, ScalarBuffer}; -use std::marker::PhantomData; -use std::sync::Arc; -use arrow::datatypes::DataType; -use crate::builder::ArrayBuilder; -use crate::slice::{AsSlice, PrimitiveSlice}; +use crate::{ + builder::{memory_writer::MemoryWriter, nullmask::NullmaskBuilder, ArrayBuilder}, + slice::{AsSlice, PrimitiveSlice}, + util::invalid_buffer_access, + writer::{ArrayWriter, Writer} +}; pub struct PrimitiveBuilder { nulls: NullmaskBuilder, @@ -17,8 +19,7 @@ pub struct PrimitiveBuilder { phantom_data: PhantomData } - -impl PrimitiveBuilder { +impl PrimitiveBuilder { pub fn new(capacity: usize) -> Self { Self { nulls: NullmaskBuilder::new(capacity), @@ -26,12 +27,12 @@ impl PrimitiveBuilder { phantom_data: PhantomData } } - + pub fn append(&mut self, val: T::Native) { self.nulls.append(true); self.values.push(val) } - + pub fn append_option(&mut self, val: Option) { if let Some(val) = val { self.append(val) @@ -40,17 +41,13 @@ impl PrimitiveBuilder { self.values.push(T::default_value()) } } - + pub fn finish(self) -> PrimitiveArray { - PrimitiveArray::new( - ScalarBuffer::from(self.values), - self.nulls.finish() - ) + PrimitiveArray::new(ScalarBuffer::from(self.values), self.nulls.finish()) } } - -impl ArrayBuilder for PrimitiveBuilder { +impl ArrayBuilder for PrimitiveBuilder { fn data_type(&self) -> DataType { T::DATA_TYPE } @@ -73,8 +70,7 @@ impl ArrayBuilder for PrimitiveBuilder { } } - -impl ArrayWriter for PrimitiveBuilder { +impl ArrayWriter for PrimitiveBuilder { type Writer = MemoryWriter; fn bitmask(&mut self, _buf: usize) -> &mut ::Bitmask { @@ -104,21 +100,16 @@ impl ArrayWriter for PrimitiveBuilder { } } - -impl AsSlice for PrimitiveBuilder { +impl AsSlice for PrimitiveBuilder { type Slice<'a> = PrimitiveSlice<'a, T::Native>; fn as_slice(&self) -> Self::Slice<'_> { - PrimitiveSlice::new( - self.values.typed_data::(), - self.nulls.as_slice().bitmask() - ) + PrimitiveSlice::new(self.values.typed_data::(), self.nulls.as_slice().bitmask()) } } - -impl Default for PrimitiveBuilder { +impl Default for PrimitiveBuilder { fn default() -> Self { Self::new(0) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/struct.rs b/crates/array/src/builder/struct.rs index 2a5f7fce..9116805f 100644 --- a/crates/array/src/builder/struct.rs +++ b/crates/array/src/builder/struct.rs @@ -1,13 +1,16 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::nullmask::NullmaskBuilder; -use crate::builder::{AnyBuilder, ArrayBuilder}; -use crate::slice::{AnyStructSlice, AsSlice}; -use crate::util::{bisect_offsets, build_field_offsets, invalid_buffer_access}; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::{ArrayRef, StructArray}; -use arrow::datatypes::{DataType, Fields}; use std::sync::Arc; +use arrow::{ + array::{ArrayRef, StructArray}, + datatypes::{DataType, Fields} +}; + +use crate::{ + builder::{memory_writer::MemoryWriter, nullmask::NullmaskBuilder, AnyBuilder, ArrayBuilder}, + slice::{AnyStructSlice, AsSlice}, + util::{bisect_offsets, build_field_offsets, invalid_buffer_access}, + writer::{ArrayWriter, Writer} +}; pub struct AnyStructBuilder { fields: Fields, @@ -16,15 +19,12 @@ pub struct AnyStructBuilder { columns: Vec } - impl AnyStructBuilder { pub fn new(fields: Fields) -> Self { let column_offsets = build_field_offsets(1, &fields); - - let columns = fields.iter() - .map(|f| AnyBuilder::new(f.data_type())) - .collect(); - + + let columns = fields.iter().map(|f| AnyBuilder::new(f.data_type())).collect(); + Self { fields, column_offsets, @@ -32,7 +32,7 @@ impl AnyStructBuilder { columns } } - + pub fn finish(self) -> StructArray { StructArray::new( self.fields, @@ -40,7 +40,7 @@ impl AnyStructBuilder { self.nulls.finish() ) } - + pub unsafe fn finish_unchecked(self) -> StructArray { StructArray::new_unchecked( self.fields, @@ -48,7 +48,7 @@ impl AnyStructBuilder { self.nulls.finish() ) } - + fn find_column(&self, buf: usize) -> (usize, usize) { if let Some(col) = bisect_offsets(&self.column_offsets, buf) { (col, buf - self.column_offsets[col]) @@ -58,7 +58,6 @@ impl AnyStructBuilder { } } - impl ArrayWriter for AnyStructBuilder { type Writer = MemoryWriter; @@ -87,7 +86,6 @@ impl ArrayWriter for AnyStructBuilder { } } - impl AsSlice for AnyStructBuilder { type Slice<'a> = AnyStructSlice<'a>; @@ -99,7 +97,6 @@ impl AsSlice for AnyStructBuilder { } } - impl ArrayBuilder for AnyStructBuilder { fn data_type(&self) -> DataType { DataType::Struct(self.fields.clone()) @@ -127,4 +124,4 @@ impl ArrayBuilder for AnyStructBuilder { unsafe fn finish_unchecked(self) -> ArrayRef { Arc::new(self.finish_unchecked()) } -} \ No newline at end of file +} diff --git a/crates/array/src/builder/table.rs b/crates/array/src/builder/table.rs index 412ac3a9..4fa6f184 100644 --- a/crates/array/src/builder/table.rs +++ b/crates/array/src/builder/table.rs @@ -1,11 +1,11 @@ -use crate::builder::memory_writer::MemoryWriter; -use crate::builder::{AnyBuilder, ArrayBuilder}; -use crate::slice::{AnyTableSlice, AsSlice}; -use crate::util::{bisect_offsets, build_field_offsets, invalid_buffer_access}; -use crate::writer::{ArrayWriter, Writer}; -use arrow::array::RecordBatch; -use arrow::datatypes::SchemaRef; +use arrow::{array::RecordBatch, datatypes::SchemaRef}; +use crate::{ + builder::{memory_writer::MemoryWriter, AnyBuilder, ArrayBuilder}, + slice::{AnyTableSlice, AsSlice}, + util::{bisect_offsets, build_field_offsets, invalid_buffer_access}, + writer::{ArrayWriter, Writer} +}; pub struct AnyTableBuilder { schema: SchemaRef, @@ -13,14 +13,11 @@ pub struct AnyTableBuilder { columns: Vec } - impl AnyTableBuilder { pub fn new(schema: SchemaRef) -> Self { let buffers = build_field_offsets(0, schema.fields()); - let columns = schema.fields().iter() - .map(|f| AnyBuilder::new(f.data_type())) - .collect(); + let columns = schema.fields().iter().map(|f| AnyBuilder::new(f.data_type())).collect(); Self { schema, @@ -30,27 +27,25 @@ impl AnyTableBuilder { } pub fn finish(self) -> RecordBatch { - RecordBatch::try_new( - self.schema, - self.columns.into_iter().map(|c| c.finish()).collect() - ).unwrap() + RecordBatch::try_new(self.schema, self.columns.into_iter().map(|c| c.finish()).collect()).unwrap() } pub unsafe fn finish_unchecked(self) -> RecordBatch { RecordBatch::try_new( self.schema, self.columns.into_iter().map(|c| c.finish_unchecked()).collect() - ).unwrap() + ) + .unwrap() } pub fn num_columns(&self) -> usize { self.columns.len() } - - pub fn column_writer(&mut self, column: usize) -> &mut impl ArrayWriter { + + pub fn column_writer(&mut self, column: usize) -> &mut impl ArrayWriter { &mut self.columns[column] } - + pub fn clear(&mut self) { for c in self.columns.iter_mut() { c.clear() @@ -66,7 +61,6 @@ impl AnyTableBuilder { } } - impl ArrayWriter for AnyTableBuilder { type Writer = MemoryWriter; @@ -91,13 +85,10 @@ impl ArrayWriter for AnyTableBuilder { } } - impl AsSlice for AnyTableBuilder { type Slice<'a> = AnyTableSlice<'a>; fn as_slice(&self) -> Self::Slice<'_> { - AnyTableSlice::new( - self.columns.iter().map(|c| c.as_slice()).collect() - ) + AnyTableSlice::new(self.columns.iter().map(|c| c.as_slice()).collect()) } -} \ No newline at end of file +} diff --git a/crates/array/src/chunking.rs b/crates/array/src/chunking.rs index 35818a4f..eeab0bca 100644 --- a/crates/array/src/chunking.rs +++ b/crates/array/src/chunking.rs @@ -1,6 +1,5 @@ use crate::util::get_offset_position; - #[derive(Clone, Default, Debug)] pub struct ChunkRange { pub chunk: u32, @@ -8,7 +7,6 @@ pub struct ChunkRange { pub len: u32 } - impl ChunkRange { pub fn new(chunk: usize, offset: usize, len: usize) -> Self { Self { @@ -17,7 +15,7 @@ impl ChunkRange { len: len as u32 } } - + #[inline] pub fn chunk_index(&self) -> usize { self.chunk as usize @@ -27,40 +25,27 @@ impl ChunkRange { pub fn offset_index(&self) -> usize { self.offset as usize } - + #[inline] pub fn len_index(&self) -> usize { self.len as usize } - - pub fn build_abs_order_list( - chunk_offsets: &[usize], - order: &[usize] - ) -> Vec - { + + pub fn build_abs_order_list(chunk_offsets: &[usize], order: &[usize]) -> Vec { Self::build_order_list(chunk_offsets, order, true) } - pub fn build_rel_order_list( - chunk_offsets: &[usize], - order: &[usize] - ) -> Vec - { + pub fn build_rel_order_list(chunk_offsets: &[usize], order: &[usize]) -> Vec { Self::build_order_list(chunk_offsets, order, false) } - fn build_order_list( - chunk_offsets: &[usize], - order: &[usize], - is_abs: bool - ) -> Vec - { + fn build_order_list(chunk_offsets: &[usize], order: &[usize], is_abs: bool) -> Vec { let mut chunks = Vec::new(); let mut last = ChunkRange { chunk: chunk_offsets.len() as u32 - 1, // never existing chunk offset: 0, - len: 0, + len: 0 }; let mut prev = 0; @@ -87,13 +72,12 @@ impl ChunkRange { } } - #[cfg(test)] mod test { - use crate::chunking::ChunkRange; - use crate::util::build_offsets; use proptest::prelude::*; + use crate::{chunking::ChunkRange, util::build_offsets}; + #[test] fn test_abs_order_list_building() { let arb = prop::collection::vec(1..20usize, 100).prop_flat_map(|lengths| { diff --git a/crates/array/src/index/mod.rs b/crates/array/src/index/mod.rs index 36eedd3f..af433865 100644 --- a/crates/array/src/index/mod.rs +++ b/crates/array/src/index/mod.rs @@ -1,44 +1,35 @@ use std::ops::Range; - pub trait RangeList { - fn iter(&self) -> impl Iterator> + Clone; + fn iter(&self) -> impl Iterator> + Clone; fn span(&mut self) -> usize; #[inline] fn shift(&mut self, offset: usize, len: usize) -> impl RangeList { - ShiftedRangeList { - src: self, - offset, - len - } + ShiftedRangeList { src: self, offset, len } } #[inline] fn scale(&mut self, factor: usize) -> impl RangeList { - ScaledRangeList { - src: self, - factor - } + ScaledRangeList { src: self, factor } } } - struct ShiftedRangeList<'a, S: ?Sized> { src: &'a mut S, offset: usize, len: usize } - -impl <'a, S: RangeList + ?Sized> RangeList for ShiftedRangeList<'a, S> { - fn iter(&self) -> impl Iterator> + Clone { +impl<'a, S: RangeList + ?Sized> RangeList for ShiftedRangeList<'a, S> { + fn iter(&self) -> impl Iterator> + Clone { self.src.iter().map(|r| { assert!( - r.start <= self.len && r.end <= self.len, + r.start <= self.len && r.end <= self.len, "{:?} is out of upper bound {}", - r, self.len + r, + self.len ); let beg = self.offset + r.start; let end = self.offset + r.end; @@ -62,14 +53,13 @@ impl <'a, S: RangeList + ?Sized> RangeList for ShiftedRangeList<'a, S> { } } - struct ScaledRangeList<'a, S: ?Sized> { src: &'a mut S, - factor: usize, + factor: usize } -impl <'a, S: RangeList + ?Sized> RangeList for ScaledRangeList<'a, S> { - fn iter(&self) -> impl Iterator> + Clone { +impl<'a, S: RangeList + ?Sized> RangeList for ScaledRangeList<'a, S> { + fn iter(&self) -> impl Iterator> + Clone { self.src.iter().map(|r| { let beg = r.start * self.factor; let end = r.end * self.factor; @@ -85,12 +75,11 @@ impl <'a, S: RangeList + ?Sized> RangeList for ScaledRangeList<'a, S> { fn scale(&mut self, factor: usize) -> impl RangeList { ScaledRangeList { src: self.src, - factor: self.factor * factor, + factor: self.factor * factor } } } - macro_rules! compute_span { ($this:ident) => { if let Some(span) = $this.span { @@ -103,19 +92,14 @@ macro_rules! compute_span { }; } - pub struct RangeListFromIterator { inner: I, span: Option } - -impl RangeListFromIterator { +impl RangeListFromIterator { pub fn new(inner: I) -> Self { - Self { - inner, - span: None - } + Self { inner, span: None } } pub fn with_size(inner: I, span: impl Into>) -> Self { @@ -126,10 +110,9 @@ impl RangeListFromIterator { } } - -impl > + Clone> RangeList for RangeListFromIterator { +impl> + Clone> RangeList for RangeListFromIterator { #[inline] - fn iter(&self) -> impl Iterator> + Clone { + fn iter(&self) -> impl Iterator> + Clone { self.inner.clone() } @@ -138,21 +121,21 @@ impl > + Clone> RangeList for RangeListFromIterato } } - pub struct MaterializedRangeList { ranges: Vec>, span: Option } - impl MaterializedRangeList { - pub fn from_iter(ranges: impl Iterator>) -> Self { + pub fn from_iter(ranges: impl Iterator>) -> Self { let mut span = 0; - - let ranges = ranges.map(|r| { - span += r.len(); - r.start as u32..r.end as u32 - }).collect(); + + let ranges = ranges + .map(|r| { + span += r.len(); + r.start as u32..r.end as u32 + }) + .collect(); Self { ranges, @@ -161,14 +144,13 @@ impl MaterializedRangeList { } } - impl RangeList for MaterializedRangeList { #[inline] - fn iter(&self) -> impl Iterator> + Clone { + fn iter(&self) -> impl Iterator> + Clone { self.ranges.iter().map(|r| r.start as usize..r.end as usize) } - + fn span(&mut self) -> usize { compute_span!(self) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/dense/mod.rs b/crates/array/src/io/dense/mod.rs index 27810f9e..b06963d7 100644 --- a/crates/array/src/io/dense/mod.rs +++ b/crates/array/src/io/dense/mod.rs @@ -1,6 +1,5 @@ mod reader; mod writer; - pub use reader::*; pub use writer::*; diff --git a/crates/array/src/io/dense/reader.rs b/crates/array/src/io/dense/reader.rs index 1c6a05d9..26098be2 100644 --- a/crates/array/src/io/dense/reader.rs +++ b/crates/array/src/io/dense/reader.rs @@ -1,48 +1,40 @@ -use crate::io::reader::IOReaderFactory; use anyhow::ensure; +use crate::io::reader::IOReaderFactory; const USIZE: usize = size_of::(); - pub struct DenseReader<'a> { data: &'a [u8], buffer_lengths: Vec, pos: usize } - impl<'a> DenseReader<'a> { pub fn new(data: &'a [u8]) -> anyhow::Result { - ensure!( - data.len() >= USIZE, - "dense file must have at least {} bytes", - USIZE - ); - - let num_buffers = u32::from_le_bytes( - data[data.len() - USIZE..].try_into()? - ) as usize; - + ensure!(data.len() >= USIZE, "dense file must have at least {} bytes", USIZE); + + let num_buffers = u32::from_le_bytes(data[data.len() - USIZE..].try_into()?) as usize; + let min_byte_size = num_buffers * USIZE + USIZE; ensure!( data.len() >= min_byte_size, "dense file of {} buffer(s) must have at least {} bytes", - num_buffers, + num_buffers, min_byte_size ); - - let buffer_lengths: Vec = (0..num_buffers).map(|i| { - let beg = data.len() - min_byte_size + i * USIZE; - let end = beg + USIZE; - u32::from_le_bytes( - data[beg..end].try_into().unwrap() - ) as usize - }).collect(); - + + let buffer_lengths: Vec = (0..num_buffers) + .map(|i| { + let beg = data.len() - min_byte_size + i * USIZE; + let end = beg + USIZE; + u32::from_le_bytes(data[beg..end].try_into().unwrap()) as usize + }) + .collect(); + let buffers_size: usize = buffer_lengths.iter().sum(); ensure!(buffers_size + min_byte_size == data.len(), "invalid file length"); - + Ok(Self { data, buffer_lengths, @@ -51,8 +43,7 @@ impl<'a> DenseReader<'a> { } } - -impl<'a> IOReaderFactory for DenseReader<'a> { +impl<'a> IOReaderFactory for DenseReader<'a> { type ByteReader = &'a [u8]; fn next_byte_reader(&mut self) -> anyhow::Result { @@ -62,4 +53,4 @@ impl<'a> IOReaderFactory for DenseReader<'a> { self.pos += 1; Ok(buf) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/dense/writer.rs b/crates/array/src/io/dense/writer.rs index 15acb948..7ac4dad8 100644 --- a/crates/array/src/io/dense/writer.rs +++ b/crates/array/src/io/dense/writer.rs @@ -1,15 +1,13 @@ -use crate::io::writer::IOWriterFactory; +use std::{cell::RefCell, io::Write, rc::Rc}; + use arrow_buffer::ToByteSlice; -use std::cell::RefCell; -use std::io::Write; -use std::rc::Rc; +use crate::io::writer::IOWriterFactory; pub struct DenseWriter { inner: Rc>> } - impl DenseWriter { pub fn new(write: W) -> Self { let inner = WriterInner { @@ -40,18 +38,14 @@ impl DenseWriter { } } - struct WriterInner { write: W, - buffers: Vec>, + buffers: Vec> } - impl WriterInner { fn finish(mut self) -> std::io::Result { - let buffer_lengths: Vec = self.buffers.iter() - .map(|b| b.len() as u32) - .collect(); + let buffer_lengths: Vec = self.buffers.iter().map(|b| b.len() as u32).collect(); for buf in self.buffers.into_iter() { self.write.write_all(&buf)?; @@ -63,14 +57,12 @@ impl WriterInner { } } - pub struct BufferWriter { write: Rc>>, buf: Vec, buffer_index: usize } - impl BufferWriter { pub fn finish(self) { let mut file = self.write.borrow_mut(); @@ -78,7 +70,6 @@ impl BufferWriter { } } - impl Write for BufferWriter { fn write(&mut self, buf: &[u8]) -> std::io::Result { self.buf.write(buf) @@ -93,11 +84,10 @@ impl Write for BufferWriter { } } - impl IOWriterFactory for DenseWriter { type Write = BufferWriter; fn next_write(&mut self) -> anyhow::Result { Ok(self.new_buffer()) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/file/byte_reader.rs b/crates/array/src/io/file/byte_reader.rs index 5daf4746..11ec4bb6 100644 --- a/crates/array/src/io/file/byte_reader.rs +++ b/crates/array/src/io/file/byte_reader.rs @@ -1,7 +1,6 @@ -use crate::io::file::shared_file::SharedFileRef; -use crate::io::reader::ByteReader; use anyhow::ensure; +use crate::io::{file::shared_file::SharedFileRef, reader::ByteReader}; pub struct FileByteReader { file: SharedFileRef, @@ -11,7 +10,6 @@ pub struct FileByteReader { buffered_len: usize } - impl FileByteReader { pub fn new(file: SharedFileRef) -> anyhow::Result { let len = file.len()?; @@ -23,7 +21,7 @@ impl FileByteReader { buffered_len: 0 }) } - + #[inline(never)] fn fill_buffer(&mut self, offset: usize) -> std::io::Result<()> { let len = self.file.read(offset, self.buf.as_mut_slice())?; @@ -33,7 +31,6 @@ impl FileByteReader { } } - impl ByteReader for FileByteReader { fn len(&self) -> usize { self.file_len @@ -41,19 +38,19 @@ impl ByteReader for FileByteReader { fn read(&mut self, offset: usize, len: usize) -> anyhow::Result<&[u8]> { ensure!(offset + len <= self.file_len, "out of bounds read"); - + if self.buffered_offset <= offset && offset < self.buffered_offset + self.buffered_len { let beg = offset - self.buffered_offset; let end = beg + std::cmp::min(len, self.buffered_len - beg); - return Ok(&self.buf[beg..end]) + return Ok(&self.buf[beg..end]); } if len == 0 { - return Ok(&[]) + return Ok(&[]); } - + self.fill_buffer(offset)?; - + Ok(&self.buf[0..std::cmp::min(len, self.buffered_len)]) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/file/mod.rs b/crates/array/src/io/file/mod.rs index 1810a306..cd5ef3cb 100644 --- a/crates/array/src/io/file/mod.rs +++ b/crates/array/src/io/file/mod.rs @@ -2,34 +2,24 @@ mod byte_reader; mod shared_file; mod writer; - -use crate::io::reader::IOReader; -use crate::reader::AnyReader; -use crate::util::get_num_buffers; use arrow::datatypes::DataType; use byte_reader::FileByteReader; use shared_file::SharedFileRef; - - pub use writer::*; +use crate::{io::reader::IOReader, reader::AnyReader, util::get_num_buffers}; pub type FileReader = IOReader; pub type ArrayFileReader = AnyReader; - pub struct ArrayFile { data_type: DataType, buffers: Vec } - impl ArrayFile { pub(self) fn new(data_type: DataType, buffers: Vec) -> Self { - Self { - data_type, - buffers - } + Self { data_type, buffers } } pub fn data_type(&self) -> &DataType { @@ -45,22 +35,16 @@ impl ArrayFile { }; AnyReader::from_factory(&mut factory, &self.data_type) } - + pub fn write(self) -> anyhow::Result { ArrayFileWriter::new(self.data_type, self.buffers) } - + pub fn new_temporary(data_type: DataType) -> anyhow::Result { - let buffers = - std::iter::repeat_with(|| { - tempfile::tempfile().map(SharedFileRef::new) - }) - .take(get_num_buffers(&data_type)) - .collect::, _>>()?; + let buffers = std::iter::repeat_with(|| tempfile::tempfile().map(SharedFileRef::new)) + .take(get_num_buffers(&data_type)) + .collect::, _>>()?; - Ok(Self { - data_type, - buffers - }) + Ok(Self { data_type, buffers }) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/file/shared_file.rs b/crates/array/src/io/file/shared_file.rs index cb1ac3ca..e6a4d499 100644 --- a/crates/array/src/io/file/shared_file.rs +++ b/crates/array/src/io/file/shared_file.rs @@ -1,30 +1,31 @@ -use parking_lot::Mutex; -use std::fs::File; -use std::io::{Read, Seek, SeekFrom}; -use std::sync::Arc; +use std::{ + fs::File, + io::{Read, Seek, SeekFrom}, + sync::Arc +}; +use parking_lot::Mutex; #[derive(Clone)] pub struct SharedFileRef { inner: Arc> } - impl SharedFileRef { pub fn new(file: File) -> Self { Self { inner: Arc::new(Mutex::new(SharedFile::new(file))) } } - + pub fn read(&self, offset: usize, buf: &mut [u8]) -> std::io::Result { self.inner.lock().read(offset, buf) } - + pub fn len(&self) -> std::io::Result { self.inner.lock().len() } - + pub fn into_file(self) -> File { Arc::into_inner(self.inner) .expect("this shared file is still in use by other readers") @@ -33,19 +34,14 @@ impl SharedFileRef { } } - pub struct SharedFile { inner: File, pos: Option } - impl SharedFile { pub fn new(file: File) -> Self { - Self { - inner: file, - pos: None - } + Self { inner: file, pos: None } } pub fn read(&mut self, offset: usize, buf: &mut [u8]) -> std::io::Result { @@ -66,4 +62,4 @@ impl SharedFile { pub fn into_file(self) -> File { self.inner } -} \ No newline at end of file +} diff --git a/crates/array/src/io/file/writer.rs b/crates/array/src/io/file/writer.rs index a0b9f419..03d8520b 100644 --- a/crates/array/src/io/file/writer.rs +++ b/crates/array/src/io/file/writer.rs @@ -1,25 +1,29 @@ -use crate::io::file::shared_file::SharedFileRef; -use crate::io::file::ArrayFile; -use crate::io::writer::IOWriter; -use crate::writer::{AnyArrayWriter, ArrayWriter, Writer}; +use std::{ + fs::File, + io::{BufWriter, Seek, SeekFrom, Write} +}; + use arrow::datatypes::DataType; -use std::fs::File; -use std::io::{BufWriter, Seek, SeekFrom, Write}; +use crate::{ + io::{ + file::{shared_file::SharedFileRef, ArrayFile}, + writer::IOWriter + }, + writer::{AnyArrayWriter, ArrayWriter, Writer} +}; pub type FileWriter = IOWriter>; - pub struct ArrayFileWriter { data_type: DataType, inner: AnyArrayWriter } - impl ArrayFileWriter { pub(super) fn new(data_type: DataType, buffers: Vec) -> anyhow::Result { let mut buffers = buffers.into_iter(); - + let mut factory = move || { let shared = buffers.next().expect("no more buffers left"); let mut file = shared.into_file(); @@ -27,28 +31,29 @@ impl ArrayFileWriter { file.set_len(0)?; Ok(BufWriter::new(file)) }; - + let inner = AnyArrayWriter::from_factory(&mut factory, &data_type)?; - - Ok(Self { - data_type, - inner - }) + + Ok(Self { data_type, inner }) } - + pub fn finish(self) -> anyhow::Result { - let buffers = self.inner.into_inner().into_iter().map(|buf| { - let mut buf_writer = IOWriter::finish_any_writer(buf)?; - buf_writer.flush()?; - let file = buf_writer.into_inner().expect("buffer was already flushed"); - Ok(SharedFileRef::new(file)) - }).collect::>>()?; - + let buffers = self + .inner + .into_inner() + .into_iter() + .map(|buf| { + let mut buf_writer = IOWriter::finish_any_writer(buf)?; + buf_writer.flush()?; + let file = buf_writer.into_inner().expect("buffer was already flushed"); + Ok(SharedFileRef::new(file)) + }) + .collect::>>()?; + Ok(ArrayFile::new(self.data_type, buffers)) } } - impl ArrayWriter for ArrayFileWriter { type Writer = FileWriter; @@ -71,4 +76,4 @@ impl ArrayWriter for ArrayFileWriter { fn offset(&mut self, buf: usize) -> &mut ::Offset { self.inner.offset(buf) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/mod.rs b/crates/array/src/io/mod.rs index caf355b3..fd6fedc3 100644 --- a/crates/array/src/io/mod.rs +++ b/crates/array/src/io/mod.rs @@ -1,4 +1,4 @@ pub mod dense; pub mod file; pub mod reader; -pub mod writer; \ No newline at end of file +pub mod writer; diff --git a/crates/array/src/io/reader/bitmask.rs b/crates/array/src/io/reader/bitmask.rs index 3d923183..e34bcb08 100644 --- a/crates/array/src/io/reader/bitmask.rs +++ b/crates/array/src/io/reader/bitmask.rs @@ -1,42 +1,32 @@ -use crate::io::reader::byte_reader::ByteReader; -use crate::reader::BitmaskReader; -use crate::writer::BitmaskWriter; use anyhow::{ensure, Context}; use arrow_buffer::bit_util; +use crate::{io::reader::byte_reader::ByteReader, reader::BitmaskReader, writer::BitmaskWriter}; pub struct BitmaskIOReader { byte_reader: R, len: usize } - -impl BitmaskIOReader { +impl BitmaskIOReader { pub fn new_unchecked(byte_reader: R, len: usize) -> Self { - Self { - byte_reader, - len - } + Self { byte_reader, len } } pub fn new(mut byte_reader: R) -> anyhow::Result { - let (byte_len, bit_len) = Self::read_length(&mut byte_reader).context( - "failed to read bitmask length" - )?; + let (byte_len, bit_len) = Self::read_length(&mut byte_reader).context("failed to read bitmask length")?; ensure!( byte_len == bit_util::ceil(bit_len, 8), "bitmask buffer has unexpected length" ); - Ok( - Self::new_unchecked(byte_reader, bit_len) - ) + Ok(Self::new_unchecked(byte_reader, bit_len)) } pub(super) fn read_length(byte_reader: &mut R) -> anyhow::Result<(usize, usize)> { const WORD: usize = size_of::(); - + ensure!(byte_reader.len() >= WORD); let byte_len = byte_reader.len() - WORD; @@ -48,30 +38,27 @@ impl BitmaskIOReader { Ok(()) })?; let bit_len = u32::from_le_bytes(bit_len_slice); - - Ok( - (byte_len, bit_len as usize) - ) + + Ok((byte_len, bit_len as usize)) } } - -impl BitmaskReader for BitmaskIOReader { +impl BitmaskReader for BitmaskIOReader { fn len(&self) -> usize { self.len } - + fn read_slice(&mut self, dst: &mut impl BitmaskWriter, offset: usize, mut len: usize) -> anyhow::Result<()> { ensure!(offset + len <= self.len); - + if len == 0 { - return Ok(()) + return Ok(()); } - + let byte_offset = offset / 8; let mut bit_offset = offset - byte_offset * 8; let byte_len = bit_util::ceil(len + bit_offset, 8); - + self.byte_reader.read_exact(byte_offset, byte_len, |data| { let bits_to_write = std::cmp::min(data.len() * 8 - bit_offset, len); dst.write_slice(data, bit_offset, bits_to_write)?; @@ -79,9 +66,9 @@ impl BitmaskReader for BitmaskIOReader { bit_offset = 0; Ok(()) })?; - + debug_assert_eq!(len, 0, "got unexpected number of bytes from .read_exact()"); - + Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/reader/byte_reader.rs b/crates/array/src/io/reader/byte_reader.rs index 98056ba5..7766b53d 100644 --- a/crates/array/src/io/reader/byte_reader.rs +++ b/crates/array/src/io/reader/byte_reader.rs @@ -1,6 +1,6 @@ -use anyhow::ensure; use std::io::{BufRead, BufReader, Read, Seek, SeekFrom}; +use anyhow::ensure; pub trait ByteReader { fn len(&self) -> usize; @@ -12,8 +12,7 @@ pub trait ByteReader { mut offset: usize, len: usize, mut cb: impl FnMut(&[u8]) -> anyhow::Result<()> - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { let end = offset + len; while offset < end { let bytes = self.read(offset, end - offset)?; @@ -25,7 +24,6 @@ pub trait ByteReader { } } - impl<'a> ByteReader for &'a [u8] { fn len(&self) -> usize { (*self).len() @@ -41,22 +39,19 @@ impl<'a> ByteReader for &'a [u8] { offset: usize, len: usize, mut cb: impl FnMut(&[u8]) -> anyhow::Result<()> - ) -> anyhow::Result<()> - { - let bytes = &self[offset..offset + len]; + ) -> anyhow::Result<()> { + let bytes = &self[offset..offset + len]; cb(bytes) } } - pub struct IOByteReader { read: BufReader, len: usize, pos: Option } - -impl IOByteReader { +impl IOByteReader { pub fn new(len: usize, read: R) -> Self { Self { read: BufReader::new(read), @@ -66,8 +61,7 @@ impl IOByteReader { } } - -impl ByteReader for IOByteReader { +impl ByteReader for IOByteReader { fn len(&self) -> usize { self.len } @@ -76,7 +70,7 @@ impl ByteReader for IOByteReader { ensure!(offset + len <= self.len, "out of bounds read"); if len == 0 { - return Ok(&[]) + return Ok(&[]); } if let Some(pos) = self.pos { @@ -89,7 +83,7 @@ impl ByteReader for IOByteReader { let bytes = self.read.fill_buf()?; ensure!(bytes.len() > 0, "reached EOF"); - + let take = std::cmp::min(len, bytes.len()); self.pos = Some(offset); diff --git a/crates/array/src/io/reader/mod.rs b/crates/array/src/io/reader/mod.rs index 288aa5d5..711dccac 100644 --- a/crates/array/src/io/reader/mod.rs +++ b/crates/array/src/io/reader/mod.rs @@ -1,43 +1,39 @@ -use crate::reader::{Reader, ReaderFactory}; -use arrow_buffer::ArrowNativeType; use std::marker::PhantomData; +use arrow_buffer::ArrowNativeType; + +use crate::reader::{Reader, ReaderFactory}; -mod byte_reader; mod bitmask; +mod byte_reader; mod native; mod nullmask; mod offsets; - pub use bitmask::*; pub use byte_reader::*; pub use native::*; pub use nullmask::*; pub use offsets::*; - pub struct IOReader { byte_reader: PhantomData } - -impl Reader for IOReader { +impl Reader for IOReader { type Nullmask = NullmaskIOReader; type Bitmask = BitmaskIOReader; type Native = NativeIOReader; type Offset = OffsetsIOReader; -} - +} pub trait IOReaderFactory { type ByteReader: ByteReader; - + fn next_byte_reader(&mut self) -> anyhow::Result; } - -impl ReaderFactory for F { +impl ReaderFactory for F { type Reader = IOReader; fn nullmask(&mut self) -> anyhow::Result<::Nullmask> { @@ -61,11 +57,10 @@ impl ReaderFactory for F { } } - -impl anyhow::Result> IOReaderFactory for F { +impl anyhow::Result> IOReaderFactory for F { type ByteReader = R; fn next_byte_reader(&mut self) -> anyhow::Result { self() } -} \ No newline at end of file +} diff --git a/crates/array/src/io/reader/native.rs b/crates/array/src/io/reader/native.rs index 001637d7..7e9c946c 100644 --- a/crates/array/src/io/reader/native.rs +++ b/crates/array/src/io/reader/native.rs @@ -1,23 +1,20 @@ use anyhow::ensure; -use crate::io::reader::byte_reader::ByteReader; -use crate::reader::NativeReader; -use crate::writer::NativeWriter; +use crate::{io::reader::byte_reader::ByteReader, reader::NativeReader, writer::NativeWriter}; pub struct NativeIOReader { byte_reader: R, value_size: usize } - -impl NativeIOReader { +impl NativeIOReader { pub fn new_unchecked(byte_reader: R, value_size: usize) -> Self { Self { byte_reader, value_size } } - + pub fn new(byte_reader: R, value_size: usize) -> anyhow::Result { ensure!( byte_reader.len() % value_size == 0, @@ -32,21 +29,15 @@ impl NativeIOReader { } } - -impl NativeReader for NativeIOReader { +impl NativeReader for NativeIOReader { fn len(&self) -> usize { self.byte_reader.len() / self.value_size } - fn read_slice( - &mut self, - dst: &mut impl NativeWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()> - { - self.byte_reader.read_exact(offset * self.value_size, len * self.value_size, |bytes| { - dst.write_slice(bytes) - }) + fn read_slice(&mut self, dst: &mut impl NativeWriter, offset: usize, len: usize) -> anyhow::Result<()> { + self.byte_reader + .read_exact(offset * self.value_size, len * self.value_size, |bytes| { + dst.write_slice(bytes) + }) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/reader/nullmask.rs b/crates/array/src/io/reader/nullmask.rs index df219b3c..56828235 100644 --- a/crates/array/src/io/reader/nullmask.rs +++ b/crates/array/src/io/reader/nullmask.rs @@ -1,18 +1,18 @@ -use crate::io::reader::bitmask::BitmaskIOReader; -use crate::io::reader::byte_reader::ByteReader; -use crate::reader::BitmaskReader; -use crate::writer::BitmaskWriter; use anyhow::ensure; use arrow_buffer::bit_util; +use crate::{ + io::reader::{bitmask::BitmaskIOReader, byte_reader::ByteReader}, + reader::BitmaskReader, + writer::BitmaskWriter +}; pub struct NullmaskIOReader { bitmask: Option>, len: usize } - -impl NullmaskIOReader { +impl NullmaskIOReader { pub fn new(mut byte_reader: R) -> anyhow::Result { let (byte_len, bit_len) = BitmaskIOReader::::read_length(&mut byte_reader)?; if byte_len == 0 { @@ -33,10 +33,7 @@ impl NullmaskIOReader { } pub fn new_empty(len: usize) -> Self { - Self { - bitmask: None, - len - } + Self { bitmask: None, len } } pub fn from_bitmask(bitmask: BitmaskIOReader) -> Self { @@ -47,24 +44,17 @@ impl NullmaskIOReader { } } - -impl BitmaskReader for NullmaskIOReader { +impl BitmaskReader for NullmaskIOReader { fn len(&self) -> usize { self.len } - fn read_slice( - &mut self, - dst: &mut impl BitmaskWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()> - { + fn read_slice(&mut self, dst: &mut impl BitmaskWriter, offset: usize, len: usize) -> anyhow::Result<()> { if let Some(bitmask) = self.bitmask.as_mut() { bitmask.read_slice(dst, offset, len) } else { ensure!(offset + len <= self.len); dst.write_many(true, len) - } + } } -} \ No newline at end of file +} diff --git a/crates/array/src/io/reader/offsets.rs b/crates/array/src/io/reader/offsets.rs index a8b6961f..59c501b1 100644 --- a/crates/array/src/io/reader/offsets.rs +++ b/crates/array/src/io/reader/offsets.rs @@ -1,22 +1,18 @@ -use crate::io::reader::byte_reader::ByteReader; -use crate::offsets::Offsets; -use crate::reader::OffsetsReader; -use crate::writer::OffsetsWriter; +use std::ops::Range; + use anyhow::{anyhow, ensure}; use arrow_buffer::MutableBuffer; -use std::ops::Range; +use crate::{io::reader::byte_reader::ByteReader, offsets::Offsets, reader::OffsetsReader, writer::OffsetsWriter}; const OS: usize = size_of::(); - pub struct OffsetsIOReader { byte_reader: R, len: usize, buf: MutableBuffer } - impl OffsetsIOReader { pub fn new(byte_reader: R) -> anyhow::Result { let len = byte_reader.len(); @@ -36,11 +32,9 @@ impl OffsetsIOReader { fn buffered_offsets(&self) -> &[i32] { let len = self.buf.len() / OS; - unsafe { - std::slice::from_raw_parts(self.buf.as_ptr().cast(), len) - } + unsafe { std::slice::from_raw_parts(self.buf.as_ptr().cast(), len) } } - + fn read(&mut self, byte_offset: &mut usize, byte_len: &mut usize) -> anyhow::Result<()> { let buf = self.byte_reader.read(*byte_offset, *byte_len)?; self.buf.extend_from_slice(buf); @@ -48,7 +42,7 @@ impl OffsetsIOReader { *byte_len -= buf.len(); Ok(()) } - + fn shift(&mut self) { let len = self.buf.len() / OS; if len > 1 { @@ -60,32 +54,25 @@ impl OffsetsIOReader { } } - -impl OffsetsReader for OffsetsIOReader { +impl OffsetsReader for OffsetsIOReader { fn len(&self) -> usize { self.len } - - fn read_slice( - &mut self, - dst: &mut impl OffsetsWriter, - offset: usize, - len: usize - ) -> anyhow::Result> - { + + fn read_slice(&mut self, dst: &mut impl OffsetsWriter, offset: usize, len: usize) -> anyhow::Result> { ensure!(offset + len <= self.len); self.buf.clear(); - + let mut byte_offset = offset * OS; let mut byte_len = (len + 1) * OS; - + loop { self.read(&mut byte_offset, &mut byte_len)?; if self.buf.len() >= OS { - break + break; } } - + let first_offset = self.buffered_offsets()[0] as usize; while byte_len > 0 { @@ -100,7 +87,7 @@ impl OffsetsReader for OffsetsIOReader { let offsets = Offsets::try_new(self.buffered_offsets()).map_err(|msg| anyhow!(msg))?; dst.write_slice(offsets)?; - + Ok(first_offset..offsets.last_index()) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/writer/bitmask.rs b/crates/array/src/io/writer/bitmask.rs index fe2ff427..6cd74e42 100644 --- a/crates/array/src/io/writer/bitmask.rs +++ b/crates/array/src/io/writer/bitmask.rs @@ -1,9 +1,8 @@ -use crate::index::RangeList; -use crate::writer::BitmaskWriter; -use arrow_buffer::bit_chunk_iterator::BitChunks; -use arrow_buffer::{bit_util, ToByteSlice}; use std::io::Write; +use arrow_buffer::{bit_chunk_iterator::BitChunks, bit_util, ToByteSlice}; + +use crate::{index::RangeList, writer::BitmaskWriter}; pub struct BitmaskIOWriter { write: W, @@ -12,8 +11,7 @@ pub struct BitmaskIOWriter { len: usize } - -impl BitmaskIOWriter { +impl BitmaskIOWriter { pub fn new(writer: W) -> Self { Self { write: writer, @@ -22,24 +20,23 @@ impl BitmaskIOWriter { len: 0 } } - + pub fn into_write(self) -> W { self.write } - + #[inline] fn buf_mut_prt(&mut self) -> *mut u8 { std::ptr::from_mut(&mut self.buf).cast() } } - -impl BitmaskWriter for BitmaskIOWriter { +impl BitmaskWriter for BitmaskIOWriter { fn write_slice(&mut self, data: &[u8], mut offset: usize, mut len: usize) -> anyhow::Result<()> { assert!(data.len() >= bit_util::ceil(offset + len, 8)); - + self.len += len; - + if self.buf_len > 0 { let to_set = std::cmp::min(64 - self.buf_len, len); unsafe { @@ -51,33 +48,28 @@ impl BitmaskWriter for BitmaskIOWriter { self.buf = 0; self.buf_len = 0; } else { - return Ok(()) + return Ok(()); } offset += to_set; len -= to_set; } if len == 0 { - return Ok(()) + return Ok(()); } let bit_chunks = BitChunks::new(data, offset, len); for chunk in bit_chunks.iter() { self.write.write_all(chunk.to_byte_slice())?; } - + self.buf = bit_chunks.remainder_bits(); self.buf_len = bit_chunks.remainder_len(); - + Ok(()) } - fn write_slice_indexes( - &mut self, - data: &[u8], - mut indexes: impl Iterator - ) -> anyhow::Result<()> - { + fn write_slice_indexes(&mut self, data: &[u8], mut indexes: impl Iterator) -> anyhow::Result<()> { loop { while self.buf_len < 64 { if let Some(i) = indexes.next() { @@ -89,7 +81,7 @@ impl BitmaskWriter for BitmaskIOWriter { self.buf_len += 1; self.len += 1; } else { - return Ok(()) + return Ok(()); } } self.write.write_all(self.buf.to_byte_slice())?; @@ -107,18 +99,18 @@ impl BitmaskWriter for BitmaskIOWriter { fn write_many(&mut self, val: bool, mut count: usize) -> anyhow::Result<()> { self.len += count; - + let ones: u64 = !0; - + if self.buf_len > 0 { let to_set = std::cmp::min(64 - self.buf_len, count); let new_len = self.buf_len + to_set; - + if val { self.buf |= ones << self.buf_len; self.buf &= ones >> (64 - new_len); } - + if new_len == 64 { self.write.write_all(self.buf.to_byte_slice())?; self.buf = 0; @@ -126,7 +118,7 @@ impl BitmaskWriter for BitmaskIOWriter { count -= to_set; } else { self.buf_len = new_len; - return Ok(()) + return Ok(()); } } @@ -146,28 +138,24 @@ impl BitmaskWriter for BitmaskIOWriter { } self.buf_len = count; } - + Ok(()) } } - -impl BitmaskIOWriter { +impl BitmaskIOWriter { pub fn finish(mut self) -> anyhow::Result { if self.buf_len > 0 { let byte_len = bit_util::ceil(self.buf_len, 8); self.write.write_all(&self.buf.to_byte_slice()[0..byte_len])?; } - self.write.write_all( - (self.len as u32).to_byte_slice() - )?; + self.write.write_all((self.len as u32).to_byte_slice())?; Ok(self.write) } } - unsafe fn set_bits_slow(dst: *mut u8, dst_offset: usize, data: *const u8, offset: usize, len: usize) { for i in 0..len { if bit_util::get_bit_raw(data, offset + i) { @@ -176,20 +164,15 @@ unsafe fn set_bits_slow(dst: *mut u8, dst_offset: usize, data: *const u8, offset } } - #[cfg(test)] mod test { use arrow_buffer::BooleanBufferBuilder; use proptest::prelude::*; - use crate::io::writer::BitmaskIOWriter; - use crate::writer::BitmaskWriter; + use crate::{io::writer::BitmaskIOWriter, writer::BitmaskWriter}; fn arb_write_many() -> impl Strategy> { - prop::collection::vec( - (any::(), 0..100usize), - 0..100 - ) + prop::collection::vec((any::(), 0..100usize), 0..100) } fn arb_write_slice() -> impl Strategy, Vec<(usize, usize)>)> { @@ -239,8 +222,8 @@ mod test { builder.append_packed_range(offset..offset + len, &case.0); } let ref_buf = builder.finish(); - + assert_eq!(&buf[0..buf.len() - 4], ref_buf.values()); } } -} \ No newline at end of file +} diff --git a/crates/array/src/io/writer/mod.rs b/crates/array/src/io/writer/mod.rs index 17debb15..e2b706e6 100644 --- a/crates/array/src/io/writer/mod.rs +++ b/crates/array/src/io/writer/mod.rs @@ -1,26 +1,23 @@ -use crate::writer::{AnyWriter, Writer, WriterFactory}; +use std::{io::Write, marker::PhantomData}; + use arrow_buffer::ArrowNativeType; -use std::io::Write; -use std::marker::PhantomData; +use crate::writer::{AnyWriter, Writer, WriterFactory}; mod bitmask; mod native; mod nullmask; mod offsets; - pub use bitmask::*; pub use native::*; pub use nullmask::*; pub use offsets::*; - pub struct IOWriter { phantom_data: PhantomData } - impl Writer for IOWriter { type Bitmask = BitmaskIOWriter; type Nullmask = NullmaskIOWriter; @@ -28,14 +25,12 @@ impl Writer for IOWriter { type Offset = OffsetsIOWriter; } - pub trait IOWriterFactory { type Write: Write; - + fn next_write(&mut self) -> anyhow::Result; } - impl WriterFactory for F { type Writer = IOWriter; @@ -60,8 +55,7 @@ impl WriterFactory for F { } } - -impl anyhow::Result> IOWriterFactory for F { +impl anyhow::Result> IOWriterFactory for F { type Write = W; fn next_write(&mut self) -> anyhow::Result { @@ -69,14 +63,13 @@ impl anyhow::Result> IOWriterFactory for F { } } - impl IOWriter { pub fn finish_any_writer(writer: AnyWriter>) -> anyhow::Result { Ok(match writer { AnyWriter::Bitmask(w) => w.finish()?, AnyWriter::Nullmask(w) => w.finish()?, AnyWriter::Native(w) => w.into_write(), - AnyWriter::Offsets(w) => w.finish()?, + AnyWriter::Offsets(w) => w.finish()? }) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/writer/native.rs b/crates/array/src/io/writer/native.rs index 0c1b8995..d4d1cce2 100644 --- a/crates/array/src/io/writer/native.rs +++ b/crates/array/src/io/writer/native.rs @@ -1,15 +1,14 @@ -use crate::index::RangeList; -use crate::writer::NativeWriter; -use arrow_buffer::{ArrowNativeType, ToByteSlice}; use std::io::Write; +use arrow_buffer::{ArrowNativeType, ToByteSlice}; + +use crate::{index::RangeList, writer::NativeWriter}; pub struct NativeIOWriter { write: W } - -impl NativeIOWriter { +impl NativeIOWriter { pub fn new(write: W) -> Self { Self { write } } @@ -19,8 +18,7 @@ impl NativeIOWriter { } } - -impl NativeWriter for NativeIOWriter { +impl NativeWriter for NativeIOWriter { #[inline] fn write(&mut self, value: T) -> anyhow::Result<()> { self.write.write_all(value.to_byte_slice())?; @@ -28,7 +26,7 @@ impl NativeWriter for NativeIOWriter { } #[inline] - fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()> { + fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()> { for v in values { self.write.write_all(v.to_byte_slice())? } @@ -43,9 +41,9 @@ impl NativeWriter for NativeIOWriter { #[inline] fn write_slice_indexes( - &mut self, - values: &[T], - indexes: impl Iterator + &mut self, + values: &[T], + indexes: impl Iterator ) -> anyhow::Result<()> { for i in indexes { self.write(values[i])?; @@ -55,8 +53,8 @@ impl NativeWriter for NativeIOWriter { #[inline] fn write_slice_ranges( - &mut self, - values: &[T], + &mut self, + values: &[T], ranges: &mut impl RangeList ) -> anyhow::Result<()> { for r in ranges.iter() { @@ -64,4 +62,4 @@ impl NativeWriter for NativeIOWriter { } Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/io/writer/nullmask.rs b/crates/array/src/io/writer/nullmask.rs index 1f5f1260..df54884c 100644 --- a/crates/array/src/io/writer/nullmask.rs +++ b/crates/array/src/io/writer/nullmask.rs @@ -1,24 +1,21 @@ -use crate::index::RangeList; -use crate::io::writer::bitmask::BitmaskIOWriter; -use crate::util::bit_tools; -use crate::writer::BitmaskWriter; -use arrow_buffer::ToByteSlice; use std::io::Write; +use arrow_buffer::ToByteSlice; + +use crate::{index::RangeList, io::writer::bitmask::BitmaskIOWriter, util::bit_tools, writer::BitmaskWriter}; pub struct NullmaskIOWriter { nulls: BitmaskIOWriter, has_nulls: bool, - len: usize, + len: usize } - -impl NullmaskIOWriter { +impl NullmaskIOWriter { pub fn new(writer: W) -> Self { Self { nulls: BitmaskIOWriter::new(writer), has_nulls: false, - len: 0, + len: 0 } } @@ -31,15 +28,15 @@ impl NullmaskIOWriter { Ok(write) } } - + #[inline] fn check_bitmask_presence(&mut self, all_valid: impl FnOnce() -> Option) -> anyhow::Result { if self.has_nulls { - return Ok(true) + return Ok(true); } if let Some(len) = all_valid() { self.len += len; - return Ok(false) + return Ok(false); } self.has_nulls = true; self.nulls.write_many(true, self.len)?; @@ -47,7 +44,6 @@ impl NullmaskIOWriter { } } - impl BitmaskWriter for NullmaskIOWriter { fn write_slice(&mut self, data: &[u8], offset: usize, len: usize) -> anyhow::Result<()> { if self.check_bitmask_presence(|| bit_tools::all_valid(data, offset, len).then_some(len))? { @@ -56,12 +52,7 @@ impl BitmaskWriter for NullmaskIOWriter { Ok(()) } - fn write_slice_indexes( - &mut self, - data: &[u8], - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()> { if self.check_bitmask_presence(|| bit_tools::all_indexes_valid(data, indexes.clone()))? { self.nulls.write_slice_indexes(data, indexes)?; } @@ -77,14 +68,14 @@ impl BitmaskWriter for NullmaskIOWriter { fn write_many(&mut self, val: bool, count: usize) -> anyhow::Result<()> { if count == 0 { - return Ok(()) + return Ok(()); } match (self.has_nulls, val) { (true, val) => self.nulls.write_many(val, count), (false, true) => { self.len += count; Ok(()) - }, + } (false, false) => { self.has_nulls = true; self.nulls.write_many(true, self.len)?; @@ -92,4 +83,4 @@ impl BitmaskWriter for NullmaskIOWriter { } } } -} \ No newline at end of file +} diff --git a/crates/array/src/io/writer/offsets.rs b/crates/array/src/io/writer/offsets.rs index c956fc35..b1c08fd2 100644 --- a/crates/array/src/io/writer/offsets.rs +++ b/crates/array/src/io/writer/offsets.rs @@ -1,9 +1,8 @@ -use crate::offsets::Offsets; -use crate::writer::OffsetsWriter; -use arrow_buffer::ToByteSlice; use std::io::Write; -use crate::index::RangeList; +use arrow_buffer::ToByteSlice; + +use crate::{index::RangeList, offsets::Offsets, writer::OffsetsWriter}; pub struct OffsetsIOWriter { write: W, @@ -11,8 +10,7 @@ pub struct OffsetsIOWriter { first_offset: bool } - -impl OffsetsIOWriter { +impl OffsetsIOWriter { pub fn new(write: W) -> Self { Self { write, @@ -20,7 +18,7 @@ impl OffsetsIOWriter { first_offset: true } } - + pub fn finish(mut self) -> anyhow::Result { self.write_first_offset()?; Ok(self.write) @@ -34,48 +32,51 @@ impl OffsetsIOWriter { } Ok(()) } - + #[inline] fn write_slice(&mut self, offsets: Offsets<'_>) -> anyhow::Result<()> { let beg = offsets.first_offset(); let last_offset = self.last_offset; - + for offset in offsets.values()[1..].iter().copied() { let val = offset - beg + last_offset; self.write.write_all(val.to_byte_slice())?; self.last_offset = val } - + Ok(()) } } - -impl OffsetsWriter for OffsetsIOWriter { +impl OffsetsWriter for OffsetsIOWriter { fn write_slice(&mut self, offsets: Offsets<'_>) -> anyhow::Result<()> { self.write_first_offset()?; self.write_slice(offsets) } - fn write_slice_indexes(&mut self, offsets: Offsets<'_>, indexes: impl Iterator) -> anyhow::Result<()> { + fn write_slice_indexes( + &mut self, + offsets: Offsets<'_>, + indexes: impl Iterator + ) -> anyhow::Result<()> { self.write_first_offset()?; - + for i in indexes { let len = offsets.values()[i + 1] - offsets.values()[i]; self.last_offset += len; self.write.write_all(self.last_offset.to_byte_slice())?; } - + Ok(()) } fn write_slice_ranges(&mut self, offsets: Offsets<'_>, ranges: &mut impl RangeList) -> anyhow::Result<()> { self.write_first_offset()?; - + for r in ranges.iter() { self.write_slice(offsets.slice_by_range(r))? } - + Ok(()) } @@ -86,4 +87,4 @@ impl OffsetsWriter for OffsetsIOWriter { self.write.write_all(self.last_offset.to_byte_slice())?; Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/item_index_cast.rs b/crates/array/src/item_index_cast.rs index 72c7f237..720ccbcc 100644 --- a/crates/array/src/item_index_cast.rs +++ b/crates/array/src/item_index_cast.rs @@ -1,53 +1,43 @@ -use crate::offsets::Offsets; -use crate::slice::nullmask::NullmaskSlice; -use crate::slice::{AnyListItem, AnySlice, ListSlice, PrimitiveSlice, Slice}; -use crate::writer::{ArrayWriter, NativeWriter, OffsetsWriter}; +use std::{fmt::Debug, sync::Arc}; + use anyhow::{anyhow, bail}; use arrow::datatypes::{DataType, Field}; use arrow_buffer::ArrowNativeType; -use std::fmt::Debug; -use std::sync::Arc; +use crate::{ + offsets::Offsets, + slice::{nullmask::NullmaskSlice, AnyListItem, AnySlice, ListSlice, PrimitiveSlice, Slice}, + writer::{ArrayWriter, NativeWriter, OffsetsWriter} +}; pub fn is_item_index(data_type: &DataType) -> bool { match data_type { DataType::UInt16 => true, DataType::UInt32 => true, DataType::UInt64 => true, - DataType::List(f) => { - match f.data_type() { - DataType::UInt16 => true, - DataType::UInt32 => true, - _ => false - } + DataType::List(f) => match f.data_type() { + DataType::UInt16 => true, + DataType::UInt32 => true, + _ => false }, _ => false } } - pub fn common_item_index_type(a: &DataType, b: &DataType) -> Option { - match (a, b) { - (DataType::List(ai), DataType::List(bi)) => { - common_index(ai.data_type(), bi.data_type()).map(|t| { - let field = Field::new( - ai.name(), - t, - ai.is_nullable() || bi.is_nullable() - ); - DataType::List(Arc::new(field)) - }) - }, + match (a, b) { + (DataType::List(ai), DataType::List(bi)) => common_index(ai.data_type(), bi.data_type()).map(|t| { + let field = Field::new(ai.name(), t, ai.is_nullable() || bi.is_nullable()); + DataType::List(Arc::new(field)) + }), (a, b) => common_index(a, b) } } - fn common_index(a: &DataType, b: &DataType) -> Option { common_index_inner(a, b).or_else(|| common_index_inner(b, a)) } - fn common_index_inner(a: &DataType, b: &DataType) -> Option { match (a, b) { (DataType::UInt16, DataType::UInt16) => Some(DataType::UInt16), @@ -59,20 +49,13 @@ fn common_index_inner(a: &DataType, b: &DataType) -> Option { } } - macro_rules! invalid_cast { ($from:expr, $to:expr) => { bail!("invalid index cast from {} to {}", $from, $to) }; } - -pub fn cast_item_index( - src: &AnySlice<'_>, - target_type: &DataType, - dst: &mut impl ArrayWriter -) -> anyhow::Result<()> -{ +pub fn cast_item_index(src: &AnySlice<'_>, target_type: &DataType, dst: &mut impl ArrayWriter) -> anyhow::Result<()> { match src { AnySlice::UInt16(s) => match target_type { DataType::UInt32 => cast_primitive::<_, u32>(s, dst), @@ -93,32 +76,25 @@ pub fn cast_item_index( } } - -fn cast_primitive( - src: &PrimitiveSlice<'_, S>, - dst: &mut impl ArrayWriter -) -> anyhow::Result<()> +fn cast_primitive(src: &PrimitiveSlice<'_, S>, dst: &mut impl ArrayWriter) -> anyhow::Result<()> where S: ArrowNativeType, T: ArrowNativeType, T: TryFrom { src.nulls().write(dst.nullmask(0))?; - + let mut conversion_error: Option = None; - - dst.native(1).write_iter( - src.values().iter().map(|v| { - match T::try_from(*v) { - Ok(value) => value, - Err(err) => { - conversion_error = Some(err); - T::default() - } + + dst.native(1) + .write_iter(src.values().iter().map(|v| match T::try_from(*v) { + Ok(value) => value, + Err(err) => { + conversion_error = Some(err); + T::default() } - }) - )?; - + }))?; + if let Some(err) = conversion_error { Err(anyhow!("conversion error: {:?}", err)) } else { @@ -126,60 +102,44 @@ where } } - fn cast_address( src: &ListSlice<'_, AnyListItem<'_>>, target_type: &DataType, dst: &mut impl ArrayWriter -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { match src.values().item() { AnySlice::UInt16(items) => match target_type { DataType::List(f) if f.data_type() == &DataType::UInt32 => { - cast_address_impl::<_, u32>( - src.nulls(), - src.offsets(), - items, - dst - ) - }, + cast_address_impl::<_, u32>(src.nulls(), src.offsets(), items, dst) + } _ => invalid_cast!("u16 list", target_type) }, AnySlice::UInt32(items) => match target_type { DataType::List(f) if f.data_type() == &DataType::UInt16 => { - cast_address_impl::<_, u16>( - src.nulls(), - src.offsets(), - items, - dst - ) - }, + cast_address_impl::<_, u16>(src.nulls(), src.offsets(), items, dst) + } _ => invalid_cast!("u32 list", target_type) }, _ => bail!("src is not an index slice") } } - fn cast_address_impl( nulls: NullmaskSlice<'_>, offsets: Offsets<'_>, - items: PrimitiveSlice<'_, S>, + items: PrimitiveSlice<'_, S>, dst: &mut impl ArrayWriter -) -> anyhow::Result<()> +) -> anyhow::Result<()> where S: ArrowNativeType, T: ArrowNativeType, T: TryFrom { nulls.write(dst.nullmask(0))?; - + dst.offset(1).write_slice(offsets)?; let item_range = offsets.range(); - - cast_primitive::( - &items.slice(item_range.start, item_range.len()), - &mut dst.shift(2) - ) -} \ No newline at end of file + + cast_primitive::(&items.slice(item_range.start, item_range.len()), &mut dst.shift(2)) +} diff --git a/crates/array/src/offsets.rs b/crates/array/src/offsets.rs index 4322ebe7..b4e11e14 100644 --- a/crates/array/src/offsets.rs +++ b/crates/array/src/offsets.rs @@ -1,24 +1,22 @@ -use crate::util::validate_offsets; -use arrow_buffer::OffsetBuffer; use std::ops::Range; +use arrow_buffer::OffsetBuffer; + +use crate::util::validate_offsets; #[derive(Copy, Clone)] pub struct Offsets<'a> { offsets: &'a [i32] } - -impl <'a> Offsets<'a> { +impl<'a> Offsets<'a> { pub fn new(offsets: &'a [i32]) -> Self { Self::try_new(offsets).unwrap() } - + pub fn try_new(offsets: &'a [i32]) -> Result { validate_offsets(offsets, 0)?; - Ok(Self { - offsets - }) + Ok(Self { offsets }) } #[inline] @@ -55,7 +53,7 @@ impl <'a> Offsets<'a> { pub fn last_offset(&self) -> i32 { self.offsets[self.len()] } - + #[inline] pub fn first_index(&self) -> usize { self.first_offset() as usize @@ -65,10 +63,10 @@ impl <'a> Offsets<'a> { pub fn last_index(&self) -> usize { self.last_offset() as usize } - + #[inline] pub fn range(&self) -> Range { - self.first_index().. self.last_index() + self.first_index()..self.last_index() } #[inline] @@ -86,11 +84,8 @@ impl <'a> Offsets<'a> { } } - -impl <'a> From<&'a OffsetBuffer> for Offsets<'a> { +impl<'a> From<&'a OffsetBuffer> for Offsets<'a> { fn from(value: &'a OffsetBuffer) -> Self { - Self { - offsets: value.inner() - } + Self { offsets: value.inner() } } -} \ No newline at end of file +} diff --git a/crates/array/src/reader/any.rs b/crates/array/src/reader/any.rs index ef905c38..9d9de1f9 100644 --- a/crates/array/src/reader/any.rs +++ b/crates/array/src/reader/any.rs @@ -1,17 +1,26 @@ -use crate::chunking::ChunkRange; -use crate::reader::native::{ChunkedNativeArrayReader, NativeArrayReader}; -use crate::reader::{ArrayReader, BinaryReader, BooleanReader, ChunkedArrayReader, ChunkedBinaryReader, ChunkedBooleanReader, ChunkedFixedSizeBinaryReader, ChunkedFixedSizeListReader, ChunkedListReader, ChunkedPrimitiveReader, ChunkedStructReader, FixedSizeBinaryReader, FixedSizeListReader, ListReader, PrimitiveReader, Reader, ReaderFactory, StructReader}; -use crate::visitor::DataTypeVisitor; -use crate::writer::ArrayWriter; -use arrow::array::ArrowPrimitiveType; -use arrow::datatypes::{DataType, FieldRef}; use std::marker::PhantomData; +use arrow::{ + array::ArrowPrimitiveType, + datatypes::{DataType, FieldRef} +}; + +use crate::{ + chunking::ChunkRange, + reader::{ + native::{ChunkedNativeArrayReader, NativeArrayReader}, + ArrayReader, BinaryReader, BooleanReader, ChunkedArrayReader, ChunkedBinaryReader, ChunkedBooleanReader, + ChunkedFixedSizeBinaryReader, ChunkedFixedSizeListReader, ChunkedListReader, ChunkedPrimitiveReader, + ChunkedStructReader, FixedSizeBinaryReader, FixedSizeListReader, ListReader, PrimitiveReader, Reader, + ReaderFactory, StructReader + }, + visitor::DataTypeVisitor, + writer::ArrayWriter +}; pub type AnyListReader = ListReader>; pub type AnyFixedSizeListReader = FixedSizeListReader>; - pub enum AnyReader { Boolean(BooleanReader), Primitive(PrimitiveReader), @@ -22,18 +31,11 @@ pub enum AnyReader { Struct(StructReader) } - -impl AnyReader { - pub fn from_factory( - factory: &mut impl ReaderFactory, - data_type: &DataType - ) -> anyhow::Result - { - AnyReaderFactory { - factory - }.visit(data_type) +impl AnyReader { + pub fn from_factory(factory: &mut impl ReaderFactory, data_type: &DataType) -> anyhow::Result { + AnyReaderFactory { factory }.visit(data_type) } - + #[inline] pub fn as_boolean(&mut self) -> &mut BooleanReader { match self { @@ -91,8 +93,7 @@ impl AnyReader { } } - -impl ArrayReader for AnyReader { +impl ArrayReader for AnyReader { fn num_buffers(&self) -> usize { match self { AnyReader::Boolean(r) => r.num_buffers(), @@ -101,7 +102,7 @@ impl ArrayReader for AnyReader { AnyReader::FixedSizeBinary(r) => r.num_buffers(), AnyReader::List(r) => r.num_buffers(), AnyReader::FixedSizeList(r) => r.num_buffers(), - AnyReader::Struct(r) => r.num_buffers(), + AnyReader::Struct(r) => r.num_buffers() } } @@ -113,7 +114,7 @@ impl ArrayReader for AnyReader { AnyReader::FixedSizeBinary(r) => r.len(), AnyReader::List(r) => r.len(), AnyReader::FixedSizeList(r) => r.len(), - AnyReader::Struct(r) => r.len(), + AnyReader::Struct(r) => r.len() } } @@ -125,66 +126,58 @@ impl ArrayReader for AnyReader { AnyReader::FixedSizeBinary(r) => r.read_slice(dst, offset, len), AnyReader::List(r) => r.read_slice(dst, offset, len), AnyReader::FixedSizeList(r) => r.read_slice(dst, offset, len), - AnyReader::Struct(r) => r.read_slice(dst, offset, len), + AnyReader::Struct(r) => r.read_slice(dst, offset, len) } } } - -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: BooleanReader) -> Self { AnyReader::Boolean(value) } } - -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: PrimitiveReader) -> Self { AnyReader::Primitive(value) } } - -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: BinaryReader) -> Self { AnyReader::Binary(value) } } - -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: FixedSizeBinaryReader) -> Self { AnyReader::FixedSizeBinary(value) } } - -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: AnyListReader) -> Self { AnyReader::List(Box::new(value)) } } -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: AnyFixedSizeListReader) -> Self { AnyReader::FixedSizeList(Box::new(value)) } } - -impl From> for AnyReader { +impl From> for AnyReader { fn from(value: StructReader) -> Self { AnyReader::Struct(value) } } - struct AnyReaderFactory<'a, F> { - factory: &'a mut F, + factory: &'a mut F } - -impl <'a, F: ReaderFactory> DataTypeVisitor for AnyReaderFactory<'a, F> { +impl<'a, F: ReaderFactory> DataTypeVisitor for AnyReaderFactory<'a, F> { type Result = anyhow::Result>; fn boolean(&mut self) -> Self::Result { @@ -205,22 +198,14 @@ impl <'a, F: ReaderFactory> DataTypeVisitor for AnyReaderFactory<'a, F> { let nulls = self.factory.nullmask()?; let offsets = self.factory.offset()?; let values = self.factory.native::()?; - let reader = BinaryReader::try_new( - nulls, - offsets, - NativeArrayReader::new(values) - )?; + let reader = BinaryReader::try_new(nulls, offsets, NativeArrayReader::new(values))?; Ok(reader.into()) } fn fixed_size_binary(&mut self, size: usize) -> Self::Result { let nulls = self.factory.nullmask()?; let values = self.factory.native::()?; - let reader = FixedSizeBinaryReader::try_new( - size, - nulls, - NativeArrayReader::new(values) - )?; + let reader = FixedSizeBinaryReader::try_new(size, nulls, NativeArrayReader::new(values))?; Ok(reader.into()) } @@ -228,32 +213,27 @@ impl <'a, F: ReaderFactory> DataTypeVisitor for AnyReaderFactory<'a, F> { let nulls = self.factory.nullmask()?; let offsets = self.factory.offset()?; let values = self.visit(item)?; - let reader = ListReader::try_new( - nulls, - offsets, - values - )?; + let reader = ListReader::try_new(nulls, offsets, values)?; Ok(reader.into()) } fn r#struct(&mut self, fields: &[FieldRef]) -> Self::Result { let nulls = self.factory.nullmask()?; - - let columns = fields.iter() + + let columns = fields + .iter() .map(|f| self.visit(f.data_type())) .collect::>>()?; - + let reader = StructReader::try_new(nulls, columns)?; - + Ok(reader.into()) } } - pub type AnyChunkedListReader = ChunkedListReader>; pub type AnyChunkedFixedSizeListReader = ChunkedFixedSizeListReader>; - pub enum AnyChunkedReader { Boolean(ChunkedBooleanReader), Primitive(ChunkedPrimitiveReader), @@ -264,21 +244,20 @@ pub enum AnyChunkedReader { Struct(ChunkedStructReader) } - impl AnyChunkedReader { pub fn new(data_type: &DataType) -> Self { Self::with_capacity(0, data_type) } - + pub fn with_capacity(cap: usize, data_type: &DataType) -> Self { AnyChunkedReaderFactory { cap, phantom_data: PhantomData::::default() - }.visit(data_type) + } + .visit(data_type) } } - impl ChunkedArrayReader for AnyChunkedReader { type Chunk = AnyReader; @@ -290,7 +269,7 @@ impl ChunkedArrayReader for AnyChunkedReader { AnyChunkedReader::FixedSizeBinary(r) => r.num_buffers(), AnyChunkedReader::List(r) => r.num_buffers(), AnyChunkedReader::FixedSizeList(r) => r.num_buffers(), - AnyChunkedReader::Struct(r) => r.num_buffers(), + AnyChunkedReader::Struct(r) => r.num_buffers() } } @@ -310,9 +289,8 @@ impl ChunkedArrayReader for AnyChunkedReader { fn read_chunked_ranges( &mut self, dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { match self { AnyChunkedReader::Boolean(r) => r.read_chunked_ranges(dst, ranges), AnyChunkedReader::Primitive(r) => r.read_chunked_ranges(dst, ranges), @@ -320,18 +298,16 @@ impl ChunkedArrayReader for AnyChunkedReader { AnyChunkedReader::FixedSizeBinary(r) => r.read_chunked_ranges(dst, ranges), AnyChunkedReader::List(r) => r.read_chunked_ranges(dst, ranges), AnyChunkedReader::FixedSizeList(r) => r.read_chunked_ranges(dst, ranges), - AnyChunkedReader::Struct(r) => r.read_chunked_ranges(dst, ranges), + AnyChunkedReader::Struct(r) => r.read_chunked_ranges(dst, ranges) } } } - struct AnyChunkedReaderFactory { cap: usize, phantom_data: PhantomData } - impl DataTypeVisitor for AnyChunkedReaderFactory { type Result = AnyChunkedReader; @@ -359,9 +335,7 @@ impl DataTypeVisitor for AnyChunkedReaderFactory { } fn list(&mut self, item: &DataType) -> Self::Result { - AnyChunkedReader::List(Box::new( - AnyChunkedListReader::new(self.cap, self.visit(item)) - )) + AnyChunkedReader::List(Box::new(AnyChunkedListReader::new(self.cap, self.visit(item)))) } fn r#struct(&mut self, fields: &[FieldRef]) -> Self::Result { @@ -369,4 +343,4 @@ impl DataTypeVisitor for AnyChunkedReaderFactory { let reader = ChunkedStructReader::new(self.cap, columns); AnyChunkedReader::Struct(reader) } -} \ No newline at end of file +} diff --git a/crates/array/src/reader/binary.rs b/crates/array/src/reader/binary.rs index 40a3ea84..eb87ce6a 100644 --- a/crates/array/src/reader/binary.rs +++ b/crates/array/src/reader/binary.rs @@ -1,9 +1,11 @@ -use crate::reader::native::{ChunkedNativeArrayReader, NativeArrayReader}; -use crate::reader::{ChunkedListReader, ChunkedFixedSizeListReader, FixedSizeListReader, ListReader, Reader}; - +use crate::reader::{ + native::{ChunkedNativeArrayReader, NativeArrayReader}, + ChunkedFixedSizeListReader, ChunkedListReader, FixedSizeListReader, ListReader, Reader +}; pub type BinaryReader = ListReader::Native>>; pub type ChunkedBinaryReader = ChunkedListReader::Native>>; pub type FixedSizeBinaryReader = FixedSizeListReader::Native>>; -pub type ChunkedFixedSizeBinaryReader = ChunkedFixedSizeListReader::Native>>; +pub type ChunkedFixedSizeBinaryReader = + ChunkedFixedSizeListReader::Native>>; diff --git a/crates/array/src/reader/boolean.rs b/crates/array/src/reader/boolean.rs index 4010ae94..cd4919cd 100644 --- a/crates/array/src/reader/boolean.rs +++ b/crates/array/src/reader/boolean.rs @@ -1,37 +1,31 @@ -use crate::chunking::ChunkRange; -use crate::reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, Reader}; -use crate::writer::ArrayWriter; use anyhow::ensure; +use crate::{ + chunking::ChunkRange, + reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, Reader}, + writer::ArrayWriter +}; pub struct BooleanReader { nulls: R::Nullmask, values: R::Bitmask } - -impl BooleanReader { +impl BooleanReader { pub fn new_unchecked(nulls: R::Nullmask, values: R::Bitmask) -> Self { - Self { - nulls, - values - } + Self { nulls, values } } - + pub fn try_new(nulls: R::Nullmask, values: R::Bitmask) -> anyhow::Result { ensure!( - nulls.len() == values.len(), + nulls.len() == values.len(), "null and value buffers have incompatible lengths" ); - Ok(Self { - nulls, - values - }) + Ok(Self { nulls, values }) } } - -impl ArrayReader for BooleanReader { +impl ArrayReader for BooleanReader { fn num_buffers(&self) -> usize { 2 } @@ -46,13 +40,11 @@ impl ArrayReader for BooleanReader { } } - pub struct ChunkedBooleanReader { nulls: Vec, values: Vec } - impl ChunkedBooleanReader { pub fn with_capacity(cap: usize) -> Self { Self { @@ -60,8 +52,7 @@ impl ChunkedBooleanReader { values: Vec::with_capacity(cap) } } -} - +} impl ChunkedArrayReader for ChunkedBooleanReader { type Chunk = BooleanReader; @@ -76,29 +67,20 @@ impl ChunkedArrayReader for ChunkedBooleanReader { } fn read_chunked_ranges( - &mut self, - dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + &mut self, + dst: &mut impl ArrayWriter, + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let nullmask_dst = dst.nullmask(0); for r in ranges.clone() { - self.nulls[r.chunk_index()].read_slice( - nullmask_dst, - r.offset_index(), - r.len_index() - )?; + self.nulls[r.chunk_index()].read_slice(nullmask_dst, r.offset_index(), r.len_index())?; } - + let bitmask_dst = dst.bitmask(1); for r in ranges { - self.values[r.chunk_index()].read_slice( - bitmask_dst, - r.offset_index(), - r.len_index() - )?; + self.values[r.chunk_index()].read_slice(bitmask_dst, r.offset_index(), r.len_index())?; } - + Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/reader/fixed_size_list.rs b/crates/array/src/reader/fixed_size_list.rs index 9940a560..d73c13e1 100644 --- a/crates/array/src/reader/fixed_size_list.rs +++ b/crates/array/src/reader/fixed_size_list.rs @@ -1,8 +1,10 @@ -use crate::chunking::ChunkRange; -use crate::reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, Reader}; -use crate::writer::ArrayWriter; use anyhow::ensure; +use crate::{ + chunking::ChunkRange, + reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, Reader}, + writer::ArrayWriter +}; pub struct FixedSizeListReader { size: usize, @@ -10,24 +12,20 @@ pub struct FixedSizeListReader { values: T } - -impl FixedSizeListReader { +impl FixedSizeListReader { pub fn try_new(size: usize, nulls: R::Nullmask, values: T) -> anyhow::Result { ensure!( nulls.len() * size == values.len(), "null and value buffers have incompatible lengths: {} * {} != {}", - nulls.len(), size, values.len() - ); - Ok(Self { + nulls.len(), size, - nulls, - values - }) + values.len() + ); + Ok(Self { size, nulls, values }) } } - -impl ArrayReader for FixedSizeListReader { +impl ArrayReader for FixedSizeListReader { fn num_buffers(&self) -> usize { 1 + self.values.num_buffers() } @@ -39,18 +37,17 @@ impl ArrayReader for FixedSizeListReader { fn read_slice(&mut self, dst: &mut impl ArrayWriter, offset: usize, len: usize) -> anyhow::Result<()> { self.nulls.read_slice(dst.nullmask(0), offset, len)?; - self.values.read_slice(&mut dst.shift(1), offset * self.size, len * self.size) + self.values + .read_slice(&mut dst.shift(1), offset * self.size, len * self.size) } } - pub struct ChunkedFixedSizeListReader { size: usize, nulls: Vec, values: T } - impl ChunkedFixedSizeListReader { pub fn new(size: usize, capacity: usize, values: T) -> Self { Self { @@ -61,8 +58,7 @@ impl ChunkedFixedSizeListReader { } } - -impl ChunkedArrayReader for ChunkedFixedSizeListReader { +impl ChunkedArrayReader for ChunkedFixedSizeListReader { type Chunk = FixedSizeListReader; fn num_buffers(&self) -> usize { @@ -77,17 +73,12 @@ impl ChunkedArrayReader for ChunkedFixedSizeL fn read_chunked_ranges( &mut self, dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let nullmask_dst = dst.nullmask(0); let mut ranges_len = 0; for r in ranges.clone() { - self.nulls[r.chunk_index()].read_slice( - nullmask_dst, - r.offset_index(), - r.len_index() - )?; + self.nulls[r.chunk_index()].read_slice(nullmask_dst, r.offset_index(), r.len_index())?; ranges_len += 1; } @@ -100,9 +91,7 @@ impl ChunkedArrayReader for ChunkedFixedSizeL }) } - self.values.read_chunked_ranges( - &mut dst.shift(1), - value_ranges.into_iter() - ) + self.values + .read_chunked_ranges(&mut dst.shift(1), value_ranges.into_iter()) } } diff --git a/crates/array/src/reader/list.rs b/crates/array/src/reader/list.rs index 140e82a2..e0d98de3 100644 --- a/crates/array/src/reader/list.rs +++ b/crates/array/src/reader/list.rs @@ -1,8 +1,10 @@ -use crate::chunking::ChunkRange; -use crate::reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, OffsetsReader, Reader}; -use crate::writer::ArrayWriter; use anyhow::ensure; +use crate::{ + chunking::ChunkRange, + reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, OffsetsReader, Reader}, + writer::ArrayWriter +}; pub struct ListReader { nulls: R::Nullmask, @@ -10,24 +12,19 @@ pub struct ListReader { values: T } - -impl ListReader { +impl ListReader { pub fn try_new(nulls: R::Nullmask, offsets: R::Offset, values: T) -> anyhow::Result { ensure!( - nulls.len() == offsets.len(), - "null and offset buffers have incompatible lengths: {} != {}", - nulls.len(), offsets.len() + nulls.len() == offsets.len(), + "null and offset buffers have incompatible lengths: {} != {}", + nulls.len(), + offsets.len() ); - Ok(Self { - nulls, - offsets, - values - }) + Ok(Self { nulls, offsets, values }) } } - -impl ArrayReader for ListReader { +impl ArrayReader for ListReader { fn num_buffers(&self) -> usize { 2 + self.values.num_buffers() } @@ -38,21 +35,20 @@ impl ArrayReader for ListReader { fn read_slice(&mut self, dst: &mut impl ArrayWriter, offset: usize, len: usize) -> anyhow::Result<()> { self.nulls.read_slice(dst.nullmask(0), offset, len)?; - + let value_range = self.offsets.read_slice(dst.offset(1), offset, len)?; - - self.values.read_slice(&mut dst.shift(2), value_range.start, value_range.len()) + + self.values + .read_slice(&mut dst.shift(2), value_range.start, value_range.len()) } } - pub struct ChunkedListReader { nulls: Vec, offsets: Vec, values: T } - impl ChunkedListReader { pub fn new(cap: usize, values: T) -> Self { Self { @@ -63,8 +59,7 @@ impl ChunkedListReader { } } - -impl ChunkedArrayReader for ChunkedListReader { +impl ChunkedArrayReader for ChunkedListReader { type Chunk = ListReader; fn num_buffers(&self) -> usize { @@ -80,28 +75,19 @@ impl ChunkedArrayReader for ChunkedListReader fn read_chunked_ranges( &mut self, dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let nullmask_dst = dst.nullmask(0); let mut ranges_len = 0; for r in ranges.clone() { - self.nulls[r.chunk_index()].read_slice( - nullmask_dst, - r.offset_index(), - r.len_index() - )?; + self.nulls[r.chunk_index()].read_slice(nullmask_dst, r.offset_index(), r.len_index())?; ranges_len += 1; } let offsets_dst = dst.offset(1); let mut value_ranges = Vec::with_capacity(ranges_len); for r in ranges { - let value_range = self.offsets[r.chunk_index()].read_slice( - offsets_dst, - r.offset_index(), - r.len_index() - )?; + let value_range = self.offsets[r.chunk_index()].read_slice(offsets_dst, r.offset_index(), r.len_index())?; if !value_range.is_empty() { value_ranges.push(ChunkRange { chunk: r.chunk, @@ -111,11 +97,9 @@ impl ChunkedArrayReader for ChunkedListReader } } - self.values.read_chunked_ranges( - &mut dst.shift(2), - value_ranges.iter().cloned() - )?; + self.values + .read_chunked_ranges(&mut dst.shift(2), value_ranges.iter().cloned())?; Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/reader/mod.rs b/crates/array/src/reader/mod.rs index 0fcb441c..d9cfa9f2 100644 --- a/crates/array/src/reader/mod.rs +++ b/crates/array/src/reader/mod.rs @@ -1,8 +1,11 @@ -use crate::chunking::ChunkRange; -use crate::writer::{ArrayWriter, BitmaskWriter, NativeWriter, OffsetsWriter}; -use arrow_buffer::ArrowNativeType; use std::ops::Range; +use arrow_buffer::ArrowNativeType; + +use crate::{ + chunking::ChunkRange, + writer::{ArrayWriter, BitmaskWriter, NativeWriter, OffsetsWriter} +}; mod any; mod binary; @@ -13,7 +16,6 @@ mod native; mod primitive; mod r#struct; - pub use any::*; pub use binary::*; pub use boolean::*; @@ -22,56 +24,37 @@ pub use list::*; pub use primitive::*; pub use r#struct::*; - pub trait BitmaskReader { fn len(&self) -> usize; - fn read_slice( - &mut self, - dst: &mut impl BitmaskWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()>; + fn read_slice(&mut self, dst: &mut impl BitmaskWriter, offset: usize, len: usize) -> anyhow::Result<()>; fn read(&mut self, dst: &mut impl BitmaskWriter) -> anyhow::Result<()> { self.read_slice(dst, 0, self.len()) } } - pub trait NativeReader { fn len(&self) -> usize; - fn read_slice( - &mut self, - dst: &mut impl NativeWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()>; - + fn read_slice(&mut self, dst: &mut impl NativeWriter, offset: usize, len: usize) -> anyhow::Result<()>; + fn read(&mut self, dst: &mut impl NativeWriter) -> anyhow::Result<()> { self.read_slice(dst, 0, self.len()) } } - pub trait OffsetsReader { fn len(&self) -> usize; - fn read_slice( - &mut self, - dst: &mut impl OffsetsWriter, - offset: usize, - len: usize - ) -> anyhow::Result>; - + fn read_slice(&mut self, dst: &mut impl OffsetsWriter, offset: usize, len: usize) -> anyhow::Result>; + #[inline] fn read(&mut self, dst: &mut impl OffsetsWriter) -> anyhow::Result> { self.read_slice(dst, 0, self.len()) } } - pub trait Reader { type Nullmask: BitmaskReader; type Bitmask: BitmaskReader; @@ -79,25 +62,18 @@ pub trait Reader { type Offset: OffsetsReader; } - pub trait ArrayReader { fn num_buffers(&self) -> usize; - + fn len(&self) -> usize; fn read(&mut self, dst: &mut impl ArrayWriter) -> anyhow::Result<()> { self.read_slice(dst, 0, self.len()) } - fn read_slice( - &mut self, - dst: &mut impl ArrayWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()>; + fn read_slice(&mut self, dst: &mut impl ArrayWriter, offset: usize, len: usize) -> anyhow::Result<()>; } - pub trait ChunkedArrayReader { type Chunk; @@ -108,11 +84,10 @@ pub trait ChunkedArrayReader { fn read_chunked_ranges( &mut self, dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone + ranges: impl Iterator + Clone ) -> anyhow::Result<()>; } - pub trait ReaderFactory { type Reader: Reader; @@ -123,4 +98,4 @@ pub trait ReaderFactory { fn native(&mut self) -> anyhow::Result<::Native>; fn offset(&mut self) -> anyhow::Result<::Offset>; -} \ No newline at end of file +} diff --git a/crates/array/src/reader/native.rs b/crates/array/src/reader/native.rs index 747f19a5..ffa40e55 100644 --- a/crates/array/src/reader/native.rs +++ b/crates/array/src/reader/native.rs @@ -1,23 +1,20 @@ -use crate::chunking::ChunkRange; -use crate::reader::{ArrayReader, ChunkedArrayReader, NativeReader}; -use crate::writer::ArrayWriter; - +use crate::{ + chunking::ChunkRange, + reader::{ArrayReader, ChunkedArrayReader, NativeReader}, + writer::ArrayWriter +}; pub struct NativeArrayReader { native_reader: R } - impl NativeArrayReader { pub fn new(native_reader: R) -> Self { - Self { - native_reader - } + Self { native_reader } } } - -impl ArrayReader for NativeArrayReader { +impl ArrayReader for NativeArrayReader { #[inline] fn num_buffers(&self) -> usize { 1 @@ -34,22 +31,19 @@ impl ArrayReader for NativeArrayReader { } } - pub struct ChunkedNativeArrayReader { chunks: Vec } - impl ChunkedNativeArrayReader { pub fn with_capacity(cap: usize) -> Self { Self { chunks: Vec::with_capacity(cap) } } -} - +} -impl ChunkedArrayReader for ChunkedNativeArrayReader { +impl ChunkedArrayReader for ChunkedNativeArrayReader { type Chunk = NativeArrayReader; fn num_buffers(&self) -> usize { @@ -61,19 +55,14 @@ impl ChunkedArrayReader for ChunkedNativeArrayReader { } fn read_chunked_ranges( - &mut self, - dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + &mut self, + dst: &mut impl ArrayWriter, + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let dst = dst.native(0); for r in ranges { - self.chunks[r.chunk_index()].read_slice( - dst, - r.offset_index(), - r.len_index() - )?; + self.chunks[r.chunk_index()].read_slice(dst, r.offset_index(), r.len_index())?; } Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/reader/primitive.rs b/crates/array/src/reader/primitive.rs index f60b903f..def50bc8 100644 --- a/crates/array/src/reader/primitive.rs +++ b/crates/array/src/reader/primitive.rs @@ -1,30 +1,27 @@ -use crate::chunking::ChunkRange; -use crate::reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, NativeReader, Reader}; -use crate::writer::ArrayWriter; use anyhow::ensure; +use crate::{ + chunking::ChunkRange, + reader::{ArrayReader, BitmaskReader, ChunkedArrayReader, NativeReader, Reader}, + writer::ArrayWriter +}; pub struct PrimitiveReader { nulls: R::Nullmask, values: R::Native } - -impl PrimitiveReader { +impl PrimitiveReader { pub fn try_new(nulls: R::Nullmask, values: R::Native) -> anyhow::Result { ensure!( - nulls.len() == values.len(), + nulls.len() == values.len(), "null and value buffers have incompatible lengths" ); - Ok(Self { - nulls, - values - }) + Ok(Self { nulls, values }) } } - -impl ArrayReader for PrimitiveReader { +impl ArrayReader for PrimitiveReader { fn num_buffers(&self) -> usize { 2 } @@ -39,13 +36,11 @@ impl ArrayReader for PrimitiveReader { } } - pub struct ChunkedPrimitiveReader { nulls: Vec, values: Vec } - impl ChunkedPrimitiveReader { pub fn with_capacity(cap: usize) -> Self { Self { @@ -55,7 +50,6 @@ impl ChunkedPrimitiveReader { } } - impl ChunkedArrayReader for ChunkedPrimitiveReader { type Chunk = PrimitiveReader; @@ -69,29 +63,20 @@ impl ChunkedArrayReader for ChunkedPrimitiveReader { } fn read_chunked_ranges( - &mut self, + &mut self, dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let nullmask_dst = dst.nullmask(0); for r in ranges.clone() { - self.nulls[r.chunk_index()].read_slice( - nullmask_dst, - r.offset_index(), - r.len_index() - )?; + self.nulls[r.chunk_index()].read_slice(nullmask_dst, r.offset_index(), r.len_index())?; } let values_dst = dst.native(1); for r in ranges { - self.values[r.chunk_index()].read_slice( - values_dst, - r.offset_index(), - r.len_index() - )?; + self.values[r.chunk_index()].read_slice(values_dst, r.offset_index(), r.len_index())?; } Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/reader/struct.rs b/crates/array/src/reader/struct.rs index 3e67a166..6993a6eb 100644 --- a/crates/array/src/reader/struct.rs +++ b/crates/array/src/reader/struct.rs @@ -1,35 +1,27 @@ -use crate::chunking::ChunkRange; -use crate::reader::any::AnyReader; -use crate::reader::{AnyChunkedReader, ArrayReader, BitmaskReader, ChunkedArrayReader, Reader}; -use crate::writer::ArrayWriter; use anyhow::ensure; +use crate::{ + chunking::ChunkRange, + reader::{any::AnyReader, AnyChunkedReader, ArrayReader, BitmaskReader, ChunkedArrayReader, Reader}, + writer::ArrayWriter +}; pub struct StructReader { nulls: R::Nullmask, columns: Vec> } - -impl StructReader { +impl StructReader { pub fn try_new(nulls: R::Nullmask, columns: Vec>) -> anyhow::Result { let len = nulls.len(); for (i, c) in columns.iter().enumerate() { - ensure!( - len == c.len(), - "null mask and column {} have incompatible lengths", - i - ); + ensure!(len == c.len(), "null mask and column {} have incompatible lengths", i); } - Ok(Self { - nulls, - columns - }) + Ok(Self { nulls, columns }) } } - -impl ArrayReader for StructReader { +impl ArrayReader for StructReader { fn num_buffers(&self) -> usize { 1 + self.columns.iter().map(|c| c.num_buffers()).sum::() } @@ -51,13 +43,11 @@ impl ArrayReader for StructReader { } } - pub struct ChunkedStructReader { nulls: Vec, columns: Vec> } - impl ChunkedStructReader { pub fn new(cap: usize, columns: Vec>) -> Self { Self { @@ -67,7 +57,6 @@ impl ChunkedStructReader { } } - impl ChunkedArrayReader for ChunkedStructReader { type Chunk = StructReader; @@ -86,16 +75,11 @@ impl ChunkedArrayReader for ChunkedStructReader { fn read_chunked_ranges( &mut self, dst: &mut impl ArrayWriter, - ranges: impl Iterator + Clone - ) -> anyhow::Result<()> - { + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let nullmask_dst = dst.nullmask(0); for r in ranges.clone() { - self.nulls[r.chunk_index()].read_slice( - nullmask_dst, - r.offset_index(), - r.len_index() - )?; + self.nulls[r.chunk_index()].read_slice(nullmask_dst, r.offset_index(), r.len_index())?; } let mut shift = 1; @@ -109,34 +93,25 @@ impl ChunkedArrayReader for ChunkedStructReader { } } - pub struct TableReader { columns: Vec> } - impl TableReader { pub fn try_new(columns: Vec>) -> anyhow::Result { let len = columns.first().map_or(0, |c| c.len()); for (i, c) in columns.iter().enumerate() { - ensure!( - len == c.len(), - "columns 0 and {} have different lengths", - i - ); + ensure!(len == c.len(), "columns 0 and {} have different lengths", i); } - Ok(Self { - columns - }) + Ok(Self { columns }) } - + pub fn column_reader(&mut self, i: usize) -> &mut AnyReader { &mut self.columns[i] } } - -impl ArrayReader for TableReader { +impl ArrayReader for TableReader { fn num_buffers(&self) -> usize { self.columns.iter().map(|c| c.num_buffers()).sum() } @@ -155,22 +130,17 @@ impl ArrayReader for TableReader { } } - pub struct ChunkedTableReader { columns: Vec> } - -impl ChunkedTableReader { +impl ChunkedTableReader { pub fn new(columns: Vec>) -> Self { - Self { - columns - } + Self { columns } } } - -impl ChunkedArrayReader for ChunkedTableReader { +impl ChunkedArrayReader for ChunkedTableReader { type Chunk = TableReader; fn num_buffers(&self) -> usize { @@ -184,7 +154,11 @@ impl ChunkedArrayReader for ChunkedTableReader { } } - fn read_chunked_ranges(&mut self, dst: &mut impl ArrayWriter, ranges: impl Iterator + Clone) -> anyhow::Result<()> { + fn read_chunked_ranges( + &mut self, + dst: &mut impl ArrayWriter, + ranges: impl Iterator + Clone + ) -> anyhow::Result<()> { let mut shift = 0; for col in self.columns.iter_mut() { col.read_chunked_ranges(&mut dst.shift(shift), ranges.clone())?; @@ -192,4 +166,4 @@ impl ChunkedArrayReader for ChunkedTableReader { } Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/schema_metadata.rs b/crates/array/src/schema_metadata.rs index 4e6775cf..38d450f5 100644 --- a/crates/array/src/schema_metadata.rs +++ b/crates/array/src/schema_metadata.rs @@ -1,34 +1,28 @@ -use crate::schema_patch::SchemaPatch; +use std::fmt::Write; + use anyhow::{ensure, Context}; use arrow::datatypes::{Schema, SchemaRef}; -use std::fmt::Write; +use crate::schema_patch::SchemaPatch; pub const SQD_SORT_KEY: &'static str = "sqd_sort_key"; - pub fn get_sort_key(schema: &Schema) -> anyhow::Result> { if let Some(key) = schema.metadata().get(SQD_SORT_KEY) { - parse_sort_key(key, schema.fields().len()).with_context(|| { - format!("invalid sqd_sort_key - `{}`", key) - }) + parse_sort_key(key, schema.fields().len()).with_context(|| format!("invalid sqd_sort_key - `{}`", key)) } else { Ok(Vec::new()) } } - pub fn set_sort_key(schema: SchemaRef, key: &[usize]) -> SchemaRef { let mut patch = SchemaPatch::new(schema); patch.set_sort_key(key); patch.finish() } - fn parse_sort_key(key: &str, num_columns: usize) -> anyhow::Result> { - let indexes = key.split(',').map(|s| { - s.parse() - }).collect::, _>>()?; + let indexes = key.split(',').map(|s| s.parse()).collect::, _>>()?; ensure!( indexes.iter().all(|i| *i < num_columns), @@ -38,7 +32,6 @@ fn parse_sort_key(key: &str, num_columns: usize) -> anyhow::Result> { Ok(indexes) } - pub fn print_sort_key(key: &[usize]) -> String { let mut out = String::new(); if key.len() > 0 { diff --git a/crates/array/src/schema_patch.rs b/crates/array/src/schema_patch.rs index 6fe93448..6d2a3c54 100644 --- a/crates/array/src/schema_patch.rs +++ b/crates/array/src/schema_patch.rs @@ -1,21 +1,19 @@ -use crate::schema_metadata::{print_sort_key, SQD_SORT_KEY}; +use std::{collections::HashMap, sync::Arc}; + use arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}; -use std::collections::HashMap; -use std::sync::Arc; +use crate::schema_metadata::{print_sort_key, SQD_SORT_KEY}; struct NewSchema { fields: Vec, metadata: HashMap } - pub struct SchemaPatch { original: SchemaRef, new_schema: Option } - impl SchemaPatch { pub fn new(schema: SchemaRef) -> Self { Self { @@ -39,13 +37,12 @@ impl SchemaPatch { } pub fn find_by_name(&self, name: &str) -> Option<(usize, FieldRef)> { - self.fields().iter() + self.fields() + .iter() .enumerate() - .find_map(|(i, f)| { - (f.name() == name).then_some((i, f.clone())) - }) + .find_map(|(i, f)| (f.name() == name).then_some((i, f.clone()))) } - + pub fn metadata_mut(&mut self) -> &mut HashMap { &mut self.new_schema_mut().metadata } @@ -55,14 +52,10 @@ impl SchemaPatch { if field.data_type() == &ty { return; } - let new_field = Field::new( - field.name(), - ty, - field.is_nullable() - ); + let new_field = Field::new(field.name(), ty, field.is_nullable()); self.set_field(index, Arc::new(new_field)) } - + pub fn insert_metadata(&mut self, key: impl ToString, val: impl ToString) { let key = key.to_string(); let val = val.to_string(); @@ -96,12 +89,11 @@ impl SchemaPatch { } pub fn finish(self) -> SchemaRef { - self.new_schema.map(|new_schema| { - let schema = Schema::new_with_metadata( - new_schema.fields, - new_schema.metadata - ); - Arc::new(schema) - }).unwrap_or(self.original) + self.new_schema + .map(|new_schema| { + let schema = Schema::new_with_metadata(new_schema.fields, new_schema.metadata); + Arc::new(schema) + }) + .unwrap_or(self.original) } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/any.rs b/crates/array/src/slice/any.rs index 6b16dab2..b8d5ceb9 100644 --- a/crates/array/src/slice/any.rs +++ b/crates/array/src/slice/any.rs @@ -1,16 +1,21 @@ -use crate::index::RangeList; -use crate::slice::boolean::BooleanSlice; -use crate::slice::fixed_size_list::FixedSizeListSlice; -use crate::slice::list::ListSlice; -use crate::slice::primitive::PrimitiveSlice; -use crate::slice::r#struct::AnyStructSlice; -use crate::slice::{AsSlice, Slice}; -use crate::writer::ArrayWriter; -use arrow::array::{Array, AsArray}; -use arrow::datatypes::{DataType, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, TimestampMillisecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type}; -use std::ops::Range; -use std::sync::Arc; +use std::{ops::Range, sync::Arc}; +use arrow::{ + array::{Array, AsArray}, + datatypes::{ + DataType, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, TimestampMillisecondType, TimestampSecondType, + UInt16Type, UInt32Type, UInt64Type, UInt8Type + } +}; + +use crate::{ + index::RangeList, + slice::{ + boolean::BooleanSlice, fixed_size_list::FixedSizeListSlice, list::ListSlice, primitive::PrimitiveSlice, + r#struct::AnyStructSlice, AsSlice, Slice + }, + writer::ArrayWriter +}; #[derive(Clone)] pub enum AnySlice<'a> { @@ -32,7 +37,6 @@ pub enum AnySlice<'a> { Struct(AnyStructSlice<'a>) } - impl<'a> AnySlice<'a> { pub fn as_bool(&self) -> BooleanSlice<'a> { match self { @@ -133,8 +137,7 @@ impl<'a> AnySlice<'a> { } } - -impl <'a> Slice for AnySlice<'a> { +impl<'a> Slice for AnySlice<'a> { fn num_buffers(&self) -> usize { match self { AnySlice::Boolean(s) => s.num_buffers(), @@ -152,7 +155,7 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.num_buffers(), AnySlice::List(s) => s.num_buffers(), AnySlice::FixedSizeList(s) => s.num_buffers(), - AnySlice::Struct(s) => s.num_buffers(), + AnySlice::Struct(s) => s.num_buffers() } } @@ -173,7 +176,7 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.byte_size(), AnySlice::List(s) => s.byte_size(), AnySlice::FixedSizeList(s) => s.byte_size(), - AnySlice::Struct(s) => s.byte_size(), + AnySlice::Struct(s) => s.byte_size() } } @@ -194,7 +197,7 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.len(), AnySlice::List(s) => s.len(), AnySlice::FixedSizeList(s) => s.len(), - AnySlice::Struct(s) => s.len(), + AnySlice::Struct(s) => s.len() } } @@ -215,7 +218,7 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => AnySlice::FixedSizeBinary(s.slice(offset, len)), AnySlice::List(s) => AnySlice::List(s.slice(offset, len)), AnySlice::FixedSizeList(s) => AnySlice::FixedSizeList(s.slice(offset, len)), - AnySlice::Struct(s) => AnySlice::Struct(s.slice(offset, len)), + AnySlice::Struct(s) => AnySlice::Struct(s.slice(offset, len)) } } @@ -236,7 +239,7 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.write(dst), AnySlice::List(s) => s.write(dst), AnySlice::FixedSizeList(s) => s.write(dst), - AnySlice::Struct(s) => s.write(dst), + AnySlice::Struct(s) => s.write(dst) } } @@ -257,7 +260,7 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.write_range(dst, range), AnySlice::List(s) => s.write_range(dst, range), AnySlice::FixedSizeList(s) => s.write_range(dst, range), - AnySlice::Struct(s) => s.write_range(dst, range), + AnySlice::Struct(s) => s.write_range(dst, range) } } @@ -278,16 +281,15 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.write_ranges(dst, ranges), AnySlice::List(s) => s.write_ranges(dst, ranges), AnySlice::FixedSizeList(s) => s.write_ranges(dst, ranges), - AnySlice::Struct(s) => s.write_ranges(dst, ranges), + AnySlice::Struct(s) => s.write_ranges(dst, ranges) } } fn write_indexes( - &self, - dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + &self, + dst: &mut impl ArrayWriter, + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { match self { AnySlice::Boolean(s) => s.write_indexes(dst, indexes), AnySlice::UInt8(s) => s.write_indexes(dst, indexes), @@ -304,31 +306,26 @@ impl <'a> Slice for AnySlice<'a> { AnySlice::FixedSizeBinary(s) => s.write_indexes(dst, indexes), AnySlice::List(s) => s.write_indexes(dst, indexes), AnySlice::FixedSizeList(s) => s.write_indexes(dst, indexes), - AnySlice::Struct(s) => s.write_indexes(dst, indexes), + AnySlice::Struct(s) => s.write_indexes(dst, indexes) } } } - #[derive(Clone)] pub struct AnyListItem<'a> { item: Arc> } - impl<'a> AnyListItem<'a> { pub fn new(item: AnySlice<'a>) -> Self { - Self { - item: Arc::new(item) - } + Self { item: Arc::new(item) } } - + pub fn item(&self) -> AnySlice<'a> { self.item.as_ref().clone() } } - impl<'a> Slice for AnyListItem<'a> { #[inline] fn num_buffers(&self) -> usize { @@ -366,17 +363,15 @@ impl<'a> Slice for AnyListItem<'a> { #[inline] fn write_indexes( - &self, - dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + &self, + dst: &mut impl ArrayWriter, + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { self.item.write_indexes(dst, indexes) } } - -impl <'a> From<&'a dyn Array> for AnySlice<'a> { +impl<'a> From<&'a dyn Array> for AnySlice<'a> { fn from(value: &'a dyn Array) -> Self { match value.data_type() { DataType::Boolean => AnySlice::Boolean(value.as_boolean().into()), @@ -390,10 +385,10 @@ impl <'a> From<&'a dyn Array> for AnySlice<'a> { DataType::UInt64 => AnySlice::UInt64(value.as_primitive::().into()), DataType::Timestamp(TimeUnit::Second, _) => { AnySlice::Int64(value.as_primitive::().into()) - }, + } DataType::Timestamp(TimeUnit::Millisecond, _) => { AnySlice::Int64(value.as_primitive::().into()) - }, + } DataType::Binary => AnySlice::Binary(value.as_binary::().into()), DataType::FixedSizeBinary(_) => AnySlice::FixedSizeBinary(value.as_fixed_size_binary().into()), DataType::Utf8 => AnySlice::Binary(value.as_string::().into()), @@ -405,75 +400,62 @@ impl <'a> From<&'a dyn Array> for AnySlice<'a> { } } - impl<'a> AsSlice for &'a dyn Array { - type Slice<'b> = AnySlice<'b> where Self: 'b; + type Slice<'b> + = AnySlice<'b> + where + Self: 'b; fn as_slice(&self) -> Self::Slice<'a> { AnySlice::from(*self) } } - -impl <'a> From> for AnySlice<'a> { +impl<'a> From> for AnySlice<'a> { fn from(value: BooleanSlice<'a>) -> Self { AnySlice::Boolean(value) } } - -impl <'a> From> for AnySlice<'a> { +impl<'a> From> for AnySlice<'a> { fn from(value: ListSlice<'a, &'a [u8]>) -> Self { AnySlice::Binary(value) } } - -impl <'a> From> for AnySlice<'a> { +impl<'a> From> for AnySlice<'a> { fn from(value: FixedSizeListSlice<'a, &'a [u8]>) -> Self { AnySlice::FixedSizeBinary(value) } } - -impl <'a, T: Slice + Into>> From> for AnySlice<'a> { +impl<'a, T: Slice + Into>> From> for AnySlice<'a> { fn from(value: ListSlice<'a, T>) -> Self { let nulls = value.nulls(); let offsets = value.offsets(); - let items = AnyListItem::new( - value.values().into() - ); - AnySlice::List( - ListSlice::new(offsets, items, nulls.bitmask()) - ) + let items = AnyListItem::new(value.values().into()); + AnySlice::List(ListSlice::new(offsets, items, nulls.bitmask())) } } - -impl <'a, T: Slice + Into>> From> for AnySlice<'a> { +impl<'a, T: Slice + Into>> From> for AnySlice<'a> { fn from(value: FixedSizeListSlice<'a, T>) -> Self { let size = value.size(); let nulls = value.nulls(); - let items = AnyListItem::new( - value.values().into() - ); - AnySlice::FixedSizeList( - FixedSizeListSlice::new(size, items, nulls.bitmask()) - ) + let items = AnyListItem::new(value.values().into()); + AnySlice::FixedSizeList(FixedSizeListSlice::new(size, items, nulls.bitmask())) } } - -impl <'a> From> for AnySlice<'a> { +impl<'a> From> for AnySlice<'a> { fn from(value: AnyStructSlice<'a>) -> Self { AnySlice::Struct(value) } } - macro_rules! impl_from_prim { ($t:ty, $v:ident) => { - impl <'a> From> for AnySlice<'a> { + impl<'a> From> for AnySlice<'a> { fn from(value: PrimitiveSlice<'a, $t>) -> Self { AnySlice::$v(value) } diff --git a/crates/array/src/slice/bitmask.rs b/crates/array/src/slice/bitmask.rs index a6fb8159..318b7428 100644 --- a/crates/array/src/slice/bitmask.rs +++ b/crates/array/src/slice/bitmask.rs @@ -1,8 +1,8 @@ -use crate::index::RangeList; -use crate::writer::BitmaskWriter; -use arrow_buffer::{bit_util, BooleanBuffer}; use std::ops::Range; +use arrow_buffer::{bit_util, BooleanBuffer}; + +use crate::{index::RangeList, writer::BitmaskWriter}; #[derive(Clone)] pub struct BitmaskSlice<'a> { @@ -11,17 +11,12 @@ pub struct BitmaskSlice<'a> { len: usize } - impl<'a> BitmaskSlice<'a> { pub fn new(data: &'a [u8], offset: usize, len: usize) -> Self { assert!(offset + len <= data.len() * 8); - Self { - data, - offset, - len - } + Self { data, offset, len } } - + #[inline] pub fn len(&self) -> usize { self.len @@ -41,14 +36,12 @@ impl<'a> BitmaskSlice<'a> { len } } - + #[inline] pub fn value(&self, i: usize) -> bool { assert!(self.offset + i < self.len); // SAFETY: bounds should be guaranteed by construction and the above assertion - unsafe { - bit_util::get_bit_raw(self.data.as_ptr(), self.offset + i) - } + unsafe { bit_util::get_bit_raw(self.data.as_ptr(), self.offset + i) } } pub fn write(&self, dst: &mut impl BitmaskWriter) -> anyhow::Result<()> { @@ -57,28 +50,22 @@ impl<'a> BitmaskSlice<'a> { pub fn write_range(&self, dst: &mut impl BitmaskWriter, range: Range) -> anyhow::Result<()> { if range.is_empty() { - return Ok(()) + return Ok(()); } dst.write_slice(self.data, self.offset + range.start, range.len()) } - - pub fn write_ranges( - &self, - dst: &mut impl BitmaskWriter, - ranges: &mut impl RangeList - ) -> anyhow::Result<()> - { + + pub fn write_ranges(&self, dst: &mut impl BitmaskWriter, ranges: &mut impl RangeList) -> anyhow::Result<()> { dst.write_slice_ranges(self.data, &mut ranges.shift(self.offset, self.len)) } pub fn write_indexes( &self, dst: &mut impl BitmaskWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { dst.write_slice_indexes( - self.data, + self.data, indexes.map(|i| { assert!(i < self.len); self.offset + i @@ -87,7 +74,6 @@ impl<'a> BitmaskSlice<'a> { } } - impl<'a> From<&'a BooleanBuffer> for BitmaskSlice<'a> { fn from(value: &'a BooleanBuffer) -> Self { Self { @@ -96,4 +82,4 @@ impl<'a> From<&'a BooleanBuffer> for BitmaskSlice<'a> { len: value.len() } } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/boolean.rs b/crates/array/src/slice/boolean.rs index b6ec31b4..bd01b90f 100644 --- a/crates/array/src/slice/boolean.rs +++ b/crates/array/src/slice/boolean.rs @@ -1,12 +1,13 @@ -use crate::access::Access; -use crate::index::RangeList; -use crate::slice::bitmask::BitmaskSlice; -use crate::slice::nullmask::NullmaskSlice; -use crate::slice::Slice; -use crate::writer::ArrayWriter; -use arrow::array::BooleanArray; use std::ops::Range; +use arrow::array::BooleanArray; + +use crate::{ + access::Access, + index::RangeList, + slice::{bitmask::BitmaskSlice, nullmask::NullmaskSlice, Slice}, + writer::ArrayWriter +}; #[derive(Clone)] pub struct BooleanSlice<'a> { @@ -14,26 +15,21 @@ pub struct BooleanSlice<'a> { values: BitmaskSlice<'a> } - -impl <'a> BooleanSlice<'a> { +impl<'a> BooleanSlice<'a> { pub fn new(values: BitmaskSlice<'a>, nulls: Option>) -> Self { Self { nulls: NullmaskSlice::new(values.len(), nulls), values } } - + pub fn with_nullmask(values: BitmaskSlice<'a>, nulls: NullmaskSlice<'a>) -> Self { assert_eq!(values.len(), nulls.len()); - Self { - nulls, - values - } + Self { nulls, values } } } - -impl <'a> Slice for BooleanSlice<'a> { +impl<'a> Slice for BooleanSlice<'a> { fn num_buffers(&self) -> usize { 2 } @@ -69,18 +65,16 @@ impl <'a> Slice for BooleanSlice<'a> { } fn write_indexes( - &self, - dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + &self, + dst: &mut impl ArrayWriter, + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { self.nulls.write_indexes(dst.nullmask(0), indexes.clone())?; self.values.write_indexes(dst.bitmask(1), indexes) } } - -impl <'a> From<&'a BooleanArray> for BooleanSlice<'a> { +impl<'a> From<&'a BooleanArray> for BooleanSlice<'a> { fn from(value: &'a BooleanArray) -> Self { Self { nulls: NullmaskSlice::from_array(value), @@ -89,8 +83,7 @@ impl <'a> From<&'a BooleanArray> for BooleanSlice<'a> { } } - -impl <'a> Access for BooleanSlice<'a> { +impl<'a> Access for BooleanSlice<'a> { type Value = bool; #[inline] @@ -107,4 +100,4 @@ impl <'a> Access for BooleanSlice<'a> { fn has_nulls(&self) -> bool { self.nulls.has_nulls() } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/fixed_size_list.rs b/crates/array/src/slice/fixed_size_list.rs index 29563484..6708524c 100644 --- a/crates/array/src/slice/fixed_size_list.rs +++ b/crates/array/src/slice/fixed_size_list.rs @@ -1,17 +1,19 @@ -use crate::access::Access; -use crate::index::{RangeList, RangeListFromIterator}; -use crate::slice::bitmask::BitmaskSlice; -use crate::slice::nullmask::NullmaskSlice; -use crate::slice::{AnyListItem, AnySlice, Slice}; -use crate::writer::ArrayWriter; -use arrow::array::{FixedSizeBinaryArray, FixedSizeListArray}; use std::ops::Range; +use arrow::array::{FixedSizeBinaryArray, FixedSizeListArray}; + +use crate::{ + access::Access, + index::{RangeList, RangeListFromIterator}, + slice::{bitmask::BitmaskSlice, nullmask::NullmaskSlice, AnyListItem, AnySlice, Slice}, + writer::ArrayWriter +}; + #[derive(Clone)] pub struct FixedSizeListSlice<'a, T: Clone> { size: usize, nulls: NullmaskSlice<'a>, - values: T, + values: T } impl<'a, T: Slice> FixedSizeListSlice<'a, T> { @@ -21,7 +23,7 @@ impl<'a, T: Slice> FixedSizeListSlice<'a, T> { Self { size, nulls: NullmaskSlice::new(len, nulls), - values, + values } } @@ -62,7 +64,7 @@ impl<'a, T: Slice> Slice for FixedSizeListSlice<'a, T> { Self { size: self.size, nulls: self.nulls.slice(offset, len), - values: self.values.slice(offset * self.size, len * self.size), + values: self.values.slice(offset * self.size, len * self.size) } } @@ -84,23 +86,18 @@ impl<'a, T: Slice> Slice for FixedSizeListSlice<'a, T> { self.values.write_range(&mut dst.shift(1), value_range) } - fn write_ranges( - &self, - dst: &mut impl ArrayWriter, - ranges: &mut impl RangeList, - ) -> anyhow::Result<()> { + fn write_ranges(&self, dst: &mut impl ArrayWriter, ranges: &mut impl RangeList) -> anyhow::Result<()> { self.nulls.write_ranges(dst.nullmask(0), ranges)?; let mut value_ranges = ranges.scale(self.size); - self.values - .write_ranges(&mut dst.shift(1), &mut value_ranges) + self.values.write_ranges(&mut dst.shift(1), &mut value_ranges) } fn write_indexes( &self, dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone, + indexes: impl Iterator + Clone ) -> anyhow::Result<()> { self.nulls.write_indexes(dst.nullmask(0), indexes.clone())?; @@ -110,10 +107,8 @@ impl<'a, T: Slice> Slice for FixedSizeListSlice<'a, T> { beg..end }); - self.values.write_ranges( - &mut dst.shift(1), - &mut RangeListFromIterator::new(item_ranges), - ) + self.values + .write_ranges(&mut dst.shift(1), &mut RangeListFromIterator::new(item_ranges)) } } @@ -122,7 +117,7 @@ impl<'a> From<&'a FixedSizeBinaryArray> for FixedSizeListSlice<'a, &'a [u8]> { Self { size: value.value_length() as usize, nulls: NullmaskSlice::from_array(value), - values: value.values().as_ref(), + values: value.values().as_ref() } } } @@ -132,7 +127,7 @@ impl<'a> From<&'a FixedSizeListArray> for FixedSizeListSlice<'a, AnySlice<'a>> { Self { size: value.value_length() as usize, nulls: NullmaskSlice::from_array(value), - values: value.values().as_ref().into(), + values: value.values().as_ref().into() } } } @@ -142,7 +137,7 @@ impl<'a> From<&'a FixedSizeListArray> for FixedSizeListSlice<'a, AnyListItem<'a> Self { size: value.value_length() as usize, nulls: NullmaskSlice::from_array(value), - values: AnyListItem::new(value.values().as_ref().into()), + values: AnyListItem::new(value.values().as_ref().into()) } } } diff --git a/crates/array/src/slice/list.rs b/crates/array/src/slice/list.rs index 077d536a..8ca368b6 100644 --- a/crates/array/src/slice/list.rs +++ b/crates/array/src/slice/list.rs @@ -1,14 +1,17 @@ -use crate::access::Access; -use crate::index::{MaterializedRangeList, RangeList, RangeListFromIterator}; -use crate::offsets::Offsets; -use crate::slice::bitmask::BitmaskSlice; -use crate::slice::nullmask::NullmaskSlice; -use crate::slice::{AnyListItem, AnySlice, Slice}; -use crate::writer::{ArrayWriter, OffsetsWriter}; -use arrow::array::{GenericByteArray, ListArray}; -use arrow::datatypes::ByteArrayType; use std::ops::Range; +use arrow::{ + array::{GenericByteArray, ListArray}, + datatypes::ByteArrayType +}; + +use crate::{ + access::Access, + index::{MaterializedRangeList, RangeList, RangeListFromIterator}, + offsets::Offsets, + slice::{bitmask::BitmaskSlice, nullmask::NullmaskSlice, AnyListItem, AnySlice, Slice}, + writer::{ArrayWriter, OffsetsWriter} +}; #[derive(Clone)] pub struct ListSlice<'a, T: Clone> { @@ -17,7 +20,6 @@ pub struct ListSlice<'a, T: Clone> { values: T } - impl<'a, T: Slice> ListSlice<'a, T> { pub fn new(offsets: Offsets<'a>, values: T, nulls: Option>) -> Self { assert!(offsets.last_index() <= values.len()); @@ -27,7 +29,7 @@ impl<'a, T: Slice> ListSlice<'a, T> { values } } - + #[inline] pub fn nulls(&self) -> NullmaskSlice<'a> { self.nulls.clone() @@ -44,8 +46,7 @@ impl<'a, T: Slice> ListSlice<'a, T> { } } - -impl <'a, T: Slice> Slice for ListSlice<'a, T> { +impl<'a, T: Slice> Slice for ListSlice<'a, T> { #[inline] fn num_buffers(&self) -> usize { 2 + self.values.num_buffers() @@ -80,7 +81,7 @@ impl <'a, T: Slice> Slice for ListSlice<'a, T> { fn write_range(&self, dst: &mut impl ArrayWriter, range: Range) -> anyhow::Result<()> { if range.is_empty() { - return Ok(()) + return Ok(()); } self.nulls.write_range(dst.nullmask(0), range.clone())?; @@ -96,31 +97,25 @@ impl <'a, T: Slice> Slice for ListSlice<'a, T> { self.nulls.write_ranges(dst.nullmask(0), ranges)?; dst.offset(1).write_slice_ranges(self.offsets, ranges)?; - - let mut value_ranges = MaterializedRangeList::from_iter( - ranges.iter().map(|r| { - let beg = self.offsets.index(r.start); - let end = self.offsets.index(r.end); - beg..end - }) - ); - - self.values.write_ranges( - &mut dst.shift(2), - &mut value_ranges - ) + + let mut value_ranges = MaterializedRangeList::from_iter(ranges.iter().map(|r| { + let beg = self.offsets.index(r.start); + let end = self.offsets.index(r.end); + beg..end + })); + + self.values.write_ranges(&mut dst.shift(2), &mut value_ranges) } fn write_indexes( &self, dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { self.nulls.write_indexes(dst.nullmask(0), indexes.clone())?; dst.offset(1).write_slice_indexes(self.offsets, indexes.clone())?; - + let item_ranges = indexes.filter_map(|i| { let beg = self.offsets.index(i); let end = self.offsets.index(i + 1); @@ -130,13 +125,13 @@ impl <'a, T: Slice> Slice for ListSlice<'a, T> { None } }); - - self.values.write_ranges(&mut dst.shift(2), &mut RangeListFromIterator::new(item_ranges)) + + self.values + .write_ranges(&mut dst.shift(2), &mut RangeListFromIterator::new(item_ranges)) } } - -impl <'a, T: ByteArrayType> From<&'a GenericByteArray> for ListSlice<'a, &'a [u8]> { +impl<'a, T: ByteArrayType> From<&'a GenericByteArray> for ListSlice<'a, &'a [u8]> { fn from(value: &'a GenericByteArray) -> Self { Self { nulls: NullmaskSlice::from_array(value), @@ -146,7 +141,6 @@ impl <'a, T: ByteArrayType> From<&'a GenericByteArray> for ListSl } } - impl<'a> From<&'a ListArray> for ListSlice<'a, AnySlice<'a>> { fn from(value: &'a ListArray) -> Self { Self { @@ -157,7 +151,6 @@ impl<'a> From<&'a ListArray> for ListSlice<'a, AnySlice<'a>> { } } - impl<'a> From<&'a ListArray> for ListSlice<'a, AnyListItem<'a>> { fn from(value: &'a ListArray) -> Self { Self { @@ -168,8 +161,7 @@ impl<'a> From<&'a ListArray> for ListSlice<'a, AnyListItem<'a>> { } } - -impl <'a, T> Access for ListSlice<'a, &'a [T]>{ +impl<'a, T> Access for ListSlice<'a, &'a [T]> { type Value = &'a [T]; #[inline] @@ -190,13 +182,12 @@ impl <'a, T> Access for ListSlice<'a, &'a [T]>{ } } - impl<'a, T: Ord> ListSlice<'a, &'a [T]> { pub fn max(&self) -> Option<&'a [T]> { if self.has_nulls() { - (0..self.nulls.len()).flat_map(|i| { - self.is_valid(i).then(|| self.get(i)) - }).max() + (0..self.nulls.len()) + .flat_map(|i| self.is_valid(i).then(|| self.get(i))) + .max() } else { (0..self.nulls.len()).map(|i| self.get(i)).max() } @@ -204,11 +195,11 @@ impl<'a, T: Ord> ListSlice<'a, &'a [T]> { pub fn min(&self) -> Option<&'a [T]> { if self.has_nulls() { - (0..self.nulls.len()).flat_map(|i| { - self.is_valid(i).then(|| self.get(i)) - }).min() + (0..self.nulls.len()) + .flat_map(|i| self.is_valid(i).then(|| self.get(i))) + .min() } else { (0..self.nulls.len()).map(|i| self.get(i)).min() } } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/mod.rs b/crates/array/src/slice/mod.rs index 84989471..9e7bd886 100644 --- a/crates/array/src/slice/mod.rs +++ b/crates/array/src/slice/mod.rs @@ -1,7 +1,6 @@ -use crate::index::RangeList; -use crate::writer::ArrayWriter; use std::ops::Range; +use crate::{index::RangeList, writer::ArrayWriter}; mod any; pub mod bitmask; @@ -13,7 +12,6 @@ pub mod nullmask; mod primitive; mod r#struct; - pub use any::*; pub use boolean::*; pub use fixed_size_list::*; @@ -21,18 +19,17 @@ pub use list::*; pub use primitive::*; pub use r#struct::*; - pub trait Slice: Clone { fn num_buffers(&self) -> usize; fn byte_size(&self) -> usize; - + fn len(&self) -> usize; - + fn slice(&self, offset: usize, len: usize) -> Self; - + fn write(&self, dst: &mut impl ArrayWriter) -> anyhow::Result<()>; - + #[inline] fn write_range(&self, dst: &mut impl ArrayWriter, range: Range) -> anyhow::Result<()> { self.slice(range.start, range.len()).write(dst) @@ -41,15 +38,16 @@ pub trait Slice: Clone { fn write_ranges(&self, dst: &mut impl ArrayWriter, ranges: &mut impl RangeList) -> anyhow::Result<()>; fn write_indexes( - &self, - dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone + &self, + dst: &mut impl ArrayWriter, + indexes: impl Iterator + Clone ) -> anyhow::Result<()>; } - pub trait AsSlice { - type Slice<'a>: Slice where Self: 'a; + type Slice<'a>: Slice + where + Self: 'a; fn as_slice(&self) -> Self::Slice<'_>; -} \ No newline at end of file +} diff --git a/crates/array/src/slice/native.rs b/crates/array/src/slice/native.rs index 3d58d74e..ba107f90 100644 --- a/crates/array/src/slice/native.rs +++ b/crates/array/src/slice/native.rs @@ -1,11 +1,14 @@ -use crate::index::RangeList; -use crate::slice::Slice; -use crate::writer::{ArrayWriter, NativeWriter}; -use arrow_buffer::ArrowNativeType; use std::ops::Range; +use arrow_buffer::ArrowNativeType; + +use crate::{ + index::RangeList, + slice::Slice, + writer::{ArrayWriter, NativeWriter} +}; -impl <'a, T: ArrowNativeType> Slice for &'a [T] { +impl<'a, T: ArrowNativeType> Slice for &'a [T] { #[inline] fn num_buffers(&self) -> usize { 1 @@ -42,12 +45,7 @@ impl <'a, T: ArrowNativeType> Slice for &'a [T] { } #[inline] - fn write_indexes( - &self, - dst: &mut impl ArrayWriter, - indexes: impl Iterator - ) -> anyhow::Result<()> - { + fn write_indexes(&self, dst: &mut impl ArrayWriter, indexes: impl Iterator) -> anyhow::Result<()> { dst.native(0).write_slice_indexes(self, indexes) } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/nullmask.rs b/crates/array/src/slice/nullmask.rs index 4d65d5ff..60a2e3a7 100644 --- a/crates/array/src/slice/nullmask.rs +++ b/crates/array/src/slice/nullmask.rs @@ -1,9 +1,8 @@ -use crate::index::RangeList; -use crate::slice::bitmask::BitmaskSlice; -use crate::writer::BitmaskWriter; -use arrow::array::Array; use std::ops::Range; +use arrow::array::Array; + +use crate::{index::RangeList, slice::bitmask::BitmaskSlice, writer::BitmaskWriter}; #[derive(Clone)] pub struct NullmaskSlice<'a> { @@ -11,30 +10,26 @@ pub struct NullmaskSlice<'a> { len: usize } - impl<'a> NullmaskSlice<'a> { pub fn from_array(array: &'a dyn Array) -> Self { Self::new(array.len(), array.nulls().map(|b| b.inner().into())) } - + pub fn new(len: usize, nulls: Option>) -> Self { if let Some(nulls) = nulls.as_ref() { assert_eq!(nulls.len(), len); } - Self { - nulls, - len - } + Self { nulls, len } } - + pub fn len(&self) -> usize { self.len } - + pub fn byte_size(&self) -> usize { self.nulls.as_ref().map(|m| m.bytes_size()).unwrap_or(0) } - + pub fn slice(&self, offset: usize, len: usize) -> Self { assert!(offset + len <= self.len); Self { @@ -42,21 +37,21 @@ impl<'a> NullmaskSlice<'a> { len } } - + #[inline] pub fn is_valid(&self, i: usize) -> bool { assert!(i < self.len); self.nulls.as_ref().map(|nulls| nulls.value(i)).unwrap_or(true) } - + pub fn bitmask(&self) -> Option> { self.nulls.clone() } - + pub fn has_nulls(&self) -> bool { self.nulls.is_some() } - + pub fn write(&self, dst: &mut impl BitmaskWriter) -> anyhow::Result<()> { if let Some(nulls) = self.nulls.as_ref() { nulls.write(dst) @@ -67,7 +62,7 @@ impl<'a> NullmaskSlice<'a> { pub fn write_range(&self, dst: &mut impl BitmaskWriter, range: Range) -> anyhow::Result<()> { if range.is_empty() { - return Ok(()) + return Ok(()); } if let Some(nulls) = self.nulls.as_ref() { nulls.write_range(dst, range) @@ -75,7 +70,7 @@ impl<'a> NullmaskSlice<'a> { dst.write_many(true, range.len()) } } - + pub fn write_ranges(&self, dst: &mut impl BitmaskWriter, ranges: &mut impl RangeList) -> anyhow::Result<()> { if let Some(nulls) = self.nulls.as_ref() { nulls.write_ranges(dst, ranges) @@ -87,13 +82,12 @@ impl<'a> NullmaskSlice<'a> { pub fn write_indexes( &self, dst: &mut impl BitmaskWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { if let Some(nulls) = self.nulls.as_ref() { nulls.write_indexes(dst, indexes) } else { dst.write_many(true, indexes.count()) } } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/primitive.rs b/crates/array/src/slice/primitive.rs index 51dd85f0..7a96d8df 100644 --- a/crates/array/src/slice/primitive.rs +++ b/crates/array/src/slice/primitive.rs @@ -1,13 +1,14 @@ -use crate::access::Access; -use crate::index::RangeList; -use crate::slice::bitmask::BitmaskSlice; -use crate::slice::nullmask::NullmaskSlice; -use crate::slice::Slice; -use crate::writer::{ArrayWriter, NativeWriter}; +use std::ops::Range; + use arrow::array::{ArrowPrimitiveType, PrimitiveArray}; use arrow_buffer::ArrowNativeType; -use std::ops::Range; +use crate::{ + access::Access, + index::RangeList, + slice::{bitmask::BitmaskSlice, nullmask::NullmaskSlice, Slice}, + writer::{ArrayWriter, NativeWriter} +}; #[derive(Clone)] pub struct PrimitiveSlice<'a, T> { @@ -15,26 +16,24 @@ pub struct PrimitiveSlice<'a, T> { values: &'a [T] } - -impl <'a, T> PrimitiveSlice<'a, T> { +impl<'a, T> PrimitiveSlice<'a, T> { pub fn new(values: &'a [T], nulls: Option>) -> Self { Self { nulls: NullmaskSlice::new(values.len(), nulls), values } } - + pub fn nulls(&self) -> NullmaskSlice<'a> { self.nulls.clone() - } - + } + pub fn values(&self) -> &'a [T] { self.values } } - -impl <'a, T: ArrowNativeType> Slice for PrimitiveSlice<'a, T> { +impl<'a, T: ArrowNativeType> Slice for PrimitiveSlice<'a, T> { #[inline] fn num_buffers(&self) -> usize { 2 @@ -74,18 +73,16 @@ impl <'a, T: ArrowNativeType> Slice for PrimitiveSlice<'a, T> { } fn write_indexes( - &self, + &self, dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { self.nulls.write_indexes(dst.nullmask(0), indexes.clone())?; dst.native(1).write_slice_indexes(self.values, indexes) } } - -impl <'a, T: ArrowPrimitiveType> From<&'a PrimitiveArray> for PrimitiveSlice<'a, T::Native> { +impl<'a, T: ArrowPrimitiveType> From<&'a PrimitiveArray> for PrimitiveSlice<'a, T::Native> { fn from(value: &'a PrimitiveArray) -> Self { Self { nulls: NullmaskSlice::from_array(value), @@ -94,8 +91,7 @@ impl <'a, T: ArrowPrimitiveType> From<&'a PrimitiveArray> for PrimitiveSlice< } } - -impl <'a, T: ArrowNativeType> Access for PrimitiveSlice<'a, T> { +impl<'a, T: ArrowNativeType> Access for PrimitiveSlice<'a, T> { type Value = T; #[inline] @@ -114,13 +110,14 @@ impl <'a, T: ArrowNativeType> Access for PrimitiveSlice<'a, T> { } } - impl<'a, T: ArrowNativeType + Ord> PrimitiveSlice<'a, T> { pub fn max(&self) -> Option { if self.nulls.has_nulls() { - self.values.iter().enumerate().filter_map(|(i, v)| { - self.nulls.is_valid(i).then_some(*v) - }).max() + self.values + .iter() + .enumerate() + .filter_map(|(i, v)| self.nulls.is_valid(i).then_some(*v)) + .max() } else { self.values.iter().max().copied() } @@ -128,11 +125,13 @@ impl<'a, T: ArrowNativeType + Ord> PrimitiveSlice<'a, T> { pub fn min(&self) -> Option { if self.nulls.has_nulls() { - self.values.iter().enumerate().filter_map(|(i, v)| { - self.nulls.is_valid(i).then_some(*v) - }).min() + self.values + .iter() + .enumerate() + .filter_map(|(i, v)| self.nulls.is_valid(i).then_some(*v)) + .min() } else { self.values.iter().min().copied() } } -} \ No newline at end of file +} diff --git a/crates/array/src/slice/struct.rs b/crates/array/src/slice/struct.rs index 5fb1c31c..b728c45a 100644 --- a/crates/array/src/slice/struct.rs +++ b/crates/array/src/slice/struct.rs @@ -1,12 +1,12 @@ -use crate::index::RangeList; -use crate::slice::any::AnySlice; -use crate::slice::nullmask::NullmaskSlice; -use crate::slice::{AsSlice, Slice}; -use crate::writer::ArrayWriter; +use std::{ops::Range, sync::Arc}; + use arrow::array::{Array, RecordBatch, StructArray}; -use std::ops::Range; -use std::sync::Arc; +use crate::{ + index::RangeList, + slice::{any::AnySlice, nullmask::NullmaskSlice, AsSlice, Slice}, + writer::ArrayWriter +}; #[derive(Clone)] pub struct AnyStructSlice<'a> { @@ -16,7 +16,6 @@ pub struct AnyStructSlice<'a> { len: usize } - impl<'a> AnyStructSlice<'a> { pub fn new(nulls: NullmaskSlice<'a>, columns: Arc<[AnySlice<'a>]>) -> Self { let len = nulls.len(); @@ -30,7 +29,7 @@ impl<'a> AnyStructSlice<'a> { len } } - + pub fn has_nulls(&self) -> bool { self.nulls.has_nulls() } @@ -38,16 +37,16 @@ impl<'a> AnyStructSlice<'a> { pub fn nulls(&self) -> NullmaskSlice<'a> { self.nulls.slice(self.offset, self.len) } - + pub fn column(&self, i: usize) -> AnySlice<'a> { self.columns[i].slice(self.offset, self.len) } - + pub fn num_columns(&self) -> usize { self.columns.len() } - - pub fn project(&self, fields: impl Iterator) -> Self { + + pub fn project(&self, fields: impl Iterator) -> Self { let columns = fields.map(|i| self.columns[i].clone()).collect(); Self { nulls: self.nulls.clone(), @@ -56,7 +55,7 @@ impl<'a> AnyStructSlice<'a> { len: self.len } } - + fn write_src_range(&self, dst: &mut impl ArrayWriter, range: Range) -> anyhow::Result<()> { self.nulls.write_range(dst.nullmask(0), range.clone())?; @@ -69,7 +68,6 @@ impl<'a> AnyStructSlice<'a> { } } - impl<'a> Slice for AnyStructSlice<'a> { fn num_buffers(&self) -> usize { 1 + self.columns.iter().map(|c| c.num_buffers()).sum::() @@ -104,34 +102,38 @@ impl<'a> Slice for AnyStructSlice<'a> { } fn write_ranges(&self, dst: &mut impl ArrayWriter, ranges: &mut impl RangeList) -> anyhow::Result<()> { - self.nulls.slice(self.offset, self.len).write_ranges(dst.nullmask(0), ranges)?; - + self.nulls + .slice(self.offset, self.len) + .write_ranges(dst.nullmask(0), ranges)?; + let mut shift = 1; for c in self.columns.iter() { - c.slice(self.offset, self.len).write_ranges(&mut dst.shift(shift), ranges)?; + c.slice(self.offset, self.len) + .write_ranges(&mut dst.shift(shift), ranges)?; shift += c.num_buffers(); } Ok(()) } fn write_indexes( - &self, - dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { - self.nulls.slice(self.offset, self.len).write_indexes(dst.nullmask(0), indexes.clone())?; + &self, + dst: &mut impl ArrayWriter, + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { + self.nulls + .slice(self.offset, self.len) + .write_indexes(dst.nullmask(0), indexes.clone())?; let mut shift = 1; for c in self.columns.iter() { - c.slice(self.offset, self.len).write_indexes(&mut dst.shift(shift), indexes.clone())?; + c.slice(self.offset, self.len) + .write_indexes(&mut dst.shift(shift), indexes.clone())?; shift += c.num_buffers(); } Ok(()) } } - impl<'a> From<&'a StructArray> for AnyStructSlice<'a> { fn from(value: &'a StructArray) -> Self { Self { @@ -143,7 +145,6 @@ impl<'a> From<&'a StructArray> for AnyStructSlice<'a> { } } - impl AsSlice for StructArray { type Slice<'a> = AnyStructSlice<'a>; @@ -152,7 +153,6 @@ impl AsSlice for StructArray { } } - #[derive(Clone)] pub struct AnyTableSlice<'a> { columns: Arc<[AnySlice<'a>]>, @@ -160,7 +160,6 @@ pub struct AnyTableSlice<'a> { len: usize } - impl<'a> AnyTableSlice<'a> { pub fn new(columns: Arc<[AnySlice<'a>]>) -> Self { let len = columns.first().map_or(0, |c| c.len()); @@ -192,7 +191,6 @@ impl<'a> AnyTableSlice<'a> { } } - impl<'a> Slice for AnyTableSlice<'a> { fn num_buffers(&self) -> usize { self.columns.iter().map(|c| c.num_buffers()).sum() @@ -228,7 +226,8 @@ impl<'a> Slice for AnyTableSlice<'a> { fn write_ranges(&self, dst: &mut impl ArrayWriter, ranges: &mut impl RangeList) -> anyhow::Result<()> { let mut shift = 0; for c in self.columns.iter() { - c.slice(self.offset, self.len).write_ranges(&mut dst.shift(shift), ranges)?; + c.slice(self.offset, self.len) + .write_ranges(&mut dst.shift(shift), ranges)?; shift += c.num_buffers(); } Ok(()) @@ -237,20 +236,19 @@ impl<'a> Slice for AnyTableSlice<'a> { fn write_indexes( &self, dst: &mut impl ArrayWriter, - indexes: impl Iterator + Clone - ) -> anyhow::Result<()> - { + indexes: impl Iterator + Clone + ) -> anyhow::Result<()> { let mut shift = 0; for c in self.columns.iter() { - c.slice(self.offset, self.len).write_indexes(&mut dst.shift(shift), indexes.clone())?; + c.slice(self.offset, self.len) + .write_indexes(&mut dst.shift(shift), indexes.clone())?; shift += c.num_buffers(); } Ok(()) } } - -impl<'a> From<&'a RecordBatch> for AnyTableSlice<'a> { +impl<'a> From<&'a RecordBatch> for AnyTableSlice<'a> { fn from(value: &'a RecordBatch) -> Self { Self { columns: value.columns().iter().map(|c| c.as_ref().into()).collect(), @@ -260,7 +258,6 @@ impl<'a> From<&'a RecordBatch> for AnyTableSlice<'a> { } } - impl AsSlice for RecordBatch { type Slice<'a> = AnyTableSlice<'a>; @@ -269,7 +266,6 @@ impl AsSlice for RecordBatch { } } - impl<'a> From<&'a AnyStructSlice<'a>> for AnyTableSlice<'a> { fn from(value: &'a AnyStructSlice<'a>) -> Self { Self { @@ -280,7 +276,6 @@ impl<'a> From<&'a AnyStructSlice<'a>> for AnyTableSlice<'a> { } } - impl<'a> From<&'a StructArray> for AnyTableSlice<'a> { fn from(value: &'a StructArray) -> Self { Self { @@ -289,4 +284,4 @@ impl<'a> From<&'a StructArray> for AnyTableSlice<'a> { len: value.len() } } -} \ No newline at end of file +} diff --git a/crates/array/src/sort/functions.rs b/crates/array/src/sort/functions.rs index 1133aad5..60230041 100644 --- a/crates/array/src/sort/functions.rs +++ b/crates/array/src/sort/functions.rs @@ -1,31 +1,30 @@ -use crate::builder::AnyTableBuilder; -use crate::slice::{AsSlice, Slice}; -use crate::sort::sort_table_to_indexes; use arrow::array::RecordBatch; +use crate::{ + builder::AnyTableBuilder, + slice::{AsSlice, Slice}, + sort::sort_table_to_indexes +}; pub fn sort_record_batch<'a>( records: &'a RecordBatch, - by_columns: impl IntoIterator -) -> anyhow::Result -{ - let columns = by_columns.into_iter().map(|name| { - Ok(records.schema_ref().index_of(name)?) - }).collect::>>()?; - + by_columns: impl IntoIterator +) -> anyhow::Result { + let columns = by_columns + .into_iter() + .map(|name| Ok(records.schema_ref().index_of(name)?)) + .collect::>>()?; + sort_record_batch_impl(records, &columns) } - #[inline(never)] fn sort_record_batch_impl(records: &RecordBatch, columns: &[usize]) -> anyhow::Result { let slice = records.as_slice(); let indexes = sort_table_to_indexes(&slice, columns); - + let mut builder = AnyTableBuilder::new(records.schema()); slice.write_indexes(&mut builder, indexes.iter().copied())?; - - Ok(unsafe { - builder.finish_unchecked() - }) -} \ No newline at end of file + + Ok(unsafe { builder.finish_unchecked() }) +} diff --git a/crates/array/src/sort/mod.rs b/crates/array/src/sort/mod.rs index ce28b9c4..2458e05c 100644 --- a/crates/array/src/sort/mod.rs +++ b/crates/array/src/sort/mod.rs @@ -1,7 +1,6 @@ +mod functions; pub mod order; mod sorting; -mod functions; - pub use functions::*; -pub use sorting::*; \ No newline at end of file +pub use sorting::*; diff --git a/crates/array/src/sort/order.rs b/crates/array/src/sort/order.rs index bbe5f141..30fb1a75 100644 --- a/crates/array/src/sort/order.rs +++ b/crates/array/src/sort/order.rs @@ -1,16 +1,14 @@ use std::cmp::Ordering; -use crate::access::Access; +use crate::access::Access; pub trait Order { fn compare(&self, a: Idx, b: Idx) -> Ordering; } - pub struct OrderPair(pub A, pub B); - -impl , B: Order> Order for OrderPair { +impl, B: Order> Order for OrderPair { fn compare(&self, a: Idx, b: Idx) -> Ordering { match self.0.compare(a, b) { Ordering::Equal => self.1.compare(a, b), @@ -19,21 +17,18 @@ impl , B: Order> Order for OrderPair { } } - pub struct OrderList { list: Vec } - -impl OrderList { +impl OrderList { pub fn new(list: Vec) -> Self { assert!(list.len() > 0); Self { list } } } - -impl > Order for OrderList { +impl> Order for OrderList { fn compare(&self, a: Idx, b: Idx) -> Ordering { for o in self.list.iter() { match o.compare(a, b) { @@ -45,8 +40,7 @@ impl > Order for OrderList { } } - -impl > Order for T { +impl> Order for T { #[inline] fn compare(&self, a: usize, b: usize) -> Ordering { match (self.is_valid(a), self.is_valid(b)) { @@ -58,29 +52,25 @@ impl > Order for T { } } - pub struct IgnoreNulls(pub T); - -impl > Order for IgnoreNulls { +impl> Order for IgnoreNulls { #[inline] fn compare(&self, a: usize, b: usize) -> Ordering { self.0.get(a).cmp(&self.0.get(b)) } } - -impl <'a> Order for &'a dyn Order { +impl<'a> Order for &'a dyn Order { #[inline] fn compare(&self, a: usize, b: usize) -> Ordering { (*self).compare(a, b) } } - -impl <'a> Order for Box + 'a> { +impl<'a> Order for Box + 'a> { #[inline] fn compare(&self, a: usize, b: usize) -> Ordering { self.as_ref().compare(a, b) } -} \ No newline at end of file +} diff --git a/crates/array/src/sort/sorting.rs b/crates/array/src/sort/sorting.rs index e1972d9d..8eb9d4f1 100644 --- a/crates/array/src/sort/sorting.rs +++ b/crates/array/src/sort/sorting.rs @@ -1,7 +1,8 @@ -use crate::access::Access; -use crate::slice::{AnySlice, AnyTableSlice, FixedSizeListSlice, ListSlice, Slice}; -use crate::sort::order::{IgnoreNulls, Order, OrderList, OrderPair}; - +use crate::{ + access::Access, + slice::{AnySlice, AnyTableSlice, FixedSizeListSlice, ListSlice, Slice}, + sort::order::{IgnoreNulls, Order, OrderList, OrderPair} +}; macro_rules! with_order { ($slice:expr, $order:ident, $cb: expr) => { @@ -19,46 +20,41 @@ macro_rules! with_order { AnySlice::Float64(_) => panic!("sorting on f64 is not supported"), AnySlice::Binary(s) => dispatch_nulls!(s, $order, $cb), AnySlice::FixedSizeBinary(s) => dispatch_nulls!(s, $order, $cb), - AnySlice::List(list) => { - match list.values().item() { - AnySlice::UInt16(it) if !it.has_nulls() => { - let slice = ListSlice::new(list.offsets(), it.values(), list.nulls().bitmask()); - dispatch_nulls!(slice, $order, $cb) - }, - AnySlice::UInt32(it) if !it.has_nulls() => { - let slice = ListSlice::new(list.offsets(), it.values(), list.nulls().bitmask()); - dispatch_nulls!(slice, $order, $cb) - }, - AnySlice::Int32(it) if !it.has_nulls() => { - let slice = ListSlice::new(list.offsets(), it.values(), list.nulls().bitmask()); - dispatch_nulls!(slice, $order, $cb) - }, - _ => panic!("only lists of non-nullable u16, u32 and i32 items are sortable") + AnySlice::List(list) => match list.values().item() { + AnySlice::UInt16(it) if !it.has_nulls() => { + let slice = ListSlice::new(list.offsets(), it.values(), list.nulls().bitmask()); + dispatch_nulls!(slice, $order, $cb) + } + AnySlice::UInt32(it) if !it.has_nulls() => { + let slice = ListSlice::new(list.offsets(), it.values(), list.nulls().bitmask()); + dispatch_nulls!(slice, $order, $cb) } + AnySlice::Int32(it) if !it.has_nulls() => { + let slice = ListSlice::new(list.offsets(), it.values(), list.nulls().bitmask()); + dispatch_nulls!(slice, $order, $cb) + } + _ => panic!("only lists of non-nullable u16, u32 and i32 items are sortable") }, - AnySlice::FixedSizeList(list) => { - match list.values().item() { - AnySlice::UInt16(it) if !it.has_nulls() => { - let slice = FixedSizeListSlice::new(list.size(), it.values(), list.nulls().bitmask()); - dispatch_nulls!(slice, $order, $cb) - }, - AnySlice::UInt32(it) if !it.has_nulls() => { - let slice = FixedSizeListSlice::new(list.size(), it.values(), list.nulls().bitmask()); - dispatch_nulls!(slice, $order, $cb) - }, - AnySlice::Int32(it) if !it.has_nulls() => { - let slice = FixedSizeListSlice::new(list.size(), it.values(), list.nulls().bitmask()); - dispatch_nulls!(slice, $order, $cb) - }, - _ => panic!("only fixed size lists of non-nullable u16, u32 and i32 items are sortable") + AnySlice::FixedSizeList(list) => match list.values().item() { + AnySlice::UInt16(it) if !it.has_nulls() => { + let slice = FixedSizeListSlice::new(list.size(), it.values(), list.nulls().bitmask()); + dispatch_nulls!(slice, $order, $cb) + } + AnySlice::UInt32(it) if !it.has_nulls() => { + let slice = FixedSizeListSlice::new(list.size(), it.values(), list.nulls().bitmask()); + dispatch_nulls!(slice, $order, $cb) } + AnySlice::Int32(it) if !it.has_nulls() => { + let slice = FixedSizeListSlice::new(list.size(), it.values(), list.nulls().bitmask()); + dispatch_nulls!(slice, $order, $cb) + } + _ => panic!("only fixed size lists of non-nullable u16, u32 and i32 items are sortable") }, - AnySlice::Struct(_) => panic!("sorting on structs is not supported"), + AnySlice::Struct(_) => panic!("sorting on structs is not supported") } }; } - macro_rules! dispatch_nulls { ($slice:ident, $order:ident, $cb: expr) => { if $slice.has_nulls() { @@ -77,26 +73,17 @@ macro_rules! sort { }; } - fn to_dyn_order<'a>(slice: AnySlice<'a>) -> Box + 'a> { - with_order!(slice, order, { - Box::new(order) - }) + with_order!(slice, order, { Box::new(order) }) } - pub fn sort_to_indexes(array: &AnySlice<'_>) -> Vec { let mut indexes: Vec = (0..array.len()).collect(); sort_1(&mut indexes, array); indexes } - -pub fn sort_table_to_indexes( - table: &AnyTableSlice<'_>, - columns: &[usize] -) -> Vec -{ +pub fn sort_table_to_indexes(table: &AnyTableSlice<'_>, columns: &[usize]) -> Vec { macro_rules! col { ($i:expr) => { table.column(columns[$i]) @@ -115,12 +102,10 @@ pub fn sort_table_to_indexes( indexes } - fn sort_1(indexes: &mut [usize], col: &AnySlice<'_>) { with_order!(col, order, sort!(indexes, order)) } - fn sort_2(indexes: &mut [usize], c1: AnySlice<'_>, c2: AnySlice<'_>) { let order2 = to_dyn_order(c2); with_order!(c1, order1, { @@ -129,10 +114,11 @@ fn sort_2(indexes: &mut [usize], c1: AnySlice<'_>, c2: AnySlice<'_>) { }) } - fn sort_many(indexes: &mut [usize], table: &AnyTableSlice<'_>, columns: &[usize]) { let order2 = OrderList::new( - columns[1..].iter().copied() + columns[1..] + .iter() + .copied() .map(|i| to_dyn_order(table.column(i))) .collect() ); @@ -140,4 +126,4 @@ fn sort_many(indexes: &mut [usize], table: &AnyTableSlice<'_>, columns: &[usize] let order = OrderPair(order1, order2); sort!(indexes, order) }) -} \ No newline at end of file +} diff --git a/crates/array/src/util.rs b/crates/array/src/util.rs index a8d13847..bdc4f5b5 100644 --- a/crates/array/src/util.rs +++ b/crates/array/src/util.rs @@ -1,35 +1,27 @@ -use arrow::datatypes::{DataType, Fields}; -use std::cmp::Ordering; -use std::ops::AddAssign; +use std::{cmp::Ordering, ops::AddAssign}; +use arrow::datatypes::{DataType, Fields}; #[inline] -pub fn validate_offsets( - offsets: &[I], - mut prev: I -) -> Result<(), &'static str> -{ +pub fn validate_offsets(offsets: &[I], mut prev: I) -> Result<(), &'static str> { if offsets.len() == 0 { - return Err("offsets slice can't be empty") + return Err("offsets slice can't be empty"); } - + if offsets[0] < I::default() { - return Err("found negative offset value") + return Err("found negative offset value"); } for &val in offsets.iter() { if val < prev { - return Err( - "offset values are not monotonically increasing" - ) + return Err("offset values are not monotonically increasing"); } prev = val } - + Ok(()) } - macro_rules! invalid_buffer_access { () => { panic!("invalid arrow buffer access") @@ -37,19 +29,17 @@ macro_rules! invalid_buffer_access { } pub(crate) use invalid_buffer_access; - pub mod bit_tools { - use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk; - use arrow_buffer::bit_util; use std::ops::Range; + use arrow_buffer::{bit_chunk_iterator::UnalignedBitChunk, bit_util}; pub fn all_valid(data: &[u8], offset: usize, len: usize) -> bool { // TODO: optimize UnalignedBitChunk::new(data, offset, len).count_ones() == len } - - pub fn all_indexes_valid(data: &[u8], indexes: impl Iterator) -> Option { + + pub fn all_indexes_valid(data: &[u8], indexes: impl Iterator) -> Option { let mut len = 0; for i in indexes { if !bit_util::get_bit(data, i) { @@ -59,8 +49,8 @@ pub mod bit_tools { } Some(len) } - - pub fn all_ranges_valid(data: &[u8], ranges: impl Iterator>) -> Option { + + pub fn all_ranges_valid(data: &[u8], ranges: impl Iterator>) -> Option { let mut len = 0; for r in ranges { if !all_valid(data, r.start, r.len()) { @@ -72,20 +62,15 @@ pub mod bit_tools { } } - pub fn bisect_offsets(offsets: &[I], idx: I) -> Option { let mut beg = 0; let mut end = offsets.len() - 1; - while end - beg > 1 { + while end - beg > 1 { let mid = beg + (end - beg) / 2; match offsets[mid].cmp(&idx) { Ordering::Equal => return Some(mid), - Ordering::Less => { - beg = mid - }, - Ordering::Greater => { - end = mid - } + Ordering::Less => beg = mid, + Ordering::Greater => end = mid } } if offsets[beg] <= idx && idx < offsets[end] { @@ -95,24 +80,20 @@ pub fn bisect_offsets(offsets: &[I], idx: I) -> Option { } } - pub fn get_offset_position(offsets: &[I], index: I, first_to_try: usize) -> usize { let beg = offsets[first_to_try]; if beg <= index { if index < offsets[first_to_try + 1] { first_to_try } else { - first_to_try + bisect_offsets(&offsets[first_to_try..], index) - .expect("index is out of bounds") + first_to_try + bisect_offsets(&offsets[first_to_try..], index).expect("index is out of bounds") } } else { - bisect_offsets(&offsets[0..first_to_try + 1], index) - .expect("index is out of bounds") + bisect_offsets(&offsets[0..first_to_try + 1], index).expect("index is out of bounds") } } - -pub fn build_offsets(first: I, lengths: impl Iterator) -> Vec { +pub fn build_offsets(first: I, lengths: impl Iterator) -> Vec { let mut vec = Vec::with_capacity(1 + lengths.size_hint().0); let mut last = first; vec.push(last); @@ -123,53 +104,36 @@ pub fn build_offsets(first: I, lengths: impl Iterator Vec { - build_offsets(start_pos, fields.iter().map(|f| { - get_num_buffers(f.data_type()) - })) + build_offsets(start_pos, fields.iter().map(|f| get_num_buffers(f.data_type()))) } - pub fn get_num_buffers(data_type: &DataType) -> usize { match data_type { - DataType::Boolean | - DataType::Int8 | - DataType::Int16 | - DataType::Int32 | - DataType::Int64 | - DataType::UInt8 | - DataType::UInt16 | - DataType::UInt32 | - DataType::UInt64 | - DataType::Float16 | - DataType::Float32 | - DataType::Float64 | - DataType::Timestamp(_, _) | - DataType::Date32 | - DataType::Date64 | - DataType::Time32(_) | - DataType::Time64(_) | - DataType::Duration(_) | - DataType::Interval(_) => { - 2 - } - DataType::Binary | - DataType::Utf8 => { - 3 - } - DataType::FixedSizeBinary(_) => { - 2 - } - DataType::List(f) => { - 2 + get_num_buffers(f.data_type()) - } - DataType::FixedSizeList(f, _) => { - 1 + get_num_buffers(f.data_type()) - } - DataType::Struct(fields) => { - 1 + fields.iter().map(|f| get_num_buffers(f.data_type())).sum::() - } + DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Timestamp(_, _) + | DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Interval(_) => 2, + DataType::Binary | DataType::Utf8 => 3, + DataType::FixedSizeBinary(_) => 2, + DataType::List(f) => 2 + get_num_buffers(f.data_type()), + DataType::FixedSizeList(f, _) => 1 + get_num_buffers(f.data_type()), + DataType::Struct(fields) => 1 + fields.iter().map(|f| get_num_buffers(f.data_type())).sum::(), ty => panic!("unsupported arrow data type - {}", ty) } } diff --git a/crates/array/src/visitor.rs b/crates/array/src/visitor.rs index 50addd30..79605742 100644 --- a/crates/array/src/visitor.rs +++ b/crates/array/src/visitor.rs @@ -1,7 +1,13 @@ -use arrow::array::ArrowPrimitiveType; -use arrow::datatypes::{DataType, FieldRef, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type}; use std::sync::Arc; +use arrow::{ + array::ArrowPrimitiveType, + datatypes::{ + DataType, FieldRef, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, + TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, + UInt32Type, UInt64Type, UInt8Type + } +}; pub trait DataTypeVisitor { type Result; @@ -18,7 +24,7 @@ pub trait DataTypeVisitor { TimeUnit::Nanosecond => self.primitive::() } } - + fn binary(&mut self) -> Self::Result; fn fixed_size_binary(&mut self, size: usize) -> Self::Result; @@ -27,11 +33,11 @@ pub trait DataTypeVisitor { fn string(&mut self) -> Self::Result { self.binary() } - + fn list(&mut self, item: &DataType) -> Self::Result; - + fn r#struct(&mut self, fields: &[FieldRef]) -> Self::Result; - + fn visit(&mut self, data_type: &DataType) -> Self::Result { match data_type { DataType::Boolean => self.boolean(), @@ -55,4 +61,4 @@ pub trait DataTypeVisitor { ty => panic!("unsupported arrow type - {}", ty) } } -} \ No newline at end of file +} diff --git a/crates/array/src/writer/any.rs b/crates/array/src/writer/any.rs index b6e6e670..d0b15a59 100644 --- a/crates/array/src/writer/any.rs +++ b/crates/array/src/writer/any.rs @@ -1,24 +1,26 @@ -use crate::util::{get_num_buffers, invalid_buffer_access}; -use crate::visitor::DataTypeVisitor; -use crate::writer::{ArrayWriter, Writer, WriterFactory}; -use arrow::array::ArrowPrimitiveType; -use arrow::datatypes::{DataType, FieldRef, Schema}; +use arrow::{ + array::ArrowPrimitiveType, + datatypes::{DataType, FieldRef, Schema} +}; +use crate::{ + util::{get_num_buffers, invalid_buffer_access}, + visitor::DataTypeVisitor, + writer::{ArrayWriter, Writer, WriterFactory} +}; pub enum AnyWriter { Bitmask(W::Bitmask), Nullmask(W::Nullmask), Native(W::Native), - Offsets(W::Offset), + Offsets(W::Offset) } - pub struct AnyArrayWriter { buffers: Vec> } - -impl ArrayWriter for AnyArrayWriter { +impl ArrayWriter for AnyArrayWriter { type Writer = W; fn bitmask(&mut self, buf: usize) -> &mut W::Bitmask { @@ -50,64 +52,48 @@ impl ArrayWriter for AnyArrayWriter { } } - -impl AnyArrayWriter { +impl AnyArrayWriter { pub fn into_inner(self) -> Vec> { self.buffers } - - pub fn from_factory( - factory: &mut impl WriterFactory, - data_type: &DataType - ) -> anyhow::Result - { + + pub fn from_factory(factory: &mut impl WriterFactory, data_type: &DataType) -> anyhow::Result { let mut buffers = Vec::with_capacity(get_num_buffers(data_type)); - + AnyArrayFactory { buffers: &mut buffers, writer_factory: factory - }.visit(data_type)?; - - Ok(Self { - buffers - }) + } + .visit(data_type)?; + + Ok(Self { buffers }) } - + pub fn table_writer_from_factory( - factory: &mut impl WriterFactory, + factory: &mut impl WriterFactory, schema: &Schema - ) -> anyhow::Result - { - let mut buffers = Vec::with_capacity( - schema.fields() - .iter() - .map(|f| get_num_buffers(f.data_type())) - .sum() - ); - + ) -> anyhow::Result { + let mut buffers = Vec::with_capacity(schema.fields().iter().map(|f| get_num_buffers(f.data_type())).sum()); + let mut wf = AnyArrayFactory { buffers: &mut buffers, writer_factory: factory }; - + for f in schema.fields().iter() { wf.visit(f.data_type())?; } - - Ok(Self { - buffers - }) + + Ok(Self { buffers }) } } - struct AnyArrayFactory<'a, W: Writer, F> { buffers: &'a mut Vec>, - writer_factory: &'a mut F, + writer_factory: &'a mut F } - -impl <'a, W: Writer, F: WriterFactory> DataTypeVisitor for AnyArrayFactory<'a, W, F> { +impl<'a, W: Writer, F: WriterFactory> DataTypeVisitor for AnyArrayFactory<'a, W, F> { type Result = anyhow::Result<()>; fn boolean(&mut self) -> Self::Result { @@ -173,4 +159,4 @@ impl <'a, W: Writer, F: WriterFactory> DataTypeVisitor for AnyArrayFac Ok(()) } -} \ No newline at end of file +} diff --git a/crates/array/src/writer/mod.rs b/crates/array/src/writer/mod.rs index 77f0b5c1..10f01940 100644 --- a/crates/array/src/writer/mod.rs +++ b/crates/array/src/writer/mod.rs @@ -1,37 +1,25 @@ -use crate::index::RangeList; -use crate::offsets::Offsets; use arrow_buffer::{ArrowNativeType, ToByteSlice}; +use crate::{index::RangeList, offsets::Offsets}; mod any; - pub use any::*; - pub trait BitmaskWriter { fn write_slice(&mut self, data: &[u8], offset: usize, len: usize) -> anyhow::Result<()>; - fn write_slice_indexes( - &mut self, - data: &[u8], - indexes: impl Iterator + Clone - ) -> anyhow::Result<()>; + fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()>; - fn write_slice_ranges( - &mut self, - data: &[u8], - ranges: &mut impl RangeList - ) -> anyhow::Result<()>; + fn write_slice_ranges(&mut self, data: &[u8], ranges: &mut impl RangeList) -> anyhow::Result<()>; fn write_many(&mut self, val: bool, count: usize) -> anyhow::Result<()>; } - pub trait NativeWriter { fn write(&mut self, value: T) -> anyhow::Result<()>; - - fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()>; + + fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()>; fn write_slice(&mut self, values: &[T]) -> anyhow::Result<()>; @@ -48,26 +36,17 @@ pub trait NativeWriter { ) -> anyhow::Result<()>; } - pub trait OffsetsWriter { fn write_slice(&mut self, offsets: Offsets<'_>) -> anyhow::Result<()>; - fn write_slice_indexes( - &mut self, - offsets: Offsets<'_>, - indexes: impl Iterator - ) -> anyhow::Result<()>; + fn write_slice_indexes(&mut self, offsets: Offsets<'_>, indexes: impl Iterator) + -> anyhow::Result<()>; - fn write_slice_ranges( - &mut self, - offsets: Offsets<'_>, - ranges: &mut impl RangeList - ) -> anyhow::Result<()>; + fn write_slice_ranges(&mut self, offsets: Offsets<'_>, ranges: &mut impl RangeList) -> anyhow::Result<()>; fn write_len(&mut self, len: usize) -> anyhow::Result<()>; } - pub trait Writer { type Bitmask: BitmaskWriter; type Nullmask: BitmaskWriter; @@ -75,12 +54,11 @@ pub trait Writer { type Offset: OffsetsWriter; } - pub trait ArrayWriter: Sized { type Writer: Writer; fn bitmask(&mut self, buf: usize) -> &mut ::Bitmask; - + fn nullmask(&mut self, buf: usize) -> &mut ::Nullmask; fn native(&mut self, buf: usize) -> &mut ::Native; @@ -89,21 +67,16 @@ pub trait ArrayWriter: Sized { #[inline] fn shift(&mut self, pos: usize) -> impl ArrayWriter + '_ { - ArrayWriterView { - builder: self, - pos - } + ArrayWriterView { builder: self, pos } } } - struct ArrayWriterView<'a, T> { builder: &'a mut T, pos: usize } - -impl <'a, T: ArrayWriter> ArrayWriter for ArrayWriterView<'a, T> { +impl<'a, T: ArrayWriter> ArrayWriter for ArrayWriterView<'a, T> { type Writer = T::Writer; #[inline] @@ -125,7 +98,7 @@ impl <'a, T: ArrayWriter> ArrayWriter for ArrayWriterView<'a, T> { fn offset(&mut self, buf: usize) -> &mut ::Offset { self.builder.offset(self.pos + buf) } - + #[inline] fn shift(&mut self, pos: usize) -> impl ArrayWriter + '_ { ArrayWriterView { @@ -135,7 +108,6 @@ impl <'a, T: ArrayWriter> ArrayWriter for ArrayWriterView<'a, T> { } } - pub trait WriterFactory { type Writer: Writer; @@ -146,4 +118,4 @@ pub trait WriterFactory { fn native(&mut self) -> anyhow::Result<::Native>; fn offset(&mut self) -> anyhow::Result<::Offset>; -} \ No newline at end of file +} diff --git a/crates/array/tests/fixtures.rs b/crates/array/tests/fixtures.rs index ea7b06dc..70b8ad27 100644 --- a/crates/array/tests/fixtures.rs +++ b/crates/array/tests/fixtures.rs @@ -1,29 +1,29 @@ +use std::{fs::File, path::Path}; + use arrow::array::{Array, RecordBatch, StructArray, UInt32Array}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::io::file::ArrayFile; -use sqd_array::reader::ArrayReader; -use sqd_array::slice::{AsSlice, Slice}; -use sqd_array::sort::sort_record_batch; -use std::fs::File; -use std::path::Path; - +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + io::file::ArrayFile, + reader::ArrayReader, + slice::{AsSlice, Slice}, + sort::sort_record_batch +}; #[test] fn test_write_slice_to_builder() -> anyhow::Result<()> { let records = load_parquet_fixture("solana-instructions.parquet")?; let src = StructArray::from(records); - + let mut builder = AnyBuilder::new(src.data_type()); src.as_slice().write(&mut builder)?; let result = builder.finish(); - + assert_eq!(result.to_data(), src.to_data()); - + Ok(()) } - #[test] fn test_write_reversed_slice_to_builder() -> anyhow::Result<()> { let records = load_parquet_fixture("solana-instructions.parquet")?; @@ -33,31 +33,26 @@ fn test_write_reversed_slice_to_builder() -> anyhow::Result<()> { let result = { let mut builder = AnyBuilder::new(src.data_type()); - - src.as_slice().write_indexes( - &mut builder, - indexes.values().iter().map(|i| *i as usize) - )?; - + + src.as_slice() + .write_indexes(&mut builder, indexes.values().iter().map(|i| *i as usize))?; + builder.finish() }; - + assert_eq!(result.to_data(), reference.to_data()); - + Ok(()) } - #[test] fn test_in_memory_sort() -> anyhow::Result<()> { let src = load_parquet_fixture("solana-instructions.parquet")?; - - let result = sort_record_batch(&src, [ - "program_id", - "block_number", - "transaction_index", - "instruction_address" - ])?; + + let result = sort_record_batch( + &src, + ["program_id", "block_number", "transaction_index", "instruction_address"] + )?; let reference = load_parquet_fixture("solana-instructions.sorted.parquet")?; @@ -66,38 +61,36 @@ fn test_in_memory_sort() -> anyhow::Result<()> { Ok(()) } - #[test] fn test_array_file_write_read() -> anyhow::Result<()> { let records = load_parquet_fixture("solana-instructions.parquet")?; let src = StructArray::from(records); - + let file = { let mut writer = ArrayFile::new_temporary(src.data_type().clone())?.write()?; src.as_slice().write(&mut writer)?; writer.finish()? }; - + let result = { let mut builder = AnyBuilder::new(src.data_type()); file.read()?.read(&mut builder)?; builder.finish() }; - + assert_eq!(result.to_data(), src.to_data()); - + Ok(()) } - fn load_parquet_fixture(name: &str) -> anyhow::Result { let path = Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures").join(name); let file = File::open(path)?; - + let mut reader = ParquetRecordBatchReaderBuilder::try_new(file)? .with_batch_size(1_000_000) .build()?; - + let record_batch = reader.next().unwrap()?; Ok(record_batch) -} \ No newline at end of file +} diff --git a/crates/array/tests/primitives.rs b/crates/array/tests/primitives.rs index a40e5b70..36ce7681 100644 --- a/crates/array/tests/primitives.rs +++ b/crates/array/tests/primitives.rs @@ -1,39 +1,37 @@ -use arrow_buffer::ToByteSlice; -use sqd_array::builder::offsets::OffsetsBuilder; -use sqd_array::io::reader::{IOByteReader, OffsetsIOReader}; -use sqd_array::reader::OffsetsReader; use std::io::Cursor; -use sqd_array::io::writer::OffsetsIOWriter; -use sqd_array::offsets::Offsets; -use sqd_array::writer::OffsetsWriter; +use arrow_buffer::ToByteSlice; +use sqd_array::{ + builder::offsets::OffsetsBuilder, + io::{ + reader::{IOByteReader, OffsetsIOReader}, + writer::OffsetsIOWriter + }, + offsets::Offsets, + reader::OffsetsReader, + writer::OffsetsWriter +}; #[test] fn test_io_offsets_read_all() -> anyhow::Result<()> { let offsets: Vec = (0..20001).collect(); let cursor = Cursor::new(offsets.to_byte_slice()); - let mut reader = OffsetsIOReader::new( - IOByteReader::new( - offsets.to_byte_slice().len(), - cursor - ) - )?; + let mut reader = OffsetsIOReader::new(IOByteReader::new(offsets.to_byte_slice().len(), cursor))?; let mut builder = OffsetsBuilder::new(offsets.len()); let range = reader.read_slice(&mut builder, 0, offsets.len() - 1)?; - + assert_eq!(range, 0..20000); assert_eq!(offsets.as_slice(), builder.finish().as_ref()); - + Ok(()) } - #[test] fn test_io_offsets_write_all() -> anyhow::Result<()> { let offsets: Vec = (0..20001).collect(); - + let data = { let mut writer = OffsetsIOWriter::new(Vec::new()); writer.write_slice(Offsets::new(&offsets))?; @@ -41,6 +39,6 @@ fn test_io_offsets_write_all() -> anyhow::Result<()> { }; assert_eq!(data.as_slice(), offsets.to_byte_slice()); - + Ok(()) -} \ No newline at end of file +} diff --git a/crates/bds/src/block.rs b/crates/bds/src/block.rs index 77405c24..9c7f1aff 100644 --- a/crates/bds/src/block.rs +++ b/crates/bds/src/block.rs @@ -1,12 +1,13 @@ -use sqd_primitives::BlockNumber; -use std::borrow::{Borrow, Cow}; -use std::ops::Range; -use std::sync::Arc; +use std::{ + borrow::{Borrow, Cow}, + ops::Range, + sync::Arc +}; +use sqd_primitives::BlockNumber; pub type BlockRange = Range; - #[derive(Clone, Debug)] pub struct BlockHeader<'a> { pub number: BlockNumber, @@ -17,19 +18,15 @@ pub struct BlockHeader<'a> { pub is_final: bool } - #[derive(Clone, Debug)] pub struct Block<'a> { pub header: BlockHeader<'a>, pub data: Cow<'a, [u8]> } - - pub type BlockArc = Arc>; - -impl <'a> sqd_primitives::Block for Block<'a> { +impl<'a> sqd_primitives::Block for Block<'a> { fn number(&self) -> BlockNumber { self.header.number } @@ -51,7 +48,6 @@ impl <'a> sqd_primitives::Block for Block<'a> { } } - impl<'a> sqd_primitives::Block for BlockHeader<'a> { fn number(&self) -> BlockNumber { self.number @@ -74,7 +70,6 @@ impl<'a> sqd_primitives::Block for BlockHeader<'a> { } } - impl<'a> BlockHeader<'a> { pub fn to_static(&self) -> BlockHeader<'static> { BlockHeader { @@ -86,4 +81,4 @@ impl<'a> BlockHeader<'a> { is_final: self.is_final } } -} \ No newline at end of file +} diff --git a/crates/bds/src/cassandra/block_batch.rs b/crates/bds/src/cassandra/block_batch.rs index 2f6f9a1d..13c3a633 100644 --- a/crates/bds/src/cassandra/block_batch.rs +++ b/crates/bds/src/cassandra/block_batch.rs @@ -1,6 +1,6 @@ -use super::row_batch::{Row, RowBatch}; use sqd_primitives::BlockNumber; +use super::row_batch::{Row, RowBatch}; pub struct BlockBatch { inner: RowBatch, @@ -8,8 +8,7 @@ pub struct BlockBatch { partition_end: BlockNumber } - -impl BlockBatch { +impl BlockBatch { pub(super) fn new(inner: RowBatch, partition_start: BlockNumber, partition_end: BlockNumber) -> Self { Self { inner, @@ -17,11 +16,11 @@ impl BlockBatch { partition_end } } - + pub fn blocks(&self) -> &[B::Type<'_>] { self.inner.items() } - + pub fn partition_start(&self) -> BlockNumber { self.partition_start } @@ -29,4 +28,4 @@ impl BlockBatch { pub fn partition_end(&self) -> BlockNumber { self.partition_end } -} \ No newline at end of file +} diff --git a/crates/bds/src/cassandra/ingest.rs b/crates/bds/src/cassandra/ingest.rs index ea7e1ca1..dc0f16f9 100644 --- a/crates/bds/src/cassandra/ingest.rs +++ b/crates/bds/src/cassandra/ingest.rs @@ -1,12 +1,11 @@ -use super::store::CassandraStorage; -use crate::block::BlockArc; -use crate::chain::HeadChain; -use crate::ingest::Store; +use std::pin::pin; + use anyhow::{bail, ensure}; use futures::TryStreamExt; use sqd_primitives::{Block, BlockNumber, BlockPtr, BlockRef}; -use std::pin::pin; +use super::store::CassandraStorage; +use crate::{block::BlockArc, chain::HeadChain, ingest::Store}; impl Store for CassandraStorage { type Block = BlockArc; @@ -15,14 +14,10 @@ impl Store for CassandraStorage { let states = self.fetch_write_states().await?; if !states.iter().any(|s| s.head.number >= first_block) { - return Ok(HeadChain::empty()) + return Ok(HeadChain::empty()); } - let last_block = states.into_iter() - .max_by_key(|s| s.head.number) - .unwrap() - .head - .number; + let last_block = states.into_iter().max_by_key(|s| s.head.number).unwrap().head.number; if let Some(parent_hash) = parent_hash { validate_chain_base(self, parent_hash, first_block, last_block).await?; @@ -45,23 +40,21 @@ impl Store for CassandraStorage { } } - async fn validate_chain_base( store: &CassandraStorage, parent_hash: &str, first_block: BlockNumber, last_block: BlockNumber -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let mut stream = store.list_blocks(first_block, last_block); let mut stream_pin = pin!(stream); 'L: while let Some(batch) = stream_pin.try_next().await? { for b in batch.blocks() { if b.parent_number >= first_block { - break 'L + break 'L; } if b.parent_hash == parent_hash { - return Ok(()) + return Ok(()); } } } @@ -72,14 +65,12 @@ async fn validate_chain_base( ); } - async fn build_chain( store: &CassandraStorage, first_block: BlockNumber, last_block: BlockNumber, parent_hash: Option<&str> -) -> anyhow::Result -{ +) -> anyhow::Result { let mut stream = pin! { store.list_blocks_in_reversed_order(first_block, last_block) }; @@ -94,20 +85,20 @@ async fn build_chain( chain.blocks.push(b.ptr().to_ref()); if b.is_final { chain.first_finalized = true; - break 'L + break 'L; } } } if batch.partition_end() < expected.number { bail!( - "block {} is missing in the database, while the above block is present", + "block {} is missing in the database, while the above block is present", expected ); } } if chain.blocks.is_empty() { - return Ok(chain) + return Ok(chain); } chain.blocks.reverse(); @@ -117,12 +108,12 @@ async fn build_chain( "block {} is missing in the database, while the above block is present", expected ); - + if let Some(parent_hash) = parent_hash { if expected.number < first_block { ensure!( expected.hash == parent_hash, - "the highest available chain {} is not based on block with hash {}", + "the highest available chain {} is not based on block with hash {}", chain.blocks.last().unwrap(), parent_hash ); @@ -130,4 +121,4 @@ async fn build_chain( } Ok(chain) -} \ No newline at end of file +} diff --git a/crates/bds/src/cassandra/mod.rs b/crates/bds/src/cassandra/mod.rs index 2cd969b0..1d65ea8e 100644 --- a/crates/bds/src/cassandra/mod.rs +++ b/crates/bds/src/cassandra/mod.rs @@ -4,6 +4,5 @@ mod row_batch; mod store; mod types; - +pub use block_batch::BlockBatch; pub use store::*; -pub use block_batch::BlockBatch; \ No newline at end of file diff --git a/crates/bds/src/cassandra/row_batch.rs b/crates/bds/src/cassandra/row_batch.rs index f3e175cb..5dbd9e44 100644 --- a/crates/bds/src/cassandra/row_batch.rs +++ b/crates/bds/src/cassandra/row_batch.rs @@ -1,8 +1,7 @@ -use ouroboros::self_referencing; -use scylla::_macro_internal::DeserializeRow; -use scylla::response::query_result::QueryRowsResult; use std::marker::PhantomData; +use ouroboros::self_referencing; +use scylla::{_macro_internal::DeserializeRow, response::query_result::QueryRowsResult}; pub trait Row: 'static { type Type<'a>; @@ -13,12 +12,10 @@ pub trait Row: 'static { fn reborrow<'a, 'this>(slice: &'a [Self::Type<'this>]) -> &'a [Self::Type<'a>]; } - pub struct RowBatch { inner: RowBatchInner } - #[self_referencing] struct RowBatchInner { rows: QueryRowsResult, @@ -28,23 +25,19 @@ struct RowBatchInner { phantom_data: PhantomData } - impl RowBatch { pub fn new(rows: QueryRowsResult) -> anyhow::Result { Ok(Self { inner: RowBatchInnerTryBuilder { rows, - items_builder: |rows| { - rows.rows::<'_, R::Tuple<'_>>()?.map(|row| { - R::convert(row?) - }).collect() - }, + items_builder: |rows| rows.rows::<'_, R::Tuple<'_>>()?.map(|row| R::convert(row?)).collect(), phantom_data: PhantomData::default() - }.try_build()? + } + .try_build()? }) } - + pub fn items(&self) -> &[R::Type<'_>] { self.inner.with_items(|items| R::reborrow(items)) } -} \ No newline at end of file +} diff --git a/crates/bds/src/cassandra/store.rs b/crates/bds/src/cassandra/store.rs index be12283b..c1adfc87 100644 --- a/crates/bds/src/cassandra/store.rs +++ b/crates/bds/src/cassandra/store.rs @@ -1,28 +1,27 @@ -use super::row_batch::{Row, RowBatch}; -use crate::block::{Block, BlockHeader}; -use crate::cassandra::types::WriteState; -use crate::cassandra::BlockBatch; -use anyhow::{anyhow, bail, Context}; +use std::sync::Arc; + +use anyhow::{Context, anyhow, bail}; use async_stream::try_stream; -use futures::future::BoxFuture; -use futures::{FutureExt, Stream, StreamExt, TryStreamExt}; -use scylla::client::session::Session; -use scylla::response::query_result::QueryRowsResult; -use scylla::response::{PagingState, PagingStateResponse}; -use scylla::statement::batch::Batch; -use scylla::statement::prepared::PreparedStatement; -use scylla::statement::Consistency; +use futures::{FutureExt, Stream, StreamExt, TryStreamExt, future::BoxFuture}; +use scylla::{ + client::session::Session, + response::{PagingState, PagingStateResponse, query_result::QueryRowsResult}, + statement::{Consistency, batch::Batch, prepared::PreparedStatement} +}; use sqd_primitives::{BlockNumber, BlockRef}; -use std::sync::Arc; use uuid::Uuid; +use super::row_batch::{Row, RowBatch}; +use crate::{ + block::{Block, BlockHeader}, + cassandra::{BlockBatch, types::WriteState} +}; #[derive(Clone)] pub struct CassandraStorage { inner: Arc } - struct Inner { session: Arc, update_statement: PreparedStatement, @@ -38,7 +37,6 @@ struct Inner { id: Uuid } - impl CassandraStorage { pub async fn new(session: Arc, keyspace: &str) -> anyhow::Result { let partition_size = 10; @@ -49,15 +47,19 @@ impl CassandraStorage { )).await?; update_statement.set_consistency(Consistency::One); // FIXME - let mut delete_statement = session.prepare(format!( - "DELETE FROM {}.blocks WHERE partition = ? AND number = ? AND hash = ?", - keyspace - )).await?; + let mut delete_statement = session + .prepare(format!( + "DELETE FROM {}.blocks WHERE partition = ? AND number = ? AND hash = ?", + keyspace + )) + .await?; - let mut finalize_statement = session.prepare(format!( - "UPDATE {}.blocks SET is_final = true WHERE partition = ? AND number = ? AND hash = ?", - keyspace - )).await?; + let mut finalize_statement = session + .prepare(format!( + "UPDATE {}.blocks SET is_final = true WHERE partition = ? AND number = ? AND hash = ?", + keyspace + )) + .await?; let mut fetch_statement = session.prepare(format!( "SELECT number, hash, parent_number, parent_hash, timestamp, is_final, data FROM {}.blocks WHERE partition = ? AND number >= ? AND number <= ?", @@ -83,27 +85,33 @@ impl CassandraStorage { reversed_list_statement.set_is_idempotent(true); reversed_list_statement.set_page_size((partition_size * 10) as i32); - let mut fetch_write_states_statement = session.prepare(format!( - "SELECT id, head_number, head_hash, finalized_head_number, finalized_head_hash \ + let mut fetch_write_states_statement = session + .prepare(format!( + "SELECT id, head_number, head_hash, finalized_head_number, finalized_head_hash \ FROM {}.writers WHERE dummy_partition = 0", - keyspace - )).await?; + keyspace + )) + .await?; fetch_write_states_statement.set_consistency(Consistency::One); fetch_write_states_statement.set_is_idempotent(true); - let mut set_write_head_statement = session.prepare(format!( - "UPDATE {}.writers SET head_number = ?, head_hash = ? \ + let mut set_write_head_statement = session + .prepare(format!( + "UPDATE {}.writers SET head_number = ?, head_hash = ? \ WHERE dummy_partition = 0 AND id = ?", - keyspace - )).await?; + keyspace + )) + .await?; // use Consistency::One, because it is ok to miss few blocks set_write_head_statement.set_consistency(Consistency::One); - let mut set_write_finalized_head_statement = session.prepare(format!( - "UPDATE {}.writers SET finalized_head_number = ?, finalized_head_hash = ? \ + let mut set_write_finalized_head_statement = session + .prepare(format!( + "UPDATE {}.writers SET finalized_head_number = ?, finalized_head_hash = ? \ WHERE dummy_partition = 0 AND id = ?", - keyspace - )).await?; + keyspace + )) + .await?; // use Consistency::One, because it is ok to miss few blocks set_write_finalized_head_statement.set_consistency(Consistency::One); @@ -122,65 +130,66 @@ impl CassandraStorage { id: Uuid::now_v7() }; - Ok(Self { - inner: Arc::new(inner) - }) + Ok(Self { inner: Arc::new(inner) }) } pub async fn save_block(&self, block: &Block<'_>) -> anyhow::Result<()> { - self.inner.session.execute_unpaged(&self.inner.update_statement, ( - block.header.parent_number as i64, - block.header.parent_hash.as_ref(), - block.header.timestamp, - block.data.as_ref(), - get_partition(self.inner.partition_size, block.header.number) as i64, - block.header.number as i64, - block.header.hash.as_ref() - )).await?; + self.inner + .session + .execute_unpaged( + &self.inner.update_statement, + ( + block.header.parent_number as i64, + block.header.parent_hash.as_ref(), + block.header.timestamp, + block.data.as_ref(), + get_partition(self.inner.partition_size, block.header.number) as i64, + block.header.number as i64, + block.header.hash.as_ref() + ) + ) + .await?; Ok(()) } pub async fn set_head(&self, number: BlockNumber, hash: &str) -> anyhow::Result<()> { - self.inner.session.execute_unpaged(&self.inner.set_write_head_statement, ( - number as i64, - hash, - self.inner.id - )).await?; + self.inner + .session + .execute_unpaged( + &self.inner.set_write_head_statement, + (number as i64, hash, self.inner.id) + ) + .await?; Ok(()) } pub async fn set_finalized_head(&self, number: BlockNumber, hash: &str) -> anyhow::Result<()> { - self.inner.session.execute_unpaged(&self.inner.set_write_finalized_head_statement, ( - number as i64, - hash, - self.inner.id - )).await?; + self.inner + .session + .execute_unpaged( + &self.inner.set_write_finalized_head_statement, + (number as i64, hash, self.inner.id) + ) + .await?; Ok(()) } pub async fn fetch_write_states(&self) -> anyhow::Result> { - let res = self.inner.session.execute_unpaged(&self.inner.fetch_write_states_statement, ()).await?; + let res = self + .inner + .session + .execute_unpaged(&self.inner.fetch_write_states_statement, ()) + .await?; let rows = res.into_rows_result()?; - let rows = rows.rows::<( - Uuid, - Option, - Option, - Option, - Option - )>()?; + let rows = rows.rows::<(Uuid, Option, Option, Option, Option)>()?; fn into_block_ref(number: Option, hash: Option) -> anyhow::Result> { match (number, hash) { (Some(number), Some(hash)) => { - let number: BlockNumber = number - .try_into() - .context("invalid block number")?; - Ok(Some(BlockRef { - number, - hash - })) - }, + let number: BlockNumber = number.try_into().context("invalid block number")?; + Ok(Some(BlockRef { number, hash })) + } (Some(_), None) => bail!("block number is present, but hash is not"), (None, Some(_)) => bail!("block hash is present, but block number is not"), (None, None) => Ok(None) @@ -190,28 +199,25 @@ impl CassandraStorage { rows.map(|row_result| { let (id, head_number, head_hash, finalized_head_number, finalized_head_hash) = row_result?; - let mut head = into_block_ref(head_number, head_hash).with_context(|| { - format!("invalid head in a write state {}", id) - })?; + let mut head = into_block_ref(head_number, head_hash) + .with_context(|| format!("invalid head in a write state {}", id))?; - let finalized_head = into_block_ref(finalized_head_number, finalized_head_hash).with_context(|| { - format!("invalid finalized head in a write state {}", id) - })?; + let finalized_head = into_block_ref(finalized_head_number, finalized_head_hash) + .with_context(|| format!("invalid finalized head in a write state {}", id))?; if head.is_none() { head = finalized_head.clone(); } - let head = head.ok_or_else(|| { - anyhow!("write state {} has no heads", id) - })?; + let head = head.ok_or_else(|| anyhow!("write state {} has no heads", id))?; Ok(WriteState { id, head, finalized_head }) - }).collect() + }) + .collect() } pub async fn finalize(&self, from: BlockNumber, to: BlockRef) -> anyhow::Result<()> { @@ -219,46 +225,41 @@ impl CassandraStorage { let top = to.number; - self.list_blocks_in_reversed_order(from, top).try_fold(to, |mut expected, par| async move { - let mut batch = Batch::default(); - let mut values = Vec::new(); - - for b in par.blocks() { - if b.ptr() == expected.ptr() { - expected.set_ptr(b.parent_ptr()); - batch.append_statement(self.inner.finalize_statement.clone()); - values.push(( - par.partition_start() as i64, - b.number as i64, - b.hash() - )); - } else { - batch.append_statement(self.inner.delete_statement.clone()); - values.push(( - par.partition_start() as i64, - b.number as i64, - b.hash() - )); + self.list_blocks_in_reversed_order(from, top) + .try_fold(to, |mut expected, par| async move { + let mut batch = Batch::default(); + let mut values = Vec::new(); + + for b in par.blocks() { + if b.ptr() == expected.ptr() { + expected.set_ptr(b.parent_ptr()); + batch.append_statement(self.inner.finalize_statement.clone()); + values.push((par.partition_start() as i64, b.number as i64, b.hash())); + } else { + batch.append_statement(self.inner.delete_statement.clone()); + values.push((par.partition_start() as i64, b.number as i64, b.hash())); + } } - } - // The check below is sloppy, because in theory partition could be paginated, - // however, we set a large page size and ignore such possibility. - // - // We could replace it with `par.partition_end() < expected.number`, - // but that would open a possibility of block deletions - // while the presence of expected blocks have not been checked yet. - if par.partition_start() <= expected.number && top <= expected.number { - bail!("block {} is missing", expected) - } + // The check below is sloppy, because in theory partition could be paginated, + // however, we set a large page size and ignore such possibility. + // + // We could replace it with `par.partition_end() < expected.number`, + // but that would open a possibility of block deletions + // while the presence of expected blocks have not been checked yet. + if par.partition_start() <= expected.number && top <= expected.number { + bail!("block {} is missing", expected) + } - if !values.is_empty() { - batch.set_consistency(Consistency::One); // FIXME - self.inner.session.batch(&batch, values).await?; - } + if !values.is_empty() { + batch.set_consistency(Consistency::One); // FIXME + self.inner.session.batch(&batch, values).await?; + } - Ok(expected) - }).boxed().await?; + Ok(expected) + }) + .boxed() + .await?; Ok(()) } @@ -267,42 +268,24 @@ impl CassandraStorage { &self, first_block: BlockNumber, last_block: BlockNumber - ) -> impl Stream>>> - { - self.execute_block_list( - &|inner| &inner.fetch_statement, - first_block, - last_block, - false - ) + ) -> impl Stream>>> { + self.execute_block_list(&|inner| &inner.fetch_statement, first_block, last_block, false) } pub fn list_blocks( &self, first_block: BlockNumber, - last_block: BlockNumber, - ) -> impl Stream>>> - { - self.execute_block_list( - &|inner| &inner.list_statement, - first_block, - last_block, - false - ) + last_block: BlockNumber + ) -> impl Stream>>> { + self.execute_block_list(&|inner| &inner.list_statement, first_block, last_block, false) } pub fn list_blocks_in_reversed_order( &self, first_block: BlockNumber, - last_block: BlockNumber, - ) -> impl Stream>>> - { - self.execute_block_list( - &|inner| &inner.reversed_list_statement, - first_block, - last_block, - true - ) + last_block: BlockNumber + ) -> impl Stream>>> { + self.execute_block_list(&|inner| &inner.reversed_list_statement, first_block, last_block, true) } fn execute_block_list( @@ -311,22 +294,13 @@ impl CassandraStorage { first_block: BlockNumber, last_block: BlockNumber, reverse_partitions: bool - ) -> impl Stream>> - { - self.execute_untyped_block_list( - statement, - first_block, - last_block, - reverse_partitions - ).map(|rows| { - let (rows, partition_start, partition_end) = rows?; - let batch = RowBatch::new(rows)?; - Ok(BlockBatch::new( - batch, - partition_start, - partition_end - )) - }) + ) -> impl Stream>> { + self.execute_untyped_block_list(statement, first_block, last_block, reverse_partitions) + .map(|rows| { + let (rows, partition_start, partition_end) = rows?; + let batch = RowBatch::new(rows)?; + Ok(BlockBatch::new(batch, partition_start, partition_end)) + }) } fn execute_untyped_block_list( @@ -335,8 +309,7 @@ impl CassandraStorage { first_block: BlockNumber, last_block: BlockNumber, reverse_partitions: bool - ) -> impl Stream> - { + ) -> impl Stream> { let inner = self.inner.clone(); try_stream! { for r in split_into_partitions(reverse_partitions, inner.partition_size, first_block, last_block) { @@ -369,23 +342,20 @@ impl CassandraStorage { } } - fn get_partition(partition_size: u64, block_number: BlockNumber) -> u64 { (block_number / partition_size) * partition_size } - fn split_into_partitions( reverse: bool, partition_size: u64, mut first_block: BlockNumber, mut last_block: BlockNumber -) -> impl Iterator -{ +) -> impl Iterator { let mut range = first_block..last_block + 1; std::iter::from_fn(move || { if range.is_empty() { - return None + return None; } if reverse { let start = std::cmp::max(range.start, get_partition(partition_size, range.end - 1)); @@ -401,11 +371,11 @@ fn split_into_partitions( }) } - fn execute_batch() -> BoxFuture<'static, anyhow::Result<()>> { async move { // let _ = session; // session.batch(&batch, values).await?; Ok(()) - }.boxed() -} \ No newline at end of file + } + .boxed() +} diff --git a/crates/bds/src/cassandra/types.rs b/crates/bds/src/cassandra/types.rs index 58a1bfab..cb748c82 100644 --- a/crates/bds/src/cassandra/types.rs +++ b/crates/bds/src/cassandra/types.rs @@ -1,20 +1,20 @@ -use super::row_batch::Row; -use crate::block::{Block, BlockHeader}; use anyhow::Context; use sqd_primitives::{BlockNumber, BlockRef}; use uuid::Uuid; +use super::row_batch::Row; +use crate::block::{Block, BlockHeader}; impl Row for BlockHeader<'static> { type Type<'a> = BlockHeader<'a>; - + type Tuple<'a> = ( - i64, // number - &'a str, // hash - i64, // parent_number - &'a str, // parent_hash - Option, // block_timestamp, - Option // is_final + i64, // number + &'a str, // hash + i64, // parent_number + &'a str, // parent_hash + Option, // block_timestamp, + Option // is_final ); fn convert(row: Self::Tuple<'_>) -> anyhow::Result> { @@ -35,10 +35,9 @@ impl Row for BlockHeader<'static> { } } - impl Row for Block<'static> { type Type<'a> = Block<'a>; - + type Tuple<'a> = ( as Row>::Tuple<'a>, &'a [u8]); fn convert(row: Self::Tuple<'_>) -> anyhow::Result> { @@ -54,9 +53,8 @@ impl Row for Block<'static> { } } - pub struct WriteState { pub id: Uuid, pub head: BlockRef, pub finalized_head: Option -} \ No newline at end of file +} diff --git a/crates/bds/src/chain.rs b/crates/bds/src/chain.rs index a8482b98..8bb6d2b0 100644 --- a/crates/bds/src/chain.rs +++ b/crates/bds/src/chain.rs @@ -1,19 +1,18 @@ -use crate::util::{bisect, compute_fork_base}; +use std::{collections::VecDeque, ops::Index}; + use anyhow::{anyhow, bail, ensure}; use sqd_primitives::{Block, BlockNumber, BlockPtr, BlockRef}; -use std::collections::VecDeque; -use std::ops::Index; +use crate::util::{bisect, compute_fork_base}; /// Last blocks of a chain, starting from the highest finalized block #[derive(Debug)] pub struct HeadChain { pub blocks: Vec, - /// indicates, that the first block in `self.blocks` is final + /// indicates, that the first block in `self.blocks` is final pub first_finalized: bool } - impl HeadChain { pub fn empty() -> Self { Self { @@ -23,17 +22,15 @@ impl HeadChain { } } - pub struct Chain { chain: VecDeque, fin: bool, blocks: VecDeque, stored: VecDeque, - min_size: usize, + min_size: usize } - -impl Chain { +impl Chain { pub fn new(chain: HeadChain, min_size: usize) -> Self { assert!(!(chain.first_finalized && chain.blocks.is_empty())); Self { @@ -48,19 +45,19 @@ impl Chain { pub fn base(&self) -> Option> { self.first().map(|b| b.parent_ptr()) } - + pub fn first(&self) -> Option<&B> { self.blocks.get(0) } - pub fn iter(&self) -> impl Iterator + DoubleEndedIterator { + pub fn iter(&self) -> impl Iterator + DoubleEndedIterator { self.blocks.iter() } - + pub fn bisect(&self, block_number: BlockNumber) -> usize { bisect(self.blocks.len(), &self.blocks, block_number) } - + pub fn is_stored(&self, idx: usize) -> bool { self.stored[idx] } @@ -73,7 +70,7 @@ impl Chain { self.stored.push_back(false); return Ok(()); }; - + if prev.number == block.parent_number() && prev.hash == block.parent_hash() { self.chain.push_back(block.ptr().to_ref()); self.blocks.push_back(block); @@ -106,22 +103,22 @@ impl Chain { self.stored.pop_back(); } } - + pub fn mark_stored(&mut self, number: BlockNumber, hash: &str) -> bool { let pos = self.bisect(number); - + let Some(block) = self.blocks.get(pos) else { - return false + return false; }; - + if block.number() != number || block.hash() != hash { - return false + return false; } - + self.stored[pos] = true; true } - + pub fn clean(&mut self) -> bool { let mut dropped = false; for _ in 0..self.len().saturating_sub(self.min_size) { @@ -133,7 +130,7 @@ impl Chain { } dropped = true; } else { - break + break; } } dropped @@ -150,7 +147,7 @@ impl Chain { Ok(true) } else { Ok(false) - } + }; } let pos = bisect(self.chain.len(), &self.chain, head.number); @@ -160,7 +157,7 @@ impl Chain { for _ in 0..pos { self.chain.pop_front(); } - return Ok(!fin || pos > 0) + return Ok(!fin || pos > 0); } bail!( @@ -172,10 +169,10 @@ impl Chain { pub fn finalize_all(&mut self) { match self.chain.len() { - 0 => {}, + 0 => {} 1 => { self.fin = true; - }, + } _ => { let head = self.chain.pop_back().unwrap(); self.chain.clear(); @@ -191,18 +188,18 @@ impl Chain { Some(i) => Some(self.blocks[i].parent_ptr()) } } - + pub fn stored_finalized_head(&self) -> Option> { if !self.fin { - return None + return None; } if self.chain.len() > self.blocks.len() { - return Some(self.chain[0].ptr()) + return Some(self.chain[0].ptr()); } for i in (0..=self.blocks.len() - self.chain.len()).rev() { if self.stored[i] { - return Some(self.blocks[i].ptr()) - } + return Some(self.blocks[i].ptr()); + } } None } @@ -230,7 +227,6 @@ impl Chain { } } - impl Index for Chain { type Output = B; @@ -238,4 +234,4 @@ impl Index for Chain { fn index(&self, index: usize) -> &Self::Output { self.blocks.index(index) } -} \ No newline at end of file +} diff --git a/crates/bds/src/chain_watch.rs b/crates/bds/src/chain_watch.rs index 8fdc8bfd..9ae6a69a 100644 --- a/crates/bds/src/chain_watch.rs +++ b/crates/bds/src/chain_watch.rs @@ -1,17 +1,15 @@ -use crate::chain::{Chain, HeadChain}; use sqd_primitives::{Block, BlockNumber, BlockPtr}; +use crate::chain::{Chain, HeadChain}; pub type ChainReceiver = tokio::sync::watch::Receiver>; - #[derive(Clone)] pub struct ChainSender { inner: tokio::sync::watch::Sender>, max_size: usize } - impl ChainSender { pub fn new(head_chain: HeadChain, min_size: usize, max_size: usize) -> Self { Self { @@ -29,14 +27,13 @@ impl ChainSender { } pub fn mark_stored(&self, number: BlockNumber, hash: &str) { - self.inner.send_if_modified(|chain| { - chain.mark_stored(number, hash) && chain.clean() - }); + self.inner + .send_if_modified(|chain| chain.mark_stored(number, hash) && chain.clean()); } pub fn finalize(&self, head: BlockPtr) -> anyhow::Result<()> { let mut res = Ok(false); - self.inner.send_if_modified(|chain| { + self.inner.send_if_modified(|chain| { res = chain.finalize(head); res.as_ref().map_or(false, |changed| *changed) }); @@ -62,4 +59,4 @@ impl ChainSender { recv.changed().await.expect("sender cannot be dropped"); } } -} \ No newline at end of file +} diff --git a/crates/bds/src/cmd/ingest.rs b/crates/bds/src/cmd/ingest.rs index f7c5549f..b5506238 100644 --- a/crates/bds/src/cmd/ingest.rs +++ b/crates/bds/src/cmd/ingest.rs @@ -1,14 +1,12 @@ -use crate::block::BlockArc; -use crate::cassandra::CassandraStorage; -use crate::data_source::create_data_source; -use crate::ingest::Ingest; +use std::sync::Arc; + use anyhow::Context; use sqd_data_client::reqwest::ReqwestDataClient; use sqd_primitives::BlockNumber; -use std::sync::Arc; use tracing::debug; use url::Url; +use crate::{block::BlockArc, cassandra::CassandraStorage, data_source::create_data_source, ingest::Ingest}; #[derive(clap::Args)] pub struct Args { @@ -28,7 +26,6 @@ pub struct Args { pub parent_block_hash: Option } - pub fn run(args: Args) -> anyhow::Result<()> { tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -36,7 +33,6 @@ pub fn run(args: Args) -> anyhow::Result<()> { .block_on(run_async(args)) } - async fn run_async(args: Args) -> anyhow::Result<()> { let cassandra_session = { use scylla::client::session_builder::SessionBuilder; @@ -50,16 +46,11 @@ async fn run_async(args: Args) -> anyhow::Result<()> { Arc::new(session) }; - let storage = CassandraStorage::new( - cassandra_session, - &args.cassandra_keyspace - ).await?; + let storage = CassandraStorage::new(cassandra_session, &args.cassandra_keyspace).await?; debug!("cassandra storage initialized"); - let data_source = create_data_source( - args.data_source.into_iter().map(ReqwestDataClient::from_url).collect() - ); + let data_source = create_data_source(args.data_source.into_iter().map(ReqwestDataClient::from_url).collect()); let (_, handle) = Ingest::new(storage, data_source) .set_first_block(args.first_block) @@ -68,4 +59,4 @@ async fn run_async(args: Args) -> anyhow::Result<()> { .await?; handle.await -} \ No newline at end of file +} diff --git a/crates/bds/src/cmd/mod.rs b/crates/bds/src/cmd/mod.rs index 39b750a9..704886a4 100644 --- a/crates/bds/src/cmd/mod.rs +++ b/crates/bds/src/cmd/mod.rs @@ -1,2 +1,2 @@ pub mod ingest; -mod util; \ No newline at end of file +mod util; diff --git a/crates/bds/src/cmd/util.rs b/crates/bds/src/cmd/util.rs index df5b8441..f29f3e99 100644 --- a/crates/bds/src/cmd/util.rs +++ b/crates/bds/src/cmd/util.rs @@ -1,11 +1,8 @@ use tokio::signal; - pub async fn shutdown_signal() { let ctrl_c = async { - signal::ctrl_c() - .await - .expect("failed to install Ctrl+C handler"); + signal::ctrl_c().await.expect("failed to install Ctrl+C handler"); }; #[cfg(unix)] @@ -23,4 +20,4 @@ pub async fn shutdown_signal() { _ = ctrl_c => {}, _ = terminate => {}, } -} \ No newline at end of file +} diff --git a/crates/bds/src/data_source.rs b/crates/bds/src/data_source.rs index 6d389e8d..f42b3feb 100644 --- a/crates/bds/src/data_source.rs +++ b/crates/bds/src/data_source.rs @@ -1,4 +1,5 @@ -use crate::block::{Block, BlockArc, BlockHeader}; +use std::{io::Write, sync::Arc}; + use anyhow::ensure; use bytes::Bytes; use futures::StreamExt; @@ -6,16 +7,14 @@ use serde::Deserialize; use sqd_data_client::reqwest::ReqwestDataClient; use sqd_data_source::{DataSource, MappedDataSource, StandardDataSource}; use sqd_primitives::BlockNumber; -use std::io::Write; -use std::sync::Arc; +use crate::block::{Block, BlockArc, BlockHeader}; #[derive(Deserialize)] struct JsonBlock { header: JsonBlockHeader } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] struct JsonBlockHeader { @@ -26,46 +25,40 @@ struct JsonBlockHeader { timestamp: Option } - - #[derive(Debug)] struct ParsedBlock { pub header: BlockHeader<'static>, pub data: Bytes } - - fn parse_block(bytes: Bytes) -> anyhow::Result { let json: JsonBlock = serde_json::from_slice(&bytes)?; - + if let Some(parent_number) = json.header.parent_number { - ensure!(parent_number < json.header.number || json.header.number == 0); + ensure!(parent_number < json.header.number || json.header.number == 0); } - + let header = BlockHeader::<'static> { number: json.header.number, hash: json.header.hash.into(), - parent_number: json.header.parent_number.unwrap_or_else(|| json.header.number.saturating_sub(1)), + parent_number: json + .header + .parent_number + .unwrap_or_else(|| json.header.number.saturating_sub(1)), parent_hash: json.header.parent_hash.into(), timestamp: json.header.timestamp, is_final: false }; - - Ok(ParsedBlock { - header, - data: bytes - }) -} + Ok(ParsedBlock { header, data: bytes }) +} pub fn create_data_source(clients: Vec) -> impl DataSource { MappedDataSource::new( StandardDataSource::new(clients, &parse_block), |mut parsed_block: ParsedBlock, is_final: bool| { let compressed = { - use flate2::*; - use flate2::write::GzEncoder; + use flate2::{write::GzEncoder, *}; let mut encoder = GzEncoder::new(Vec::new(), Compression::fast()); encoder.write_all(&parsed_block.data); encoder.finish().expect("IO errors are not possible") @@ -79,7 +72,6 @@ pub fn create_data_source(clients: Vec) -> impl DataSource BlockNumber { self.header.number @@ -100,4 +92,4 @@ impl sqd_primitives::Block for ParsedBlock { fn timestamp(&self) -> Option { self.header.timestamp } -} \ No newline at end of file +} diff --git a/crates/bds/src/ingest/grow.rs b/crates/bds/src/ingest/grow.rs index c0a26551..00a9a09d 100644 --- a/crates/bds/src/ingest/grow.rs +++ b/crates/bds/src/ingest/grow.rs @@ -1,49 +1,50 @@ -use super::store::Store; -use crate::chain_watch::ChainSender; -use crate::util::compute_fork_base; +use std::time::Duration; + use anyhow::{anyhow, bail, ensure}; use futures::StreamExt; use sqd_data_source::{DataEvent, DataSource}; use sqd_primitives::{Block, BlockNumber, BlockPtr, BlockRef}; -use std::time::Duration; -use tokio::select; -use tokio::time::{sleep_until, Instant}; +use tokio::{ + select, + time::{Instant, sleep_until} +}; use tracing::debug; +use super::store::Store; +use crate::{chain_watch::ChainSender, util::compute_fork_base}; pub async fn grow( mut data_source: impl DataSource, first_block: BlockNumber, parent_block_hash: Option, chain_sender: ChainSender -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { debug!( first_block = data_source.get_next_block(), parent_hash =% data_source.get_parent_block_hash().unwrap_or("None"), "starting data ingestion" ); - + loop { let Some(event) = data_source.next().await else { bail!("unexpected end of data source stream") }; - + match event { DataEvent::FinalizedHead(head) => { // check, that we have all blocks below the head, - // otherwise, we might get erroneous block mismatch condition + // otherwise, we might get erroneous block mismatch condition // due to forked blocks been still present in the chain if head.number < data_source.get_next_block() { chain_sender.finalize(head.ptr())?; } - }, + } DataEvent::Block { block, is_final } => { if !chain_sender.push(is_final, block)? { chain_sender.wait().await; } - }, - DataEvent::MaybeOnHead => {}, + } + DataEvent::MaybeOnHead => {} DataEvent::Fork(prev) => handle_fork( &prev, first_block, @@ -55,19 +56,17 @@ pub async fn grow( } } - fn handle_fork( mut prev: &[BlockRef], first_block: BlockNumber, parent_block_hash: Option<&str>, chain_sender: &ChainSender, data_source: &mut impl DataSource -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { ensure!(data_source.get_parent_block_hash().is_some()); - + ensure!(!prev.is_empty(), "got a fork event with no previous blocks"); - + ensure!( prev.windows(2).all(|s| s[0].number < s[1].number), "got a fork event with a list of previous blocks not in ascending order" @@ -87,24 +86,21 @@ fn handle_fork( let Some(offset) = prev.iter().position(|b| b.number >= first_block) else { // all `prev` blocks are below `first_block`, data_source.set_position(first_block, parent_block_hash.as_deref()); - return Ok(()) + return Ok(()); }; prev = &prev[offset..]; } - + let chain = chain_sender.borrow(); - + if let Some(head) = chain.compute_fork_base(prev)? { if head.number >= first_block { data_source.set_position(head.number + 1, Some(head.hash)); - return Ok(()) + return Ok(()); } } - data_source.set_position( - first_block, - parent_block_hash - ); + data_source.set_position(first_block, parent_block_hash); Ok(()) -} \ No newline at end of file +} diff --git a/crates/bds/src/ingest/ingest.rs b/crates/bds/src/ingest/ingest.rs index eef1663a..ee8c7a61 100644 --- a/crates/bds/src/ingest/ingest.rs +++ b/crates/bds/src/ingest/ingest.rs @@ -1,18 +1,17 @@ -use super::grow::grow; -use super::store::Store; -use super::write::write_chain; -use crate::chain_watch::{ChainReceiver, ChainSender}; -use crate::util::task_termination_error; -use anyhow::{anyhow, ensure, Context}; +use std::{future::Future, pin::Pin, task::Poll}; + +use anyhow::{Context, anyhow, ensure}; use futures::FutureExt; use sqd_data_source::DataSource; use sqd_primitives::BlockNumber; -use std::future::Future; -use std::pin::Pin; -use std::task::Poll; use tokio::task::JoinHandle; use tracing::debug; +use super::{grow::grow, store::Store, write::write_chain}; +use crate::{ + chain_watch::{ChainReceiver, ChainSender}, + util::task_termination_error +}; pub struct Ingest { store: S, @@ -23,7 +22,6 @@ pub struct Ingest { max_queue_size: usize } - impl Ingest where S: Store, @@ -51,40 +49,38 @@ where } pub async fn start(mut self) -> anyhow::Result<(ChainReceiver, IngestHandle)> { - let head_chain = self.store.get_chain_head( - self.first_block, - self.parent_block_hash.as_deref() - ).await?; + let head_chain = self + .store + .get_chain_head(self.first_block, self.parent_block_hash.as_deref()) + .await?; if let Some(head) = head_chain.blocks.last() { self.data_source.set_position(head.number + 1, Some(&head.hash)); } else { - self.data_source.set_position(self.first_block, self.parent_block_hash.as_deref()); + self.data_source + .set_position(self.first_block, self.parent_block_hash.as_deref()); } - let chain_sender = ChainSender::::new( - head_chain, - self.min_queue_size, - self.max_queue_size - ); + let chain_sender = ChainSender::::new(head_chain, self.min_queue_size, self.max_queue_size); let chain_receiver = chain_sender.subscribe(); - let write = tokio::spawn( - write_chain(self.store.clone(), chain_sender.clone()) - ); + let write = tokio::spawn(write_chain(self.store.clone(), chain_sender.clone())); - let head_update = tokio::spawn( - head_update_loop(self.store.clone(), chain_receiver.clone()) - ); + let head_update = tokio::spawn(head_update_loop(self.store.clone(), chain_receiver.clone())); - let finalize = tokio::spawn( - finalize_loop(self.store.clone(), chain_receiver.clone(), self.first_block) - ); + let finalize = tokio::spawn(finalize_loop( + self.store.clone(), + chain_receiver.clone(), + self.first_block + )); - let ingest = tokio::spawn( - grow(self.data_source, self.first_block, self.parent_block_hash, chain_sender) - ); + let ingest = tokio::spawn(grow( + self.data_source, + self.first_block, + self.parent_block_hash, + chain_sender + )); let handle = IngestHandle { write, @@ -98,16 +94,8 @@ where } } - -async fn head_update_loop( - store: S, - mut chain_receiver: ChainReceiver -) -> anyhow::Result<()> -{ - let mut prev = chain_receiver - .borrow() - .stored_head() - .map(|b| b.to_ref()); +async fn head_update_loop(store: S, mut chain_receiver: ChainReceiver) -> anyhow::Result<()> { + let mut prev = chain_receiver.borrow().stored_head().map(|b| b.to_ref()); loop { chain_receiver.changed().await?; { @@ -120,26 +108,17 @@ async fn head_update_loop( } if let Some(head) = prev.as_ref() { store.set_head(head.ptr()).await?; - debug!( - number = head.number, - hash = head.hash, - "head updated" - ); + debug!(number = head.number, hash = head.hash, "head updated"); } } } - async fn finalize_loop( store: S, mut chain_receiver: ChainReceiver, first_block: BlockNumber -) -> anyhow::Result<()> -{ - let mut prev = chain_receiver - .borrow() - .stored_finalized_head() - .map(|b| b.number); +) -> anyhow::Result<()> { + let mut prev = chain_receiver.borrow().stored_finalized_head().map(|b| b.number); loop { chain_receiver.changed().await?; @@ -147,34 +126,27 @@ async fn finalize_loop( let (from, to) = { let chain = chain_receiver.borrow_and_update(); let Some(head) = chain.stored_finalized_head() else { - continue + continue; }; if prev.map_or(false, |n| n == head.number) { - continue + continue; } - ( - prev.unwrap_or(first_block), - head.to_ref() - ) + (prev.unwrap_or(first_block), head.to_ref()) }; ensure!(from <= to.number); - store.finalize(from, to.ptr()).await.with_context(|| { - format!("failed to finalize blocks from {} to {}", from, to.ptr()) - })?; - - debug!( - number = to.number, - hash = to.hash, - "finalized" - ); + store + .finalize(from, to.ptr()) + .await + .with_context(|| format!("failed to finalize blocks from {} to {}", from, to.ptr()))?; + + debug!(number = to.number, hash = to.hash, "finalized"); prev = Some(to.number); } } - pub struct IngestHandle { write: JoinHandle>, head_update: JoinHandle>, @@ -183,7 +155,6 @@ pub struct IngestHandle { terminated: bool } - impl IngestHandle { pub fn abort(&mut self) { self.write.abort(); @@ -194,22 +165,19 @@ impl IngestHandle { } } - impl Future for IngestHandle { type Output = anyhow::Result<()>; fn poll(mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll { if self.terminated { - return Poll::Ready(Err(anyhow!("ingest was already terminated"))) + return Poll::Ready(Err(anyhow!("ingest was already terminated"))); } macro_rules! poll_task { ($name:expr, $handle:ident) => { if let Poll::Ready(res) = self.$handle.poll_unpin(cx) { self.abort(); - return Poll::Ready( - task_termination_error($name, res) - ) + return Poll::Ready(task_termination_error($name, res)); } }; } @@ -218,15 +186,13 @@ impl Future for IngestHandle { poll_task!("head update", head_update); poll_task!("block finalization", finalize); poll_task!("ingest", ingest); - + Poll::Pending } } - impl Drop for IngestHandle { fn drop(&mut self) { self.abort() } } - diff --git a/crates/bds/src/ingest/mod.rs b/crates/bds/src/ingest/mod.rs index 919d9319..144b2133 100644 --- a/crates/bds/src/ingest/mod.rs +++ b/crates/bds/src/ingest/mod.rs @@ -3,6 +3,5 @@ mod ingest; mod store; mod write; - +pub use ingest::*; pub use store::*; -pub use ingest::*; \ No newline at end of file diff --git a/crates/bds/src/ingest/store.rs b/crates/bds/src/ingest/store.rs index c8d0ae10..3655cf5c 100644 --- a/crates/bds/src/ingest/store.rs +++ b/crates/bds/src/ingest/store.rs @@ -1,20 +1,17 @@ -use crate::chain::HeadChain; -use sqd_primitives::{Block, BlockNumber, BlockPtr, BlockRef}; use std::future::Future; +use sqd_primitives::{Block, BlockNumber, BlockPtr, BlockRef}; + +use crate::chain::HeadChain; pub trait Store: Clone + Send + Sync + 'static { type Block: Block + Clone + Send + Sync; - async fn get_chain_head( - &self, - first_block: BlockNumber, - parent_hash: Option<&str> - ) -> anyhow::Result; + async fn get_chain_head(&self, first_block: BlockNumber, parent_hash: Option<&str>) -> anyhow::Result; fn save(&self, block: &Self::Block) -> impl Future> + Send; fn set_head(&self, head: BlockPtr) -> impl Future> + Send; - + fn finalize(&self, from: BlockNumber, to: BlockPtr) -> impl Future> + Send; -} \ No newline at end of file +} diff --git a/crates/bds/src/ingest/write.rs b/crates/bds/src/ingest/write.rs index 4f2f5f21..b6edd967 100644 --- a/crates/bds/src/ingest/write.rs +++ b/crates/bds/src/ingest/write.rs @@ -1,25 +1,19 @@ -use crate::chain::Chain; -use crate::chain_watch::ChainSender; -use crate::ingest::store::Store; -use futures::stream::FuturesUnordered; -use futures::{StreamExt, TryStreamExt}; -use sqd_primitives::{Block, BlockNumber, BlockRef}; use std::future::Future; + +use futures::{StreamExt, TryStreamExt, stream::FuturesUnordered}; +use sqd_primitives::{Block, BlockNumber, BlockRef}; use tokio::select; use tracing::debug; +use crate::{chain::Chain, chain_watch::ChainSender, ingest::store::Store}; -pub async fn write_chain( - store: S, - chain_sender: ChainSender -) -> anyhow::Result<()> -{ +pub async fn write_chain(store: S, chain_sender: ChainSender) -> anyhow::Result<()> { let mut chain_receiver = chain_sender.subscribe(); let mut writes = FuturesUnordered::new(); let mut state = State::new(5); loop { state.advance(&mut chain_receiver, |b| { - writes.push(async { + writes.push(async { store.save(&b).await?; Ok::<_, anyhow::Error>(b) }) @@ -41,7 +35,6 @@ pub async fn write_chain( } } - struct State { pending: Vec, max_pending: usize, @@ -49,7 +42,6 @@ struct State { waits_new_block: bool } - impl State { pub fn new(max_pending: usize) -> Self { assert!(max_pending > 0); @@ -64,12 +56,8 @@ impl State { pub fn has_capacity(&self) -> bool { self.pending.len() < self.max_pending } - - pub fn advance( - &mut self, - chain_receiver: &mut tokio::sync::watch::Receiver>, - mut cb: impl FnMut(B) - ) { + + pub fn advance(&mut self, chain_receiver: &mut tokio::sync::watch::Receiver>, mut cb: impl FnMut(B)) { if !self.has_capacity() { return; } @@ -89,17 +77,17 @@ impl State { fn find_position(&self, chain: &Chain) -> usize { if chain.is_empty() { - return 0 + return 0; } let mut pos = chain.bisect(self.last_block).min(chain.len() - 1); loop { if chain.is_stored(pos) || self.is_pending(&chain[pos]) { - return pos + 1 + return pos + 1; } if pos == 0 { - return 0 + return 0; } else { pos -= 1; } @@ -115,4 +103,4 @@ impl State { let ptr = block.ptr(); self.pending.retain(|b| b.ptr() != ptr) } -} \ No newline at end of file +} diff --git a/crates/bds/src/main.rs b/crates/bds/src/main.rs index 1082ac22..16e3194c 100644 --- a/crates/bds/src/main.rs +++ b/crates/bds/src/main.rs @@ -4,40 +4,34 @@ mod cassandra; mod chain; mod chain_watch; mod cmd; +mod data_source; mod ingest; mod util; -mod data_source; - -use clap::Parser; use std::io::IsTerminal; +use clap::Parser; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; - #[derive(clap::Parser)] struct CLI { #[command(subcommand)] command: Command } - #[derive(clap::Subcommand)] enum Command { /// Run data ingestion Ingest(cmd::ingest::Args) } - fn main() -> anyhow::Result<()> { let cli = CLI::parse(); - let env_filter = tracing_subscriber::EnvFilter::builder().parse_lossy( - std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV) - .unwrap_or("info".to_string()), - ); + let env_filter = tracing_subscriber::EnvFilter::builder() + .parse_lossy(std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV).unwrap_or("info".to_string())); if std::io::stdout().is_terminal() { tracing_subscriber::fmt() @@ -52,10 +46,8 @@ fn main() -> anyhow::Result<()> { .with_current_span(false) .init(); } - - match cli.command { - Command::Ingest(args) => { - cmd::ingest::run(args) - } + + match cli.command { + Command::Ingest(args) => cmd::ingest::run(args) } -} \ No newline at end of file +} diff --git a/crates/bds/src/util.rs b/crates/bds/src/util.rs index 4165d761..17f32a87 100644 --- a/crates/bds/src/util.rs +++ b/crates/bds/src/util.rs @@ -1,33 +1,31 @@ +use std::{cmp::Ordering, ops::Index}; + use anyhow::bail; use sqd_primitives::{AsBlockPtr, BlockNumber, BlockPtr, BlockRef}; -use std::cmp::Ordering; -use std::ops::Index; - pub fn compute_fork_base<'a, 'b, B: AsBlockPtr + 'a>( - reversed_chain: impl IntoIterator, + reversed_chain: impl IntoIterator, mut prev: &'b [BlockRef] -) -> Option> -{ +) -> Option> { let prev = &mut prev; for b in reversed_chain { let b = b.as_block_ptr(); if prev.last().map_or(false, |p| p.number < b.number) { - continue + continue; } while prev.last().map_or(false, |p| p.number > b.number) { *prev = pop(prev); } - + if prev.is_empty() { - return Some(b) + return Some(b); } - + let p = prev.last().unwrap(); if b.number == p.number && b.hash == p.hash { - return Some(b) + return Some(b); } else { *prev = pop(prev) } @@ -36,21 +34,12 @@ pub fn compute_fork_base<'a, 'b, B: AsBlockPtr + 'a>( None } - fn pop(slice: &[T]) -> &[T] { &slice[0..slice.len() - 1] } - -pub fn bisect( - len: usize, - chain: &impl Index, - block_number: BlockNumber -) -> usize -{ - let Some(mut end) = len.checked_sub(1) else { - return 0 - }; +pub fn bisect(len: usize, chain: &impl Index, block_number: BlockNumber) -> usize { + let Some(mut end) = len.checked_sub(1) else { return 0 }; let mut beg = match chain[end].as_block_ptr().number.cmp(&block_number) { Ordering::Less => return end + 1, @@ -62,10 +51,10 @@ pub fn bisect( }; if chain[beg].as_block_ptr().number >= block_number { - return beg + return beg; } - while end - beg > 1 { + while end - beg > 1 { let mid = beg + (end - beg) / 2; match chain[mid].as_block_ptr().number.cmp(&block_number) { Ordering::Less => beg = mid, @@ -77,15 +66,13 @@ pub fn bisect( end } - pub fn task_termination_error( task_name: &str, res: Result, tokio::task::JoinError> -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { match res { Ok(Ok(_)) => bail!("{} task unexpectedly terminated", task_name), Ok(Err(err)) => Err(err.context(format!("{} task failed", task_name))), Err(join_error) => bail!("{} task terminated: {}", task_name, join_error) } -} \ No newline at end of file +} diff --git a/crates/bloom-filter/benches/main.rs b/crates/bloom-filter/benches/main.rs index df2a770f..909fb35f 100644 --- a/crates/bloom-filter/benches/main.rs +++ b/crates/bloom-filter/benches/main.rs @@ -1,7 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; use sqd_bloom_filter::BloomFilter; - fn bloom_filter_benchmark(c: &mut Criterion) { let mut bloom = BloomFilter::new(64, 7); c.bench_function("insert solana accounts", |bench| { @@ -13,6 +12,5 @@ fn bloom_filter_benchmark(c: &mut Criterion) { }); } - criterion_group!(benches, bloom_filter_benchmark); criterion_main!(benches); diff --git a/crates/bloom-filter/src/lib.rs b/crates/bloom-filter/src/lib.rs index 280f7f97..9d1c3186 100644 --- a/crates/bloom-filter/src/lib.rs +++ b/crates/bloom-filter/src/lib.rs @@ -1,18 +1,17 @@ use std::hash::{Hash, Hasher}; -use xxhash_rust::xxh3::Xxh3Builder; +use xxhash_rust::xxh3::Xxh3Builder; pub struct BloomFilter { bytes: Box<[u8]>, - num_hashes: usize, + num_hashes: usize } - impl BloomFilter { pub fn new(byte_size: usize, num_hashes: usize) -> Self { BloomFilter { bytes: vec![0; byte_size].into_boxed_slice(), - num_hashes, + num_hashes } } @@ -20,7 +19,7 @@ impl BloomFilter { pub fn bytes(&self) -> &[u8] { self.bytes.as_ref() } - + pub fn clear(&mut self) { for i in &mut self.bytes { *i = 0; @@ -52,22 +51,18 @@ impl BloomFilter { } } - fn get_bit(data: &[u8], i: usize) -> bool { data[i / 8] & (1 << (i % 8)) != 0 } - fn set_bit(data: &mut [u8], i: usize) { data[i / 8] |= 1 << (i % 8); } - #[cfg(test)] mod test { use crate::BloomFilter; - #[test] fn basic_test() { let mut bloom_filter = BloomFilter::new(64, 7); diff --git a/crates/data-client/src/lib.rs b/crates/data-client/src/lib.rs index 31ad5323..cb77506f 100644 --- a/crates/data-client/src/lib.rs +++ b/crates/data-client/src/lib.rs @@ -1,5 +1,4 @@ pub mod reqwest; mod types; - -pub use types::*; \ No newline at end of file +pub use types::*; diff --git a/crates/data-client/src/reqwest/client.rs b/crates/data-client/src/reqwest/client.rs index f59308f0..93b662ec 100644 --- a/crates/data-client/src/reqwest/client.rs +++ b/crates/data-client/src/reqwest/client.rs @@ -1,19 +1,23 @@ -use super::lines::LineStream; -use crate::types::{BlockStreamRequest, BlockStreamResponse}; -use crate::DataClient; +use std::{ + fmt::{Debug, Display, Formatter}, + io::ErrorKind, + sync::Arc, + time::Duration +}; + use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; -use futures::future::BoxFuture; -use futures::{FutureExt, StreamExt}; +use futures::{future::BoxFuture, FutureExt, StreamExt}; use reqwest::{Client, IntoUrl, Response, StatusCode, Url}; use serde::Deserialize; use serde_json::json; use sqd_primitives::{BlockNumber, BlockRef}; -use std::fmt::{Debug, Display, Formatter}; -use std::io::ErrorKind; -use std::sync::Arc; -use std::time::Duration; +use super::lines::LineStream; +use crate::{ + types::{BlockStreamRequest, BlockStreamResponse}, + DataClient +}; pub fn default_http_client() -> Client { Client::builder() @@ -24,14 +28,12 @@ pub fn default_http_client() -> Client { .unwrap() } - #[derive(Clone)] pub struct ReqwestDataClient { http: Client, url: Arc } - impl Debug for ReqwestDataClient { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("ReqwestDataClient") @@ -40,13 +42,12 @@ impl Debug for ReqwestDataClient { } } - impl ReqwestDataClient { pub fn from_url(url: impl IntoUrl) -> Self { let http = default_http_client(); Self::new(http, url) } - + pub fn new(http: Client, url: impl IntoUrl) -> Self { Self { http, @@ -54,30 +55,21 @@ impl ReqwestDataClient { } } - pub async fn stream( - &self, - req: BlockStreamRequest - ) -> anyhow::Result> - { + pub async fn stream(&self, req: BlockStreamRequest) -> anyhow::Result> { let mut body = json!({ "fromBlock": req.first_block }); if req.parent_block_hash.is_some() { - body.as_object_mut().unwrap().insert( - "parentBlockHash".into(), - req.parent_block_hash.clone().into() - ); + body.as_object_mut() + .unwrap() + .insert("parentBlockHash".into(), req.parent_block_hash.clone().into()); } let mut url = self.url.as_ref().clone(); url.path_segments_mut().unwrap().push("stream"); - let res = self.http - .post(url) - .json(&body) - .send() - .await?; + let res = self.http.post(url).json(&body).send().await?; match res.status().as_u16() { 200 => { @@ -87,34 +79,26 @@ impl ReqwestDataClient { blocks: blocks.boxed(), finalized_head }) - }, + } 204 => { let finalized_head = extract_finalized_head(&res)?; Ok(BlockStreamResponse::Stream { blocks: futures::stream::empty().boxed(), finalized_head }) - }, + } 409 => { let conflict: BaseBlockConflict = res .json() .await - .context( - "failed to receive a list of previous blocks after base-block hash mismatch" - )?; - ensure!( - !conflict.previous_blocks.is_empty(), - "got an empty list of prev blocks" - ); + .context("failed to receive a list of previous blocks after base-block hash mismatch")?; + ensure!(!conflict.previous_blocks.is_empty(), "got an empty list of prev blocks"); Ok(BlockStreamResponse::Fork(conflict.previous_blocks)) - }, + } _ => { let status = res.status(); let text = res.text().await.unwrap_or_default(); - bail!(UnexpectedHttpStatus { - status, - text - }) + bail!(UnexpectedHttpStatus { status, text }) } } } @@ -131,7 +115,8 @@ impl ReqwestDataClient { let mut url = self.url.as_ref().clone(); url.path_segments_mut().unwrap().push(slug); - let head: Option = self.http + let head: Option = self + .http .get(url) .send() .await? @@ -142,14 +127,14 @@ impl ReqwestDataClient { Ok(head) } - + pub fn is_retryable(&self, err: &anyhow::Error) -> bool { for cause in err.chain() { if let Some(unexpected_status) = cause.downcast_ref::() { return match unexpected_status.status.as_u16() { 429 | 502 | 503 | 504 | 524 => true, _ => false - } + }; } if let Some(reqwest_error) = cause.downcast_ref::() { @@ -158,11 +143,12 @@ impl ReqwestDataClient { _ => {} } if reqwest_error.is_timeout() { - return true + return true; } - if reqwest_error.is_request() && - reqwest_error.to_string() == "connection closed before message completed" { - return true + if reqwest_error.is_request() + && reqwest_error.to_string() == "connection closed before message completed" + { + return true; } } @@ -180,13 +166,12 @@ impl ReqwestDataClient { } false } - + pub fn url(&self) -> &Url { self.url.as_ref() } } - fn extract_finalized_head(res: &Response) -> anyhow::Result> { let number = get_finalized_head_number(res) .transpose() @@ -211,7 +196,6 @@ fn extract_finalized_head(res: &Response) -> anyhow::Result> { } } - fn get_finalized_head_number(res: &Response) -> Option> { res.headers().get("x-sqd-finalized-head-number").map(|v| { let num = v.to_str()?.parse()?; @@ -219,7 +203,6 @@ fn get_finalized_head_number(res: &Response) -> Option Option> { res.headers().get("x-sqd-finalized-head-hash").map(|v| { let hash = v.to_str()?; @@ -227,22 +210,17 @@ fn get_finalized_head_hash(res: &Response) -> Option> { }) } - impl DataClient for ReqwestDataClient { type Block = Bytes; fn stream(&self, req: BlockStreamRequest) -> BoxFuture<'static, anyhow::Result>> { let this = self.clone(); - async move { - this.stream(req).await - }.boxed() + async move { this.stream(req).await }.boxed() } fn get_finalized_head(&self) -> BoxFuture<'static, anyhow::Result>> { let this = self.clone(); - async move { - this.get_finalized_head().await - }.boxed() + async move { this.get_finalized_head().await }.boxed() } fn is_retryable(&self, err: &anyhow::Error) -> bool { @@ -250,14 +228,12 @@ impl DataClient for ReqwestDataClient { } } - #[derive(Debug)] pub struct UnexpectedHttpStatus { pub status: StatusCode, pub text: String } - impl Display for UnexpectedHttpStatus { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match (self.status.is_success(), self.text.is_empty()) { @@ -269,12 +245,10 @@ impl Display for UnexpectedHttpStatus { } } - impl std::error::Error for UnexpectedHttpStatus {} - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] struct BaseBlockConflict { previous_blocks: Vec -} \ No newline at end of file +} diff --git a/crates/data-client/src/reqwest/lines.rs b/crates/data-client/src/reqwest/lines.rs index 44776535..b8addbf7 100644 --- a/crates/data-client/src/reqwest/lines.rs +++ b/crates/data-client/src/reqwest/lines.rs @@ -1,16 +1,14 @@ +use std::{pin::Pin, task::Poll}; + use bytes::{Buf, Bytes, BytesMut}; use futures::Stream; -use std::pin::Pin; -use std::task::Poll; - pub struct LineStream { inner: Option, line: BytesMut, - unchecked_pos: usize, + unchecked_pos: usize } - impl LineStream { pub fn new(body: Body) -> Self { Self { @@ -19,15 +17,15 @@ impl LineStream { unchecked_pos: 0 } } - + fn check_line(&mut self) -> Option { - if let Some(pos) = self.line.as_ref()[self.unchecked_pos..].iter().position(|b| *b == b'\n') { + if let Some(pos) = self.line.as_ref()[self.unchecked_pos..] + .iter() + .position(|b| *b == b'\n') + { let line = self.line.split_to(self.unchecked_pos + pos).freeze(); - self.line.advance(if self.line.get(1).copied() == Some(b'\r') { - 2 - } else { - 1 - }); + self.line + .advance(if self.line.get(1).copied() == Some(b'\r') { 2 } else { 1 }); self.unchecked_pos = 0; Some(line) } else { @@ -35,7 +33,7 @@ impl LineStream { None } } - + fn take_final_line(&mut self) -> Option { let line = std::mem::take(&mut self.line); if line.is_empty() { @@ -46,9 +44,8 @@ impl LineStream { } } - impl Stream for LineStream -where +where Body: Stream> + Unpin, E: Into { @@ -57,31 +54,27 @@ where fn poll_next(mut self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll> { loop { if let Some(line) = self.check_line() { - return Poll::Ready(Some(Ok(line))) + return Poll::Ready(Some(Ok(line))); } - + let Some(inner) = self.inner.as_mut() else { - return Poll::Ready(None) + return Poll::Ready(None); }; match Pin::new(inner).poll_next(cx) { Poll::Ready(None) => { self.inner = None; - return Poll::Ready( - Ok(self.take_final_line()).transpose() - ) - }, - Poll::Ready(Some(Ok(bytes))) => { - self.line.extend_from_slice(&bytes) - }, + return Poll::Ready(Ok(self.take_final_line()).transpose()); + } + Poll::Ready(Some(Ok(bytes))) => self.line.extend_from_slice(&bytes), Poll::Ready(Some(Err(err))) => { self.inner = None; self.line = BytesMut::new(); self.unchecked_pos = 0; - return Poll::Ready(Some(Err(err.into()))) - }, - Poll::Pending => return Poll::Pending, + return Poll::Ready(Some(Err(err.into()))); + } + Poll::Pending => return Poll::Pending } } } -} \ No newline at end of file +} diff --git a/crates/data-client/src/reqwest/mod.rs b/crates/data-client/src/reqwest/mod.rs index e24719aa..3758a4f8 100644 --- a/crates/data-client/src/reqwest/mod.rs +++ b/crates/data-client/src/reqwest/mod.rs @@ -1,5 +1,4 @@ mod client; mod lines; - -pub use client::*; \ No newline at end of file +pub use client::*; diff --git a/crates/data-client/src/types.rs b/crates/data-client/src/types.rs index 8e6a2e5a..d3b26145 100644 --- a/crates/data-client/src/types.rs +++ b/crates/data-client/src/types.rs @@ -1,8 +1,7 @@ -use futures::future::BoxFuture; -use futures::stream::BoxStream; -use sqd_primitives::{BlockNumber, BlockRef}; use std::fmt::Debug; +use futures::{future::BoxFuture, stream::BoxStream}; +use sqd_primitives::{BlockNumber, BlockRef}; #[derive(Debug, Clone, Eq, PartialEq)] pub struct BlockStreamRequest { @@ -10,7 +9,6 @@ pub struct BlockStreamRequest { pub parent_block_hash: Option } - impl BlockStreamRequest { pub fn new(first_block: BlockNumber) -> Self { Self { @@ -18,24 +16,19 @@ impl BlockStreamRequest { parent_block_hash: None } } - + pub fn set_parent_block_hash(&mut self, hash: Option<&str>) { match (hash, self.parent_block_hash.as_mut()) { (Some(src), Some(target)) => { target.clear(); target.push_str(src) - }, - (Some(src), None) => { - self.parent_block_hash = Some(src.to_string()) - }, - (None, _) => { - self.parent_block_hash = None } + (Some(src), None) => self.parent_block_hash = Some(src.to_string()), + (None, _) => self.parent_block_hash = None } } } - pub enum BlockStreamResponse { Stream { blocks: BoxStream<'static, anyhow::Result>, @@ -44,16 +37,12 @@ pub enum BlockStreamResponse { Fork(Vec) } - pub trait DataClient: Send + Sync + Debug + Unpin { type Block; - - fn stream( - &self, - req: BlockStreamRequest - ) -> BoxFuture<'static, anyhow::Result>>; - + + fn stream(&self, req: BlockStreamRequest) -> BoxFuture<'static, anyhow::Result>>; + fn get_finalized_head(&self) -> BoxFuture<'static, anyhow::Result>>; - + fn is_retryable(&self, err: &anyhow::Error) -> bool; -} \ No newline at end of file +} diff --git a/crates/data-core/src/chunk_builder.rs b/crates/data-core/src/chunk_builder.rs index b4cc1ab8..5e050dc7 100644 --- a/crates/data-core/src/chunk_builder.rs +++ b/crates/data-core/src/chunk_builder.rs @@ -1,7 +1,8 @@ -use crate::ChunkProcessor; -use sqd_array::slice::AnyTableSlice; use std::collections::BTreeMap; +use sqd_array::slice::AnyTableSlice; + +use crate::ChunkProcessor; pub trait BlockChunkBuilder: ChunkBuilder { type Block; @@ -9,7 +10,6 @@ pub trait BlockChunkBuilder: ChunkBuilder { fn push(&mut self, block: &Self::Block) -> anyhow::Result<()>; } - pub trait ChunkBuilder { fn dataset_description(&self) -> sqd_dataset::DatasetDescriptionRef; @@ -26,7 +26,6 @@ pub trait ChunkBuilder { fn submit_to_processor(&self, processor: &mut ChunkProcessor) -> anyhow::Result<()>; } - #[macro_export] macro_rules! chunk_builder { ( diff --git a/crates/data-core/src/chunk_processor.rs b/crates/data-core/src/chunk_processor.rs index 9bb4ac5b..8d120b97 100644 --- a/crates/data-core/src/chunk_processor.rs +++ b/crates/data-core/src/chunk_processor.rs @@ -1,26 +1,26 @@ -use crate::{PreparedTable, TableProcessor}; +use std::collections::BTreeMap; + use anyhow::anyhow; use sqd_array::slice::AnyTableSlice; -use std::collections::BTreeMap; +use crate::{PreparedTable, TableProcessor}; type Name = &'static str; - pub struct ChunkProcessor { tables: BTreeMap } - impl ChunkProcessor { pub fn new(tables: BTreeMap) -> ChunkProcessor { Self { tables } } pub fn push_table(&mut self, name: &str, records: &AnyTableSlice<'_>) -> anyhow::Result<()> { - let processor = self.tables.get_mut(name).ok_or_else(|| { - anyhow!("table '{}' is not present in the chunk", name) - })?; + let processor = self + .tables + .get_mut(name) + .ok_or_else(|| anyhow!("table '{}' is not present in the chunk", name))?; processor.push_batch(records) } @@ -35,12 +35,9 @@ impl ChunkProcessor { pub fn finish(self) -> anyhow::Result { self.tables .into_iter() - .map(|(name, table)| { - table.finish().map(|table| (name, table)) - }) + .map(|(name, table)| table.finish().map(|table| (name, table))) .collect::>() } } - -pub type PreparedChunk = BTreeMap; \ No newline at end of file +pub type PreparedChunk = BTreeMap; diff --git a/crates/data-core/src/downcast.rs b/crates/data-core/src/downcast.rs index 8d67fe80..6f7957fa 100644 --- a/crates/data-core/src/downcast.rs +++ b/crates/data-core/src/downcast.rs @@ -1,28 +1,26 @@ -use arrow::datatypes::DataType; -use sqd_array::slice::{AnySlice, Slice}; use std::sync::Arc; +use arrow::datatypes::DataType; +use sqd_array::slice::{AnySlice, Slice}; #[derive(Clone)] pub struct Downcast { inner: Arc> } - impl Default for Downcast { fn default() -> Self { Self::new() } } - impl Downcast { pub fn new() -> Self { Self { inner: Arc::new(parking_lot::Mutex::new(DowncastState::new())) } } - + pub fn reset(&self) { self.inner.lock().reset() } @@ -46,13 +44,11 @@ impl Downcast { } } - pub struct DowncastState { max_block_number: u64, max_item_index: u64 } - impl DowncastState { pub fn new() -> Self { Self { @@ -65,19 +61,13 @@ impl DowncastState { self.max_block_number = u32::MAX as u64; self.max_item_index = u16::MAX as u64; } - + pub fn reg_block_number(&mut self, val: u64) { - self.max_block_number = std::cmp::max( - self.max_block_number, - val - ) + self.max_block_number = std::cmp::max(self.max_block_number, val) } - + pub fn reg_item_index(&mut self, val: u64) { - self.max_item_index = std::cmp::max( - self.max_item_index, - val - ); + self.max_item_index = std::cmp::max(self.max_item_index, val); } pub fn get_block_number_type(&self) -> DataType { @@ -89,7 +79,6 @@ impl DowncastState { } } - fn get_max(array: &AnySlice<'_>) -> u64 { match array { AnySlice::UInt8(s) => s.values().iter().copied().max().unwrap_or(0) as u64, @@ -100,21 +89,20 @@ fn get_max(array: &AnySlice<'_>) -> u64 { AnySlice::List(s) => { let range = s.offsets().range(); get_max(&s.values().item().slice(range.start, range.len())) - }, + } _ => panic!("invalid index type") } } - fn get_minimal_type(max: u64) -> DataType { if u8::try_from(max).is_ok() { - return DataType::UInt8 + return DataType::UInt8; } if u16::try_from(max).is_ok() { - return DataType::UInt16 + return DataType::UInt16; } if u32::try_from(max).is_ok() { - return DataType::UInt32 + return DataType::UInt32; } DataType::UInt64 -} \ No newline at end of file +} diff --git a/crates/data-core/src/lib.rs b/crates/data-core/src/lib.rs index cf8132d1..8e59cf5c 100644 --- a/crates/data-core/src/lib.rs +++ b/crates/data-core/src/lib.rs @@ -8,7 +8,6 @@ mod table_file; mod table_processor; mod table_sort; - pub use chunk_builder::*; pub use chunk_processor::*; pub use downcast::Downcast; diff --git a/crates/data-core/src/serde.rs b/crates/data-core/src/serde.rs index b5c978ff..dc224eda 100644 --- a/crates/data-core/src/serde.rs +++ b/crates/data-core/src/serde.rs @@ -1,13 +1,10 @@ -use std::marker::PhantomData; -use std::str::FromStr; - +use std::{marker::PhantomData, str::FromStr}; struct StringParser { phantom_data: PhantomData } - -impl StringParser { +impl StringParser { pub fn new() -> Self { Self { phantom_data: PhantomData @@ -15,8 +12,7 @@ impl StringParser { } } - -impl <'de, T: FromStr> serde::de::Visitor<'de> for StringParser { +impl<'de, T: FromStr> serde::de::Visitor<'de> for StringParser { type Value = T; fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -25,23 +21,23 @@ impl <'de, T: FromStr> serde::de::Visitor<'de> for StringParser { fn visit_str(self, v: &str) -> Result where - E: serde::de::Error, + E: serde::de::Error { T::from_str(v).map_err(|_| { - serde::de::Error::custom( - format!("failed to deserialize `{}` as {}", v, std::any::type_name::()) - ) + serde::de::Error::custom(format!( + "failed to deserialize `{}` as {}", + v, + std::any::type_name::() + )) }) } } - struct StringOptionParser { phantom_data: PhantomData } - -impl StringOptionParser { +impl StringOptionParser { pub fn new() -> Self { Self { phantom_data: PhantomData @@ -49,41 +45,44 @@ impl StringOptionParser { } } - -impl <'de, T: FromStr> serde::de::Visitor<'de> for StringOptionParser { +impl<'de, T: FromStr> serde::de::Visitor<'de> for StringOptionParser { type Value = Option; fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "an optional string literal representing {}", std::any::type_name::()) + write!( + f, + "an optional string literal representing {}", + std::any::type_name::() + ) } fn visit_none(self) -> Result where - E: serde::de::Error, + E: serde::de::Error { Ok(None) } fn visit_some(self, deserializer: D) -> Result where - D: serde::Deserializer<'de>, + D: serde::Deserializer<'de> { deserializer.deserialize_str(StringParser::::new()).map(Some) } } - pub fn decode_string<'de, T, D>(deserializer: D) -> Result -where D: serde::Deserializer<'de>, - T: FromStr +where + D: serde::Deserializer<'de>, + T: FromStr { deserializer.deserialize_str(StringParser::::new()) } - pub fn decode_string_option<'de, T, D>(deserializer: D) -> Result, D::Error> -where D: serde::Deserializer<'de>, - T: FromStr +where + D: serde::Deserializer<'de>, + T: FromStr { deserializer.deserialize_option(StringOptionParser::::new()) -} \ No newline at end of file +} diff --git a/crates/data-core/src/struct_builder.rs b/crates/data-core/src/struct_builder.rs index 0fdc7085..affa5c82 100644 --- a/crates/data-core/src/struct_builder.rs +++ b/crates/data-core/src/struct_builder.rs @@ -26,14 +26,14 @@ macro_rules! struct_builder { )), )* ]); - + Self { $( $field, )* _nulls: sqd_array::builder::nullmask::NullmaskBuilder::new(0), _fields: fields } } - + #[inline] pub fn append(&mut self, is_valid: bool) { self._nulls.append(is_valid) @@ -48,7 +48,7 @@ macro_rules! struct_builder { pub fn append_null(&mut self) { self.append(false) } - + pub fn finish(self) -> arrow::array::StructArray { arrow::array::StructArray::new( self._fields, @@ -61,21 +61,21 @@ macro_rules! struct_builder { ) } } - + impl sqd_array::builder::ArrayBuilder for $name { fn data_type(&self) -> arrow::datatypes::DataType { arrow::datatypes::DataType::Struct(self._fields.clone()) } - + fn len(&self) -> usize { self._nulls.len() } - + fn byte_size(&self) -> usize { use sqd_array::builder::ArrayBuilder; self._nulls.byte_size() $(+ self.$field.byte_size())* } - + fn clear(&mut self) { use sqd_array::builder::ArrayBuilder; self._nulls.clear(); @@ -83,12 +83,12 @@ macro_rules! struct_builder { self.$field.clear(); )* } - + fn finish(self) -> arrow::array::ArrayRef { std::sync::Arc::new(self.finish()) } } - + impl sqd_array::slice::AsSlice for $name { type Slice<'a> = sqd_array::slice::AnyStructSlice<'a>; @@ -111,4 +111,4 @@ macro_rules! struct_builder { } } }; -} \ No newline at end of file +} diff --git a/crates/data-core/src/table_builder.rs b/crates/data-core/src/table_builder.rs index 0b2b2a40..6bb3efd1 100644 --- a/crates/data-core/src/table_builder.rs +++ b/crates/data-core/src/table_builder.rs @@ -13,7 +13,7 @@ macro_rules! table_builder { )* _schema: arrow::datatypes::SchemaRef } - + impl $name { pub fn new() -> Self { use arrow::datatypes::{Schema, Field, FieldRef}; @@ -35,19 +35,19 @@ macro_rules! table_builder { ), )* ]; - + let schema = Schema::new(schema_fields); - + Self { $($field,)* _schema: Arc::new(schema) } } - + pub fn table_description() -> &'static sqd_dataset::TableDescription { use std::sync::LazyLock; use sqd_dataset::TableDescription; - + static DESC: LazyLock = LazyLock::new(|| { let mut $desc = TableDescription::default(); { @@ -55,7 +55,7 @@ macro_rules! table_builder { }; $desc }); - + &DESC } @@ -63,28 +63,28 @@ macro_rules! table_builder { let desc = Self::table_description(); sqd_data_core::TableProcessor::new(downcast, self.schema(), desc) } - + #[inline] pub fn schema(&self) -> arrow::datatypes::SchemaRef { self._schema.clone() } - + pub fn len(&self) -> usize { sqd_data_core::_table_builder_len_impl!(self, $($field,)*) } - + pub fn byte_size(&self) -> usize { use sqd_array::builder::ArrayBuilder; 0 $(+ self.$field.byte_size())* } - + pub fn clear(&mut self) { use sqd_array::builder::ArrayBuilder; $( self.$field.clear(); )* } - + pub fn finish(self) -> arrow::array::RecordBatch { arrow::array::RecordBatch::try_new( self._schema, @@ -94,10 +94,10 @@ macro_rules! table_builder { ).unwrap() } } - + impl sqd_array::slice::AsSlice for $name { type Slice<'a> = sqd_array::slice::AnyTableSlice<'a>; - + fn as_slice(&self) -> Self::Slice<'_> { use sqd_array::slice::*; AnyTableSlice::new( @@ -105,11 +105,11 @@ macro_rules! table_builder { $( AnySlice::from(self.$field.as_slice()), )* - ].into() + ].into() ) } } - + impl Default for $name { fn default() -> Self { Self::new() @@ -118,7 +118,6 @@ macro_rules! table_builder { }; } - #[macro_export] macro_rules! _table_builder_len_impl { ($this:ident,) => { 0 }; @@ -127,7 +126,7 @@ macro_rules! _table_builder_len_impl { let len = $this.$field.len(); $( assert_eq!( - len, $this.$rest.len(), + len, $this.$rest.len(), "columns {} and {} have different lengths", stringify!($field), stringify!($rest) ); @@ -136,18 +135,21 @@ macro_rules! _table_builder_len_impl { }}; } - #[macro_export] macro_rules! _optionize { - ($x:expr) => { Some($x) }; - () => { None }; + ($x:expr) => { + Some($x) + }; + () => { + None + }; } -// +// // table_builder! { // Transactions { // hash: StringBuilder, // } -// +// // description(d) {} -// } \ No newline at end of file +// } diff --git a/crates/data-core/src/table_file.rs b/crates/data-core/src/table_file.rs index f3a658a3..9e581190 100644 --- a/crates/data-core/src/table_file.rs +++ b/crates/data-core/src/table_file.rs @@ -1,25 +1,26 @@ use arrow::datatypes::Fields; -use sqd_array::io::file::{ArrayFile, ArrayFileReader, ArrayFileWriter}; -use sqd_array::reader::ArrayReader; -use sqd_array::slice::{AnyTableSlice, Slice}; -use sqd_array::writer::ArrayWriter; - +use sqd_array::{ + io::file::{ArrayFile, ArrayFileReader, ArrayFileWriter}, + reader::ArrayReader, + slice::{AnyTableSlice, Slice}, + writer::ArrayWriter +}; pub struct TableFileWriter { columns: Vec } - impl TableFileWriter { pub fn new(fields: &Fields) -> anyhow::Result { - let columns = fields.iter().map(|f| { - let file = ArrayFile::new_temporary(f.data_type().clone())?; - file.write() - }).collect::>>()?; + let columns = fields + .iter() + .map(|f| { + let file = ArrayFile::new_temporary(f.data_type().clone())?; + file.write() + }) + .collect::>>()?; - Ok(Self { - columns - }) + Ok(Self { columns }) } pub fn push_batch(&mut self, records: &AnyTableSlice<'_>) -> anyhow::Result<()> { @@ -30,49 +31,43 @@ impl TableFileWriter { } pub fn finish(self) -> anyhow::Result { - let columns = self.columns.into_iter() + let columns = self + .columns + .into_iter() .map(|col| col.finish()) .collect::, _>>()?; - - let readers = columns.iter() - .map(|c| c.read()) - .collect::, _>>()?; - - Ok(TableFile { - columns, - readers - }) + + let readers = columns.iter().map(|c| c.read()).collect::, _>>()?; + + Ok(TableFile { columns, readers }) } } - pub struct TableFile { columns: Vec, readers: Vec } - impl TableFile { pub fn read_column( - &mut self, - dst: &mut impl ArrayWriter, - i: usize, + &mut self, + dst: &mut impl ArrayWriter, + i: usize, offset: usize, len: usize - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { self.readers[i].read_slice(dst, offset, len) } - + pub fn into_writer(self) -> anyhow::Result { drop(self.readers); - - let columns = self.columns.into_iter() + + let columns = self + .columns + .into_iter() .map(|file| file.write()) .collect::, _>>()?; - - Ok(TableFileWriter { - columns - }) + + Ok(TableFileWriter { columns }) } -} \ No newline at end of file +} diff --git a/crates/data-core/src/table_processor.rs b/crates/data-core/src/table_processor.rs index aa710e40..5b318930 100644 --- a/crates/data-core/src/table_processor.rs +++ b/crates/data-core/src/table_processor.rs @@ -1,33 +1,38 @@ -use crate::downcast::Downcast; -use crate::table_file::{TableFile, TableFileWriter}; -use crate::{SortedTable, TableSorter}; -use arrow::array::RecordBatch; -use arrow::datatypes::{DataType, Field, SchemaRef}; -use sqd_array::builder::{AnyBuilder, AnyTableBuilder, ArrayBuilder}; -use sqd_array::item_index_cast::cast_item_index; -use sqd_array::slice::{AnyTableSlice, AsSlice, Slice}; -use sqd_array::util::build_field_offsets; -use sqd_array::writer::ArrayWriter; +use std::{collections::HashMap, sync::Arc}; + +use arrow::{ + array::RecordBatch, + datatypes::{DataType, Field, SchemaRef} +}; +use sqd_array::{ + builder::{AnyBuilder, AnyTableBuilder, ArrayBuilder}, + item_index_cast::cast_item_index, + schema_patch::SchemaPatch, + slice::{AnyTableSlice, AsSlice, Slice}, + util::build_field_offsets, + writer::ArrayWriter +}; use sqd_dataset::TableDescription; -use std::collections::HashMap; -use std::sync::Arc; -use sqd_array::schema_patch::SchemaPatch; +use crate::{ + downcast::Downcast, + table_file::{TableFile, TableFileWriter}, + SortedTable, TableSorter +}; enum TableWriter { Plain(TableFileWriter), Sort(TableSorter) } - impl TableWriter { fn push_batch(&mut self, records: &AnyTableSlice<'_>) -> anyhow::Result<()> { match self { TableWriter::Plain(w) => w.push_batch(records), - TableWriter::Sort(w) => w.push_batch(records), + TableWriter::Sort(w) => w.push_batch(records) } } - + fn into_reader(self) -> anyhow::Result { match self { TableWriter::Plain(w) => w.finish().map(TableReader::Plain), @@ -36,40 +41,27 @@ impl TableWriter { } } - enum TableReader { Plain(TableFile), Sort(SortedTable) } - impl TableReader { - fn read_column( - &mut self, - dst: &mut impl ArrayWriter, - i: usize, - offset: usize, - len: usize - ) -> anyhow::Result<()> { + fn read_column(&mut self, dst: &mut impl ArrayWriter, i: usize, offset: usize, len: usize) -> anyhow::Result<()> { match self { - TableReader::Plain(reader) => { - reader.read_column(dst, i, offset, len) - }, - TableReader::Sort(reader) => { - reader.read_column(dst, i, offset, len) - } + TableReader::Plain(reader) => reader.read_column(dst, i, offset, len), + TableReader::Sort(reader) => reader.read_column(dst, i, offset, len) } } - + fn into_writer(self) -> anyhow::Result { match self { TableReader::Plain(reader) => reader.into_writer().map(TableWriter::Plain), - TableReader::Sort(reader) => reader.into_sorter().map(TableWriter::Sort), + TableReader::Sort(reader) => reader.into_sorter().map(TableWriter::Sort) } } } - pub struct TableProcessor { downcast: Downcast, schema: SchemaRef, @@ -80,34 +72,32 @@ pub struct TableProcessor { byte_size: usize } - impl TableProcessor { - pub fn new( - downcast: Downcast, - schema: SchemaRef, - desc: &TableDescription - ) -> anyhow::Result - { - let block_number_columns = desc.downcast.block_number.iter().map(|name| { - schema.index_of(name) - }).collect::, _>>()?; - - let item_index_columns = desc.downcast.item_index.iter().map(|name| { - schema.index_of(name) - }).collect::, _>>()?; - - let sort_key = desc.sort_key.iter().map(|name| { - schema.index_of(name) - }).collect::, _>>()?; + pub fn new(downcast: Downcast, schema: SchemaRef, desc: &TableDescription) -> anyhow::Result { + let block_number_columns = desc + .downcast + .block_number + .iter() + .map(|name| schema.index_of(name)) + .collect::, _>>()?; + + let item_index_columns = desc + .downcast + .item_index + .iter() + .map(|name| schema.index_of(name)) + .collect::, _>>()?; + + let sort_key = desc + .sort_key + .iter() + .map(|name| schema.index_of(name)) + .collect::, _>>()?; let writer = if sort_key.len() > 0 { - TableWriter::Sort( - TableSorter::new(schema.fields(), sort_key)? - ) + TableWriter::Sort(TableSorter::new(schema.fields(), sort_key)?) } else { - TableWriter::Plain( - TableFileWriter::new(schema.fields())? - ) + TableWriter::Plain(TableFileWriter::new(schema.fields())?) }; Ok(Self { @@ -124,7 +114,7 @@ impl TableProcessor { pub fn num_rows(&self) -> usize { self.num_rows } - + pub fn byte_size(&self) -> usize { self.byte_size } @@ -133,14 +123,14 @@ impl TableProcessor { for i in self.block_number_columns.iter().copied() { self.downcast.reg_block_number(&records.column(i)) } - + for i in self.item_index_columns.iter().copied() { self.downcast.reg_item_index(&records.column(i)) } - + self.num_rows += records.len(); self.byte_size += records.byte_size(); - + self.writer.push_batch(records) } @@ -149,7 +139,6 @@ impl TableProcessor { } } - pub struct PreparedTable { downcast: Downcast, block_number_columns: Vec, @@ -162,7 +151,6 @@ pub struct PreparedTable { num_rows: usize } - impl PreparedTable { fn new(processor: TableProcessor) -> anyhow::Result { let prepared_schema = downcast_schema( @@ -172,11 +160,11 @@ impl PreparedTable { processor.downcast.get_block_number_type(), processor.downcast.get_item_index_type() ); - + let column_offsets = build_field_offsets(0, processor.schema.fields()); let num_rows = processor.num_rows; let reader = processor.writer.into_reader()?; - + Ok(Self { downcast: processor.downcast, block_number_columns: processor.block_number_columns, @@ -189,7 +177,7 @@ impl PreparedTable { num_rows }) } - + pub fn into_processor(self) -> anyhow::Result { self.downcast.reset(); Ok(TableProcessor { @@ -210,27 +198,21 @@ impl PreparedTable { pub fn num_columns(&self) -> usize { self.column_offsets.len() - 1 } - + pub fn num_rows(&self) -> usize { self.num_rows } - + pub fn read_record_batch(&mut self, offset: usize, len: usize) -> anyhow::Result { let mut builder = AnyTableBuilder::new(self.prepared_schema.clone()); self.read(&mut builder, offset, len)?; Ok(builder.finish()) } - pub fn read( - &mut self, - dst: &mut impl ArrayWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()> - { + pub fn read(&mut self, dst: &mut impl ArrayWriter, offset: usize, len: usize) -> anyhow::Result<()> { assert!(offset + len <= self.num_rows()); if len == 0 { - return Ok(()) + return Ok(()); } for i in 0..self.num_columns() { @@ -247,12 +229,11 @@ impl PreparedTable { i: usize, mut offset: usize, mut len: usize - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { assert!(i < self.num_columns()); assert!(offset + len <= self.num_rows()); if len == 0 { - return Ok(()) + return Ok(()); } let src_dt = self.writer_schema.field(i).data_type(); @@ -260,9 +241,10 @@ impl PreparedTable { if src_dt == target_dt { self.reader.read_column(dst, i, offset, len) } else { - let buf = self.buffers.entry(src_dt.clone()).or_insert_with(|| { - AnyBuilder::new(src_dt) - }); + let buf = self + .buffers + .entry(src_dt.clone()) + .or_insert_with(|| AnyBuilder::new(src_dt)); while len > 0 { let step_len = std::cmp::min(len, 1000); @@ -280,15 +262,13 @@ impl PreparedTable { } } - fn downcast_schema( schema: SchemaRef, block_number_columns: &[usize], item_index_columns: &[usize], block_number_type: DataType, item_index_type: DataType -) -> SchemaRef -{ +) -> SchemaRef { let mut patch = SchemaPatch::new(schema.clone()); for (columns, ty) in [ @@ -297,17 +277,15 @@ fn downcast_schema( ] { for idx in columns.iter().copied() { let f = schema.field(idx); - + let target_type = match f.data_type() { - DataType::List(f) => DataType::List( - Arc::new(Field::new(f.name(), ty.clone(), f.is_nullable())) - ), + DataType::List(f) => DataType::List(Arc::new(Field::new(f.name(), ty.clone(), f.is_nullable()))), _ => ty.clone() }; - + patch.set_field_type(idx, target_type) } } patch.finish() -} \ No newline at end of file +} diff --git a/crates/data-core/src/table_sort.rs b/crates/data-core/src/table_sort.rs index 764bc46a..b27cc93e 100644 --- a/crates/data-core/src/table_sort.rs +++ b/crates/data-core/src/table_sort.rs @@ -1,46 +1,48 @@ use arrow::datatypes::FieldRef; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::chunking::ChunkRange; -use sqd_array::io::file::{ArrayFile, ArrayFileWriter, FileReader}; -use sqd_array::reader::{AnyChunkedReader, ChunkedArrayReader}; -use sqd_array::slice::{AnyTableSlice, AsSlice, Slice}; -use sqd_array::sort::sort_table_to_indexes; -use sqd_array::util::{build_offsets, get_offset_position}; -use sqd_array::writer::ArrayWriter; - +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + chunking::ChunkRange, + io::file::{ArrayFile, ArrayFileWriter, FileReader}, + reader::{AnyChunkedReader, ChunkedArrayReader}, + slice::{AnyTableSlice, AsSlice, Slice}, + sort::sort_table_to_indexes, + util::{build_offsets, get_offset_position}, + writer::ArrayWriter +}; pub struct TableSorter { data_table: Vec, data_key: Vec, sort_table: Vec, sort_key: Vec, - batch_offsets: Vec, + batch_offsets: Vec } - impl TableSorter { pub fn new(fields: &[FieldRef], sort_key: Vec) -> anyhow::Result { assert!(sort_key.len() > 0); - let sort_table = sort_key.iter().map(|i| { - AnyBuilder::new(fields[*i].data_type()) - }).collect(); - - let data_key: Vec = (0..fields.len()) - .filter(|i| !sort_key.contains(i)) + let sort_table = sort_key + .iter() + .map(|i| AnyBuilder::new(fields[*i].data_type())) .collect(); - let data_table = data_key.iter().map(|i| { - let file = ArrayFile::new_temporary(fields[*i].data_type().clone())?; - file.write() - }).collect::>()?; + let data_key: Vec = (0..fields.len()).filter(|i| !sort_key.contains(i)).collect(); + + let data_table = data_key + .iter() + .map(|i| { + let file = ArrayFile::new_temporary(fields[*i].data_type().clone())?; + file.write() + }) + .collect::>()?; Ok(Self { data_table, data_key, sort_table, sort_key, - batch_offsets: vec![0], + batch_offsets: vec![0] }) } @@ -68,19 +70,24 @@ impl TableSorter { } pub fn finish(self) -> anyhow::Result { - let data_table = self.data_table.into_iter() + let data_table = self + .data_table + .into_iter() .map(|c| c.finish()) .collect::>>()?; let num_batches = self.batch_offsets.len() - 1; - let data_readers = data_table.iter().map(|c| { - let mut chunked = AnyChunkedReader::with_capacity(num_batches, c.data_type()); - for _ in 0..num_batches { - chunked.push(c.read()?); - } - Ok(chunked) - }).collect::>>()?; + let data_readers = data_table + .iter() + .map(|c| { + let mut chunked = AnyChunkedReader::with_capacity(num_batches, c.data_type()); + for _ in 0..num_batches { + chunked.push(c.read()?); + } + Ok(chunked) + }) + .collect::>>()?; Ok(SortedTable { len: self.sort_table[0].len(), @@ -90,12 +97,11 @@ impl TableSorter { sort_key: self.sort_key, batch_offsets: self.batch_offsets, order: None, - data_readers, + data_readers }) } } - pub struct SortedTable { data_table: Vec, data_key: Vec, @@ -107,12 +113,13 @@ pub struct SortedTable { len: usize } - impl SortedTable { pub fn into_sorter(mut self) -> anyhow::Result { drop(self.data_readers); - let data_table = self.data_table.into_iter() + let data_table = self + .data_table + .into_iter() .map(|c| c.write()) .collect::>()?; @@ -125,7 +132,7 @@ impl SortedTable { data_key: self.data_key, sort_table: self.sort_table, sort_key: self.sort_key, - batch_offsets: self.batch_offsets, + batch_offsets: self.batch_offsets }) } @@ -139,8 +146,7 @@ impl SortedTable { i: usize, offset: usize, len: usize - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { assert!(offset + len <= self.num_rows()); assert!( i < self.sort_key.len() + self.data_key.len(), @@ -159,7 +165,7 @@ impl SortedTable { if let Some(pos) = self.sort_key.iter().position(|c| *c == i) { self.sort_table[pos].as_slice().write_indexes( dst, - self.order.as_ref().unwrap().0[offset..offset + len].iter().copied(), + self.order.as_ref().unwrap().0[offset..offset + len].iter().copied() ) } else { let pos = self.data_key.iter().position(|c| *c == i).unwrap(); @@ -173,14 +179,9 @@ impl SortedTable { } fn prepare_order(&mut self) { - let sort_table = AnyTableSlice::new( - self.sort_table.iter().map(|c| c.as_slice()).collect() - ); + let sort_table = AnyTableSlice::new(self.sort_table.iter().map(|c| c.as_slice()).collect()); - let order = sort_table_to_indexes( - &sort_table, - &(0..sort_table.num_columns()).collect::>(), - ); + let order = sort_table_to_indexes(&sort_table, &(0..sort_table.num_columns()).collect::>()); let chunk_tracker = ChunkTracker::new(&self.batch_offsets, &order); @@ -188,15 +189,13 @@ impl SortedTable { } } - struct ChunkTracker { chunks: Vec, offsets: Vec, last_start_pos: usize, - last_end_pos: usize, + last_end_pos: usize } - impl ChunkTracker { fn new(batch_offsets: &[usize], order: &[usize]) -> Self { let chunks = ChunkRange::build_abs_order_list(batch_offsets, order); @@ -205,7 +204,7 @@ impl ChunkTracker { chunks, offsets, last_start_pos: 0, - last_end_pos: 0, + last_end_pos: 0 } } @@ -224,7 +223,7 @@ impl ChunkTracker { first_chunk = Some(ChunkRange { chunk: ch.chunk, offset: ch.offset + ch_offset, - len: std::cmp::min(ch.len - ch_offset, len), + len: std::cmp::min(ch.len - ch_offset, len) }); } @@ -235,7 +234,7 @@ impl ChunkTracker { Some(ChunkRange { chunk: ch.chunk, offset: ch.offset, - len, + len }) }), &[], @@ -257,7 +256,7 @@ impl ChunkTracker { Some(ChunkRange { chunk: ch.chunk, offset: ch.offset, - len: ch.len + end - self.offsets[ep + 1], + len: ch.len + end - self.offsets[ep + 1] }) ) } else { @@ -274,4 +273,4 @@ impl ChunkTracker { self.last_end_pos = get_offset_position(&self.offsets, index, self.last_end_pos); self.last_end_pos } -} \ No newline at end of file +} diff --git a/crates/data-core/tests/processor_reuse.rs b/crates/data-core/tests/processor_reuse.rs index 35b00149..22fa324a 100644 --- a/crates/data-core/tests/processor_reuse.rs +++ b/crates/data-core/tests/processor_reuse.rs @@ -1,8 +1,6 @@ -use sqd_array::builder::UInt32Builder; -use sqd_array::slice::AsSlice; +use sqd_array::{builder::UInt32Builder, slice::AsSlice}; use sqd_data_core::{table_builder, Downcast}; - table_builder! { TableBuilder { col: UInt32Builder, @@ -11,7 +9,6 @@ table_builder! { description(d) {} } - #[test] fn test_processor_reuse() -> anyhow::Result<()> { let mut table_builder = TableBuilder::new(); @@ -22,15 +19,15 @@ fn test_processor_reuse() -> anyhow::Result<()> { } processor.push_batch(&table_builder.as_slice())?; - + let mut prepared = processor.finish()?; let _ = prepared.read_record_batch(0, prepared.num_rows())?; - + processor = prepared.into_processor()?; processor.push_batch(&table_builder.as_slice())?; prepared = processor.finish()?; let _records = prepared.read_record_batch(0, prepared.num_rows())?; - + Ok(()) -} \ No newline at end of file +} diff --git a/crates/data-core/tests/sorter.rs b/crates/data-core/tests/sorter.rs index 934b9d1b..67866058 100644 --- a/crates/data-core/tests/sorter.rs +++ b/crates/data-core/tests/sorter.rs @@ -1,11 +1,9 @@ +use std::{fs::File, path::Path}; + use arrow::array::RecordBatchReader; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; -use sqd_array::builder::AnyTableBuilder; -use sqd_array::slice::AsSlice; +use sqd_array::{builder::AnyTableBuilder, slice::AsSlice}; use sqd_data_core::TableSorter; -use std::fs::File; -use std::path::Path; - #[test] fn sort_ethereum_transactions() -> anyhow::Result<()> { @@ -13,14 +11,10 @@ fn sort_ethereum_transactions() -> anyhow::Result<()> { let mut sorter = TableSorter::new( src_reader.schema().fields(), - [ - "sighash", - "to", - "block_number", - "transaction_index" - ].iter().map(|name| { - src_reader.schema().index_of(name).unwrap() - }).collect(), + ["sighash", "to", "block_number", "transaction_index"] + .iter() + .map(|name| src_reader.schema().index_of(name).unwrap()) + .collect() )?; for record_batch in src_reader { @@ -37,7 +31,7 @@ fn sort_ethereum_transactions() -> anyhow::Result<()> { let mut pos = 0; for record_batch in ref_reader { let record_batch = record_batch?; - + let mut builder = AnyTableBuilder::new(record_batch.schema()); for i in 0..builder.num_columns() { let dst = builder.column_writer(i); @@ -64,11 +58,9 @@ fn sort_ethereum_transactions() -> anyhow::Result<()> { Ok(()) } - fn open_parquet(path: &str) -> anyhow::Result> { let path = Path::new(env!("CARGO_MANIFEST_DIR")).join(path); let file = File::open(path)?; - let reader = ParquetRecordBatchReaderBuilder::try_new(file)? - .with_batch_size(4000); + let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.with_batch_size(4000); Ok(reader) } diff --git a/crates/data-source/src/lib.rs b/crates/data-source/src/lib.rs index 6cb39993..d0cdaf8a 100644 --- a/crates/data-source/src/lib.rs +++ b/crates/data-source/src/lib.rs @@ -2,7 +2,6 @@ mod map; mod standard; mod types; - pub use map::MappedDataSource; pub use standard::StandardDataSource; pub use types::*; diff --git a/crates/data-source/src/map.rs b/crates/data-source/src/map.rs index 34edc467..bbe560e4 100644 --- a/crates/data-source/src/map.rs +++ b/crates/data-source/src/map.rs @@ -1,26 +1,24 @@ -use crate::{DataEvent, DataSource}; +use std::{ + pin::Pin, + task::{Context, Poll} +}; + use futures::{Stream, StreamExt}; use sqd_primitives::{Block, BlockNumber}; -use std::pin::Pin; -use std::task::{Context, Poll}; +use crate::{DataEvent, DataSource}; pub struct MappedDataSource { inner: S, map: F } - impl MappedDataSource { pub fn new(inner: S, map: F) -> Self { - Self { - inner, - map - } + Self { inner, map } } } - impl DataSource for MappedDataSource where S: DataSource, @@ -42,7 +40,6 @@ where } } - impl Stream for MappedDataSource where S: DataSource, @@ -61,4 +58,4 @@ where fn size_hint(&self) -> (usize, Option) { self.inner.size_hint() } -} \ No newline at end of file +} diff --git a/crates/data-source/src/standard.rs b/crates/data-source/src/standard.rs index 2d7127c5..f896c2dd 100644 --- a/crates/data-source/src/standard.rs +++ b/crates/data-source/src/standard.rs @@ -1,17 +1,13 @@ -use crate::types::{DataEvent, DataSource}; +use std::{future::Future, pin::Pin, task::Poll, time::Duration}; + use anyhow::Context; -use futures::future::BoxFuture; -use futures::stream::BoxStream; -use futures::{FutureExt, Stream, StreamExt}; +use futures::{future::BoxFuture, stream::BoxStream, FutureExt, Stream, StreamExt}; use sqd_data_client::{BlockStreamRequest, BlockStreamResponse, DataClient}; use sqd_primitives::{Block, BlockNumber, BlockRef}; -use std::future::Future; -use std::pin::Pin; -use std::task::Poll; -use std::time::Duration; use tokio::time::Sleep; use tracing::warn; +use crate::types::{DataEvent, DataSource}; struct Endpoint { client: C, @@ -20,7 +16,6 @@ struct Endpoint { last_committed_block: Option } - enum EndpointState { Ready, Req { @@ -38,13 +33,11 @@ enum EndpointState { Backoff(Pin>) } - pub struct StandardDataSource { endpoints: Vec>, state: DataSourceState } - struct DataSourceState { parse: F, finalized_head: Option, @@ -54,13 +47,8 @@ struct DataSourceState { fork_consensus_timeout: Option>> } - impl DataSourceState { - fn poll_endpoint( - &mut self, - ep: &mut Endpoint, - cx: &mut std::task::Context<'_> - ) -> Poll> + fn poll_endpoint(&mut self, ep: &mut Endpoint, cx: &mut std::task::Context<'_>) -> Poll> where B: Block, C: DataClient, @@ -73,89 +61,70 @@ impl DataSourceState { req: self.position.clone(), future: ep.client.stream(self.position.clone()) } - }, - EndpointState::Req { req, future } => { - match future.poll_unpin(cx) { - Poll::Ready(Ok(BlockStreamResponse::Stream { - finalized_head, - blocks - })) => { - let finalized_head_updated = self.on_new_finalized_head( - finalized_head.as_ref() - ); - - ep.error_counter = 0; - ep.state = EndpointState::Stream { - finalized_head: finalized_head.as_ref().map_or(0, |b| b.number), - blocks - }; - - if finalized_head_updated { - return Poll::Ready(DataEvent::FinalizedHead( - finalized_head.unwrap() - )) - } - }, - Poll::Ready(Ok(BlockStreamResponse::Fork(prev_blocks))) => { - ep.error_counter = 0; - ep.state = EndpointState::Fork { - req: req.clone(), - prev_blocks - }; - }, - Poll::Ready(Err(err)) => ep.on_error(err), - Poll::Pending => return Poll::Pending + } + EndpointState::Req { req, future } => match future.poll_unpin(cx) { + Poll::Ready(Ok(BlockStreamResponse::Stream { finalized_head, blocks })) => { + let finalized_head_updated = self.on_new_finalized_head(finalized_head.as_ref()); + + ep.error_counter = 0; + ep.state = EndpointState::Stream { + finalized_head: finalized_head.as_ref().map_or(0, |b| b.number), + blocks + }; + + if finalized_head_updated { + return Poll::Ready(DataEvent::FinalizedHead(finalized_head.unwrap())); + } + } + Poll::Ready(Ok(BlockStreamResponse::Fork(prev_blocks))) => { + ep.error_counter = 0; + ep.state = EndpointState::Fork { + req: req.clone(), + prev_blocks + }; } + Poll::Ready(Err(err)) => ep.on_error(err), + Poll::Pending => return Poll::Pending }, - EndpointState::Stream { - finalized_head, - blocks - } => { - match blocks.poll_next_unpin(cx) { - Poll::Ready(None) => { - ep.error_counter = 0; - ep.state = EndpointState::Ready; - let prev_block = self.position.first_block.saturating_sub(1); - if prev_block >= self.max_seen_finalized_block && ep.last_committed_block == Some(prev_block) { - return Poll::Ready(DataEvent::MaybeOnHead) - } - }, - Poll::Ready(Some(Ok(new_block))) => { - match (self.parse)(new_block).context("failed to parse a block") { - Ok(block) => { - ep.error_counter = 0; - if block.number() >= self.position.first_block { - let is_final = *finalized_head >= block.number(); - if self.accept_new_block(&block, is_final) { - ep.last_committed_block = Some(block.number()); - return Poll::Ready(DataEvent::Block { - block, - is_final - }) - } else { - ep.state = EndpointState::Ready; - } + EndpointState::Stream { finalized_head, blocks } => match blocks.poll_next_unpin(cx) { + Poll::Ready(None) => { + ep.error_counter = 0; + ep.state = EndpointState::Ready; + let prev_block = self.position.first_block.saturating_sub(1); + if prev_block >= self.max_seen_finalized_block && ep.last_committed_block == Some(prev_block) { + return Poll::Ready(DataEvent::MaybeOnHead); + } + } + Poll::Ready(Some(Ok(new_block))) => { + match (self.parse)(new_block).context("failed to parse a block") { + Ok(block) => { + ep.error_counter = 0; + if block.number() >= self.position.first_block { + let is_final = *finalized_head >= block.number(); + if self.accept_new_block(&block, is_final) { + ep.last_committed_block = Some(block.number()); + return Poll::Ready(DataEvent::Block { block, is_final }); + } else { + ep.state = EndpointState::Ready; } - }, - Err(err) => ep.on_error(err), + } } - }, - Poll::Ready(Some(Err(err))) => ep.on_error(err), - Poll::Pending => return Poll::Pending + Err(err) => ep.on_error(err) + } } + Poll::Ready(Some(Err(err))) => ep.on_error(err), + Poll::Pending => return Poll::Pending }, EndpointState::Fork { req, .. } => { if req == &self.position { - return Poll::Pending + return Poll::Pending; } else { ep.state = EndpointState::Ready; } - }, - EndpointState::Backoff(sleep) => { - match sleep.as_mut().poll(cx) { - Poll::Ready(_) => ep.state = EndpointState::Ready, - Poll::Pending => return Poll::Pending - } + } + EndpointState::Backoff(sleep) => match sleep.as_mut().poll(cx) { + Poll::Ready(_) => ep.state = EndpointState::Ready, + Poll::Pending => return Poll::Pending } } } @@ -166,7 +135,7 @@ impl DataSourceState { if let Some(parent_hash) = self.position.parent_block_hash.as_mut() { if block.parent_hash() != parent_hash { - return false + return false; } parent_hash.clear(); parent_hash.push_str(block.hash()); @@ -185,29 +154,25 @@ impl DataSourceState { } fn on_new_finalized_head(&mut self, new_head: Option<&BlockRef>) -> bool { - let Some(new_head) = new_head else { - return false - }; + let Some(new_head) = new_head else { return false }; - self.max_seen_finalized_block = std::cmp::max( - self.max_seen_finalized_block, - new_head.number - ); + self.max_seen_finalized_block = std::cmp::max(self.max_seen_finalized_block, new_head.number); if self.position.first_block == 0 { - return false + return false; } let Some(current_parent_hash) = self.position.parent_block_hash.as_ref() else { - return false + return false; }; - let is_behind = self.finalized_head + let is_behind = self + .finalized_head .as_ref() .map_or(false, |c| c.number >= new_head.number); if is_behind { - return false + return false; } let mut new_number = new_head.number; @@ -215,7 +180,7 @@ impl DataSourceState { if new_head.number >= self.position.first_block { if !self.position_is_canonical { - return false + return false; } new_number = self.position.first_block - 1; new_hash = current_parent_hash; @@ -227,7 +192,6 @@ impl DataSourceState { } } - fn set_head(head: &mut Option, number: BlockNumber, hash: &str) { if let Some(current) = head.as_mut() { current.number = number; @@ -241,7 +205,6 @@ fn set_head(head: &mut Option, number: BlockNumber, hash: &str) { } } - impl Endpoint { fn is_on_fork(&self) -> bool { match self.state { @@ -284,7 +247,6 @@ impl Endpoint { } } - impl StandardDataSource where B: Block, @@ -292,14 +254,15 @@ where F: Fn(C::Block) -> anyhow::Result { pub fn new(clients: Vec, parse: F) -> Self { - let endpoints = clients.into_iter().map(|client| { - Endpoint { + let endpoints = clients + .into_iter() + .map(|client| Endpoint { client, error_counter: 0, state: EndpointState::Ready, last_committed_block: None - } - }).collect(); + }) + .collect(); let state = DataSourceState { parse, @@ -313,28 +276,24 @@ where fork_consensus_timeout: None }; - Self { - endpoints, - state - } + Self { endpoints, state } } fn poll_next_event(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { for ep in self.endpoints.iter_mut() { let event = self.state.poll_endpoint(ep, cx); if event.is_ready() { - return event + return event; } } let forks = self.endpoints.iter().filter(|ep| ep.is_on_fork()).count(); if forks > 0 { - if - forks > self.endpoints.len() / 2 || - forks == self.endpoints.iter().filter(|ep| ep.is_active()).count() || - self.fork_consensus_timeout(cx) + if forks > self.endpoints.len() / 2 + || forks == self.endpoints.iter().filter(|ep| ep.is_active()).count() + || self.fork_consensus_timeout(cx) { - return Poll::Ready(DataEvent::Fork(self.extract_fork())) + return Poll::Ready(DataEvent::Fork(self.extract_fork())); } } else { self.state.fork_consensus_timeout = None @@ -344,13 +303,12 @@ where } fn fork_consensus_timeout(&mut self, cx: &mut std::task::Context<'_>) -> bool { - let mut timeout = self.state + let mut timeout = self + .state .fork_consensus_timeout .take() - .unwrap_or_else(|| { - Box::pin(tokio::time::sleep(Duration::from_secs(2))) - }); - + .unwrap_or_else(|| Box::pin(tokio::time::sleep(Duration::from_secs(2)))); + if timeout.poll_unpin(cx) == Poll::Pending { self.state.fork_consensus_timeout = Some(timeout); false @@ -358,7 +316,7 @@ where true } } - + fn extract_fork(&mut self) -> Vec { self.state.fork_consensus_timeout = None; let mut chain = Vec::new(); @@ -377,7 +335,6 @@ where } } - impl Stream for StandardDataSource where B: Block, @@ -391,7 +348,6 @@ where } } - impl DataSource for StandardDataSource where B: Block, @@ -418,4 +374,4 @@ where fn get_parent_block_hash(&self) -> Option<&str> { self.state.position.parent_block_hash.as_deref() } -} \ No newline at end of file +} diff --git a/crates/data-source/src/types.rs b/crates/data-source/src/types.rs index ff03fbfa..65a42421 100644 --- a/crates/data-source/src/types.rs +++ b/crates/data-source/src/types.rs @@ -1,18 +1,13 @@ use futures::Stream; use sqd_primitives::{Block, BlockNumber, BlockRef}; - pub enum DataEvent { FinalizedHead(BlockRef), - Block { - block: B, - is_final: bool - }, + Block { block: B, is_final: bool }, Fork(Vec), MaybeOnHead } - pub trait DataSource: Stream> + Unpin { type Block: Block; @@ -23,7 +18,6 @@ pub trait DataSource: Stream> + Unpin { fn get_parent_block_hash(&self) -> Option<&str>; } - impl DataEvent { pub fn map(self, f: impl FnOnce(B, bool) -> T) -> DataEvent { match self { @@ -36,4 +30,4 @@ impl DataEvent { DataEvent::MaybeOnHead => DataEvent::MaybeOnHead } } -} \ No newline at end of file +} diff --git a/crates/data/src/bitcoin/mod.rs b/crates/data/src/bitcoin/mod.rs index 7e26db9b..a0c66c0a 100644 --- a/crates/data/src/bitcoin/mod.rs +++ b/crates/data/src/bitcoin/mod.rs @@ -1,2 +1,2 @@ pub mod model; -pub mod tables; \ No newline at end of file +pub mod tables; diff --git a/crates/data/src/bitcoin/model.rs b/crates/data/src/bitcoin/model.rs index 57e59b2f..7055019b 100644 --- a/crates/data/src/bitcoin/model.rs +++ b/crates/data/src/bitcoin/model.rs @@ -1,7 +1,8 @@ -use crate::types::HexBytes; use serde::Deserialize; use sqd_primitives::BlockNumber; +use crate::types::HexBytes; + #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct BlockHeader { @@ -19,14 +20,14 @@ pub struct BlockHeader { pub chain_work: HexBytes, pub stripped_size: u64, pub size: u64, - pub weight: u64, + pub weight: u64 } #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct ScriptSig { pub hex: HexBytes, - pub asm: Option, + pub asm: Option } #[derive(Deserialize)] @@ -37,7 +38,7 @@ pub struct ScriptPubKey { pub desc: Option, #[serde(rename = "type")] pub type_: Option, - pub address: Option, + pub address: Option } #[derive(Deserialize)] @@ -46,7 +47,7 @@ pub struct Prevout { pub generated: bool, pub height: BlockNumber, pub value: f64, - pub script_pub_key: ScriptPubKey, + pub script_pub_key: ScriptPubKey } #[derive(Deserialize)] @@ -57,7 +58,7 @@ pub struct TransactionInputTx { pub script_sig: ScriptSig, pub sequence: u32, pub tx_in_witness: Option>, - pub prevout: Option, + pub prevout: Option } #[derive(Deserialize)] @@ -65,7 +66,7 @@ pub struct TransactionInputTx { pub struct TransactionInputCoinbase { pub coinbase: HexBytes, pub sequence: u32, - pub tx_in_witness: Option>, + pub tx_in_witness: Option> } #[derive(Deserialize)] @@ -74,7 +75,7 @@ pub enum TransactionInput { #[serde(rename = "tx")] Tx(TransactionInputTx), #[serde(rename = "coinbase")] - Coinbase(TransactionInputCoinbase), + Coinbase(TransactionInputCoinbase) } #[derive(Deserialize)] @@ -82,7 +83,7 @@ pub enum TransactionInput { pub struct TransactionOutput { pub value: f64, pub n: u32, - pub script_pub_key: ScriptPubKey, + pub script_pub_key: ScriptPubKey } #[derive(Deserialize)] @@ -97,14 +98,14 @@ pub struct Transaction { pub version: u32, pub locktime: u32, pub vin: Vec, - pub vout: Vec, + pub vout: Vec } #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Block { pub header: BlockHeader, - pub transactions: Vec, + pub transactions: Vec } impl sqd_primitives::Block for Block { diff --git a/crates/data/src/bitcoin/tables/block.rs b/crates/data/src/bitcoin/tables/block.rs index 4ae5ed89..17d5a86f 100644 --- a/crates/data/src/bitcoin/tables/block.rs +++ b/crates/data/src/bitcoin/tables/block.rs @@ -1,10 +1,7 @@ -use crate::bitcoin::model::BlockHeader; -use crate::bitcoin::tables::common::*; -use sqd_array::builder::{ - Float64Builder, TimestampSecondBuilder, UInt32Builder, UInt64Builder, -}; +use sqd_array::builder::{Float64Builder, TimestampSecondBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::bitcoin::{model::BlockHeader, tables::common::*}; table_builder! { BlockBuilder { @@ -33,7 +30,6 @@ table_builder! { } } - impl BlockBuilder { pub fn push(&mut self, row: &BlockHeader) { self.number.append(row.number); diff --git a/crates/data/src/bitcoin/tables/input.rs b/crates/data/src/bitcoin/tables/input.rs index 69b9488c..ad5c53bf 100644 --- a/crates/data/src/bitcoin/tables/input.rs +++ b/crates/data/src/bitcoin/tables/input.rs @@ -1,10 +1,10 @@ -use crate::bitcoin::model::{Block, TransactionInput}; -use crate::bitcoin::tables::common::*; -use sqd_array::builder::{ - BooleanBuilder, Float64Builder, StringBuilder, UInt32Builder, UInt64Builder, -}; +use sqd_array::builder::{BooleanBuilder, Float64Builder, StringBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::bitcoin::{ + model::{Block, TransactionInput}, + tables::common::* +}; table_builder! { InputBuilder { @@ -47,15 +47,8 @@ table_builder! { } } - impl InputBuilder { - pub fn push( - &mut self, - block: &Block, - transaction_index: u32, - input_index: u32, - row: &TransactionInput, - ) { + pub fn push(&mut self, block: &Block, transaction_index: u32, input_index: u32, row: &TransactionInput) { self.block_number.append(block.header.number); self.transaction_index.append(transaction_index); self.input_index.append(input_index); @@ -66,8 +59,7 @@ impl InputBuilder { self.txid.append(&tx_input.txid); self.vout.append(tx_input.vout); self.script_sig_hex.append(&tx_input.script_sig.hex); - self.script_sig_asm - .append_option(tx_input.script_sig.asm.as_deref()); + self.script_sig_asm.append_option(tx_input.script_sig.asm.as_deref()); self.sequence.append(tx_input.sequence); self.coinbase.append_option(None::<&str>); @@ -81,8 +73,7 @@ impl InputBuilder { self.prevout_generated.append(prevout.generated); self.prevout_height.append(prevout.height); self.prevout_value.append(prevout.value); - self.prevout_script_pub_key_hex - .append(&prevout.script_pub_key.hex); + self.prevout_script_pub_key_hex.append(&prevout.script_pub_key.hex); self.prevout_script_pub_key_asm .append_option(prevout.script_pub_key.asm.as_deref()); self.prevout_script_pub_key_desc @@ -99,8 +90,7 @@ impl InputBuilder { self.prevout_script_pub_key_asm.append_option(None::<&str>); self.prevout_script_pub_key_desc.append_option(None::<&str>); self.prevout_script_pub_key_type.append_option(None::<&str>); - self.prevout_script_pub_key_address - .append_option(None::<&str>); + self.prevout_script_pub_key_address.append_option(None::<&str>); } } TransactionInput::Coinbase(cb_input) => { @@ -125,8 +115,7 @@ impl InputBuilder { self.prevout_script_pub_key_asm.append_option(None::<&str>); self.prevout_script_pub_key_desc.append_option(None::<&str>); self.prevout_script_pub_key_type.append_option(None::<&str>); - self.prevout_script_pub_key_address - .append_option(None::<&str>); + self.prevout_script_pub_key_address.append_option(None::<&str>); } } } diff --git a/crates/data/src/bitcoin/tables/mod.rs b/crates/data/src/bitcoin/tables/mod.rs index df05c92a..2e0c8e4b 100644 --- a/crates/data/src/bitcoin/tables/mod.rs +++ b/crates/data/src/bitcoin/tables/mod.rs @@ -7,10 +7,10 @@ mod transaction; pub use block::*; pub use input::*; pub use output::*; +use sqd_data_core::chunk_builder; pub use transaction::*; use super::model::Block; -use sqd_data_core::chunk_builder; chunk_builder! { BitcoinChunkBuilder { diff --git a/crates/data/src/bitcoin/tables/output.rs b/crates/data/src/bitcoin/tables/output.rs index 68565c5e..f1b1c79a 100644 --- a/crates/data/src/bitcoin/tables/output.rs +++ b/crates/data/src/bitcoin/tables/output.rs @@ -1,8 +1,10 @@ -use crate::bitcoin::model::{Block, TransactionOutput}; -use crate::bitcoin::tables::common::*; use sqd_array::builder::{Float64Builder, StringBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::bitcoin::{ + model::{Block, TransactionOutput}, + tables::common::* +}; table_builder! { OutputBuilder { @@ -31,21 +33,14 @@ table_builder! { } } - impl OutputBuilder { - pub fn push( - &mut self, - block: &Block, - transaction_index: u32, - row: &TransactionOutput, - ) { + pub fn push(&mut self, block: &Block, transaction_index: u32, row: &TransactionOutput) { self.block_number.append(block.header.number); self.transaction_index.append(transaction_index); self.output_index.append(row.n); self.value.append(row.value); self.script_pub_key_hex.append(&row.script_pub_key.hex); - self.script_pub_key_asm - .append_option(row.script_pub_key.asm.as_deref()); + self.script_pub_key_asm.append_option(row.script_pub_key.asm.as_deref()); self.script_pub_key_desc .append_option(row.script_pub_key.desc.as_deref()); self.script_pub_key_type diff --git a/crates/data/src/bitcoin/tables/transaction.rs b/crates/data/src/bitcoin/tables/transaction.rs index 9fb4a143..84510a5e 100644 --- a/crates/data/src/bitcoin/tables/transaction.rs +++ b/crates/data/src/bitcoin/tables/transaction.rs @@ -1,8 +1,10 @@ -use crate::bitcoin::model::{Block, Transaction}; -use crate::bitcoin::tables::common::*; use sqd_array::builder::{UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::bitcoin::{ + model::{Block, Transaction}, + tables::common::* +}; table_builder! { TransactionBuilder { @@ -28,7 +30,6 @@ table_builder! { } } - impl TransactionBuilder { pub fn push(&mut self, block: &Block, transaction_index: u32, row: &Transaction) { self.block_number.append(block.header.number); diff --git a/crates/data/src/evm/mod.rs b/crates/data/src/evm/mod.rs index 7e26db9b..a0c66c0a 100644 --- a/crates/data/src/evm/mod.rs +++ b/crates/data/src/evm/mod.rs @@ -1,2 +1,2 @@ pub mod model; -pub mod tables; \ No newline at end of file +pub mod tables; diff --git a/crates/data/src/evm/model.rs b/crates/data/src/evm/model.rs index 35a99800..dc575998 100644 --- a/crates/data/src/evm/model.rs +++ b/crates/data/src/evm/model.rs @@ -1,14 +1,15 @@ -use crate::types::HexBytes; use serde::Deserialize; use sqd_primitives::{BlockNumber, DataMask, ItemIndex}; +use crate::types::HexBytes; + #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Withdrawal { pub address: HexBytes, pub amount: HexBytes, pub index: HexBytes, - pub validator_index: HexBytes, + pub validator_index: HexBytes } #[derive(Deserialize)] @@ -44,7 +45,7 @@ pub struct BlockHeader { // Tempo-specific block header fields pub main_block_general_gas_limit: Option, pub shared_gas_limit: Option, - pub timestamp_millis_part: Option, + pub timestamp_millis_part: Option } #[derive(Deserialize)] @@ -52,11 +53,11 @@ pub struct BlockHeader { pub struct EIP7702Authorization { pub chain_id: HexBytes, pub address: HexBytes, - #[serde(deserialize_with="sqd_data_core::serde::decode_string")] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string")] pub nonce: u64, pub y_parity: u8, pub r: HexBytes, - pub s: HexBytes, + pub s: HexBytes } #[derive(Deserialize)] @@ -64,7 +65,7 @@ pub struct EIP7702Authorization { pub struct TempoCall { pub to: Option, pub value: HexBytes, - pub input: HexBytes, + pub input: HexBytes } #[derive(Deserialize)] @@ -75,7 +76,7 @@ pub enum TempoPrimitiveSignature { r: HexBytes, s: HexBytes, y_parity: Option, - v: Option, + v: Option }, #[serde(rename = "p256", rename_all = "camelCase")] P256 { @@ -83,7 +84,7 @@ pub enum TempoPrimitiveSignature { s: HexBytes, pub_key_x: HexBytes, pub_key_y: HexBytes, - pre_hash: bool, + pre_hash: bool }, #[serde(rename = "webAuthn", rename_all = "camelCase")] WebAuthn { @@ -91,8 +92,8 @@ pub enum TempoPrimitiveSignature { s: HexBytes, pub_key_x: HexBytes, pub_key_y: HexBytes, - webauthn_data: HexBytes, - }, + webauthn_data: HexBytes + } } #[derive(Deserialize)] @@ -100,14 +101,14 @@ pub enum TempoPrimitiveSignature { pub struct TempoKeychainSignature { pub user_address: HexBytes, pub signature: TempoPrimitiveSignature, - pub version: Option, + pub version: Option } #[derive(Deserialize)] #[serde(untagged)] pub enum TempoSignature { Keychain(TempoKeychainSignature), - Primitive(TempoPrimitiveSignature), + Primitive(TempoPrimitiveSignature) } #[derive(Deserialize)] @@ -116,14 +117,14 @@ pub struct TempoSignedAuthorization { pub chain_id: HexBytes, pub address: HexBytes, pub nonce: u64, - pub signature: TempoSignature, + pub signature: TempoSignature } #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct TempoTokenLimit { pub token: HexBytes, - pub limit: HexBytes, + pub limit: HexBytes } #[derive(Deserialize)] @@ -134,7 +135,7 @@ pub struct TempoSignedKeyAuthorization { pub key_id: HexBytes, pub expiry: Option, pub limits: Option>, - pub signature: TempoPrimitiveSignature, + pub signature: TempoPrimitiveSignature } #[derive(Deserialize)] @@ -142,14 +143,14 @@ pub struct TempoSignedKeyAuthorization { pub struct TempoFeePayerSignature { pub v: u8, pub r: HexBytes, - pub s: HexBytes, + pub s: HexBytes } #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct AccessListItem { pub address: HexBytes, - pub storage_keys: Vec, + pub storage_keys: Vec } #[derive(Deserialize)] @@ -207,7 +208,7 @@ pub struct Transaction { pub l1_fee: Option, pub l1_fee_scalar: Option, pub l1_gas_price: Option, - pub l1_gas_used: Option, + pub l1_gas_used: Option } #[derive(Deserialize)] @@ -239,7 +240,7 @@ pub struct TraceActionCall { pub value: Option, pub gas: HexBytes, pub input: HexBytes, - pub call_type: String, + pub call_type: String } #[derive(Deserialize)] @@ -247,7 +248,7 @@ pub struct TraceActionCall { pub struct TraceActionReward { pub author: HexBytes, pub value: HexBytes, - pub reward_type: String, + pub reward_type: String } #[derive(Deserialize)] @@ -263,7 +264,7 @@ pub struct TraceActionSelfDestruct { pub struct TraceResultCreate { pub gas_used: HexBytes, pub code: Option, - pub address: Option, + pub address: Option } #[derive(Deserialize)] @@ -299,13 +300,9 @@ pub enum TraceOp { result: Option }, #[serde(rename = "selfdestruct")] - SelfDestruct { - action: TraceActionSelfDestruct - }, + SelfDestruct { action: TraceActionSelfDestruct }, #[serde(rename = "reward")] - Reward { - action: TraceActionReward - } + Reward { action: TraceActionReward } } #[derive(Deserialize)] @@ -326,7 +323,7 @@ pub struct Block { pub transactions: Vec, pub logs: Option>, pub traces: Option>, - pub state_diffs: Option>, + pub state_diffs: Option> } impl sqd_primitives::Block for Block { @@ -372,4 +369,4 @@ impl sqd_primitives::Block for Block { _ => true } } -} \ No newline at end of file +} diff --git a/crates/data/src/evm/tables/block.rs b/crates/data/src/evm/tables/block.rs index 89f784c6..92dcce45 100644 --- a/crates/data/src/evm/tables/block.rs +++ b/crates/data/src/evm/tables/block.rs @@ -1,8 +1,7 @@ -use crate::evm::model::BlockHeader; -use crate::evm::tables::common::*; use sqd_array::builder::{ListBuilder, TimestampSecondBuilder, UInt64Builder}; use sqd_data_core::{struct_builder, table_builder}; +use crate::evm::{model::BlockHeader, tables::common::*}; type WithdrawalListBuilder = ListBuilder; struct_builder! { @@ -14,7 +13,6 @@ struct_builder! { } } - table_builder! { BlockBuilder { number: UInt64Builder, @@ -62,7 +60,6 @@ table_builder! { } } - impl BlockBuilder { pub fn push(&mut self, row: &BlockHeader) { self.number.append(row.number); @@ -103,12 +100,15 @@ impl BlockBuilder { self.withdrawals_root.append_option(row.withdrawals_root.as_deref()); self.blob_gas_used.append_option(row.blob_gas_used.as_deref()); self.excess_blob_gas.append_option(row.excess_blob_gas.as_deref()); - self.parent_beacon_block_root.append_option(row.parent_beacon_block_root.as_deref()); + self.parent_beacon_block_root + .append_option(row.parent_beacon_block_root.as_deref()); self.requests_hash.append_option(row.requests_hash.as_deref()); self.l1_block_number.append_option(row.l1_block_number); - self.main_block_general_gas_limit.append_option(row.main_block_general_gas_limit.as_deref()); + self.main_block_general_gas_limit + .append_option(row.main_block_general_gas_limit.as_deref()); self.shared_gas_limit.append_option(row.shared_gas_limit.as_deref()); - self.timestamp_millis_part.append_option(row.timestamp_millis_part.as_deref()); + self.timestamp_millis_part + .append_option(row.timestamp_millis_part.as_deref()); self.extra_data_size.append(row.extra_data.len() as u64); diff --git a/crates/data/src/evm/tables/common.rs b/crates/data/src/evm/tables/common.rs index 81c3b886..5aa5043f 100644 --- a/crates/data/src/evm/tables/common.rs +++ b/crates/data/src/evm/tables/common.rs @@ -1,13 +1,11 @@ use sqd_array::builder::{ListBuilder, StringBuilder, UInt32Builder}; - pub type HexBytesBuilder = StringBuilder; pub type BlobHashesListBuilder = ListBuilder; pub type TraceAddressListBuilder = ListBuilder; pub type UncleListBuilder = ListBuilder; pub type StorageKeyListBuilder = ListBuilder; - pub fn sighash(bytes: &str) -> Option<&str> { - (bytes.len() >= 10).then(|| { &bytes[0..10] }) -} \ No newline at end of file + (bytes.len() >= 10).then(|| &bytes[0..10]) +} diff --git a/crates/data/src/evm/tables/logs.rs b/crates/data/src/evm/tables/logs.rs index 3faaeb01..d34603d8 100644 --- a/crates/data/src/evm/tables/logs.rs +++ b/crates/data/src/evm/tables/logs.rs @@ -1,9 +1,8 @@ -use crate::evm::model::{Block, Log}; -use sqd_array::builder::{UInt64Builder, UInt32Builder}; +use sqd_array::builder::{UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; use super::common::HexBytesBuilder; - +use crate::evm::model::{Block, Log}; table_builder! { LogBuilder { diff --git a/crates/data/src/evm/tables/mod.rs b/crates/data/src/evm/tables/mod.rs index 4988a108..5d855f84 100644 --- a/crates/data/src/evm/tables/mod.rs +++ b/crates/data/src/evm/tables/mod.rs @@ -1,19 +1,18 @@ -mod common; mod block; -mod transaction; +mod common; mod logs; mod state_diff; mod trace; +mod transaction; pub use block::*; -pub use transaction::*; pub use logs::*; +use sqd_data_core::chunk_builder; pub use state_diff::*; pub use trace::*; +pub use transaction::*; use super::model::Block; -use sqd_data_core::chunk_builder; - chunk_builder! { EvmChunkBuilder { @@ -25,7 +24,6 @@ chunk_builder! { } } - impl sqd_data_core::BlockChunkBuilder for EvmChunkBuilder { type Block = Block; @@ -49,4 +47,4 @@ impl sqd_data_core::BlockChunkBuilder for EvmChunkBuilder { } Ok(()) } -} \ No newline at end of file +} diff --git a/crates/data/src/evm/tables/state_diff.rs b/crates/data/src/evm/tables/state_diff.rs index 3739390e..bd0ea05a 100644 --- a/crates/data/src/evm/tables/state_diff.rs +++ b/crates/data/src/evm/tables/state_diff.rs @@ -1,9 +1,8 @@ -use crate::evm::model::{Block, StateDiff}; -use sqd_array::builder::{StringBuilder, UInt64Builder, UInt32Builder}; +use sqd_array::builder::{StringBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; use super::common::HexBytesBuilder; - +use crate::evm::model::{Block, StateDiff}; table_builder! { StateDiffBuilder { diff --git a/crates/data/src/evm/tables/trace.rs b/crates/data/src/evm/tables/trace.rs index 8095be81..9f87fb8f 100644 --- a/crates/data/src/evm/tables/trace.rs +++ b/crates/data/src/evm/tables/trace.rs @@ -1,8 +1,8 @@ -use super::common::{sighash, HexBytesBuilder, TraceAddressListBuilder}; -use crate::evm::model::{Block, Trace, TraceOp}; use sqd_array::builder::{StringBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use super::common::{sighash, HexBytesBuilder, TraceAddressListBuilder}; +use crate::evm::model::{Block, Trace, TraceOp}; table_builder! { TraceBuilder { @@ -90,7 +90,8 @@ impl TraceBuilder { self.create_result_gas_used.append(&res.gas_used); self.create_result_code.append_option(res.code.as_deref()); self.create_result_address.append_option(res.address.as_deref()); - self.create_result_code_size.append(res.code.as_ref().map_or(0, |v| v.len()) as u64); + self.create_result_code_size + .append(res.code.as_ref().map_or(0, |v| v.len()) as u64); } else { self.create_result_gas_used.append_null(); self.create_result_code.append_null(); @@ -122,7 +123,8 @@ impl TraceBuilder { if let Some(res) = result { self.call_result_gas_used.append_option(res.gas_used.as_deref()); self.call_result_output.append_option(res.output.as_deref()); - self.call_result_output_size.append(res.output.as_ref().map_or(0, |v| v.len()) as u64); + self.call_result_output_size + .append(res.output.as_ref().map_or(0, |v| v.len()) as u64); } else { self.call_result_gas_used.append_null(); self.call_result_output.append_null(); @@ -153,7 +155,7 @@ impl TraceBuilder { self.reward_type.append_null(); } - if let TraceOp::SelfDestruct { action} = &row.op { + if let TraceOp::SelfDestruct { action } = &row.op { self.r#type.append("suicide"); self.suicide_address.append_option(action.address.as_deref()); self.suicide_refund_address.append(&action.refund_address); @@ -164,4 +166,4 @@ impl TraceBuilder { self.suicide_balance.append_option(None); } } -} \ No newline at end of file +} diff --git a/crates/data/src/evm/tables/transaction.rs b/crates/data/src/evm/tables/transaction.rs index 61953470..75d7e846 100644 --- a/crates/data/src/evm/tables/transaction.rs +++ b/crates/data/src/evm/tables/transaction.rs @@ -1,8 +1,12 @@ -use crate::evm::model::{Block, Transaction, TempoPrimitiveSignature, TempoSignature, TempoKeychainSignature}; -use crate::evm::tables::common::*; -use sqd_array::builder::{BooleanBuilder, ListBuilder, StringBuilder, UInt32Builder, UInt64Builder, UInt8Builder, Float64Builder}; +use sqd_array::builder::{ + BooleanBuilder, Float64Builder, ListBuilder, StringBuilder, UInt32Builder, UInt64Builder, UInt8Builder +}; use sqd_data_core::{struct_builder, table_builder}; +use crate::evm::{ + model::{Block, TempoKeychainSignature, TempoPrimitiveSignature, TempoSignature, Transaction}, + tables::common::* +}; type EIP7702AuthorizationListBuilder = ListBuilder; struct_builder! { @@ -88,7 +92,6 @@ struct_builder! { } } - table_builder! { TransactionBuilder { block_number: UInt64Builder, @@ -165,7 +168,6 @@ table_builder! { } } - fn push_prim_sig(b: &mut TempoPrimSigBuilder, sig: &TempoPrimitiveSignature) { match sig { TempoPrimitiveSignature::Secp256k1 { r, s, y_parity, v } => { @@ -179,7 +181,13 @@ fn push_prim_sig(b: &mut TempoPrimSigBuilder, sig: &TempoPrimitiveSignature) { b.pre_hash.append_option(None); b.webauthn_data.append_null(); } - TempoPrimitiveSignature::P256 { r, s, pub_key_x, pub_key_y, pre_hash } => { + TempoPrimitiveSignature::P256 { + r, + s, + pub_key_x, + pub_key_y, + pre_hash + } => { b.sig_type.append("p256"); b.r.append(r); b.s.append(s); @@ -190,7 +198,13 @@ fn push_prim_sig(b: &mut TempoPrimSigBuilder, sig: &TempoPrimitiveSignature) { b.pre_hash.append(*pre_hash); b.webauthn_data.append_null(); } - TempoPrimitiveSignature::WebAuthn { r, s, pub_key_x, pub_key_y, webauthn_data } => { + TempoPrimitiveSignature::WebAuthn { + r, + s, + pub_key_x, + pub_key_y, + webauthn_data + } => { b.sig_type.append("webAuthn"); b.r.append(r); b.s.append(s); @@ -225,7 +239,11 @@ fn push_tempo_sig(b: &mut TempoSigBuilder, sig: &TempoSignature) { b.user_address.append_null(); b.version.append_null(); } - TempoSignature::Keychain(TempoKeychainSignature { user_address, signature, version }) => { + TempoSignature::Keychain(TempoKeychainSignature { + user_address, + signature, + version + }) => { push_prim_sig(&mut b.primitive, signature); b.user_address.append(user_address); b.version.append_option(version.as_deref()); @@ -241,7 +259,6 @@ fn push_tempo_sig_null(b: &mut TempoSigBuilder) { b.append_null(); } - impl TransactionBuilder { pub fn push(&mut self, block: &Block, row: &Transaction) { self.block_number.append(block.header.number); @@ -256,7 +273,8 @@ impl TransactionBuilder { self.gas.append(&row.gas); self.gas_price.append_option(row.gas_price.as_deref()); self.max_fee_per_gas.append_option(row.max_fee_per_gas.as_deref()); - self.max_priority_fee_per_gas.append_option(row.max_priority_fee_per_gas.as_deref()); + self.max_priority_fee_per_gas + .append_option(row.max_priority_fee_per_gas.as_deref()); self.v.append_option(row.v.as_deref()); self.r.append_option(row.r.as_deref()); self.s.append_option(row.s.as_deref()); @@ -276,7 +294,8 @@ impl TransactionBuilder { self.access_list.append(); self.chain_id.append_option(row.chain_id); - self.max_fee_per_blob_gas.append_option(row.max_fee_per_blob_gas.as_deref()); + self.max_fee_per_blob_gas + .append_option(row.max_fee_per_blob_gas.as_deref()); for blob_hash in row.blob_versioned_hashes.iter().flatten() { self.blob_versioned_hashes.values().append(blob_hash); @@ -371,7 +390,8 @@ impl TransactionBuilder { self.contract_address.append_option(row.contract_address.as_deref()); self.cumulative_gas_used.append(&row.cumulative_gas_used); - self.effective_gas_price.append_option(row.effective_gas_price.as_deref()); + self.effective_gas_price + .append_option(row.effective_gas_price.as_deref()); self.gas_used.append(&row.gas_used); self.logs_bloom.append(&row.logs_bloom); self.sighash.append_option(row.input.as_deref().and_then(sighash)); diff --git a/crates/data/src/hyperliquid_fills/model.rs b/crates/data/src/hyperliquid_fills/model.rs index c5600360..7fc0122c 100644 --- a/crates/data/src/hyperliquid_fills/model.rs +++ b/crates/data/src/hyperliquid_fills/model.rs @@ -1,17 +1,14 @@ use serde::Deserialize; use sqd_primitives::{BlockNumber, ItemIndex}; - pub type Bytes = String; - #[derive(Deserialize)] pub enum Side { A, B } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Fill { @@ -34,27 +31,24 @@ pub struct Fill { pub cloid: Option, pub fee_token: String, pub builder: Option, - pub twap_id: Option, + pub twap_id: Option } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct BlockHeader { pub number: BlockNumber, pub hash: Bytes, pub parent_hash: Bytes, - pub timestamp: i64, + pub timestamp: i64 } - #[derive(Deserialize)] pub struct Block { pub header: BlockHeader, - pub fills: Vec, + pub fills: Vec } - impl sqd_primitives::Block for Block { fn number(&self) -> BlockNumber { self.header.number diff --git a/crates/data/src/hyperliquid_fills/tables/block.rs b/crates/data/src/hyperliquid_fills/tables/block.rs index 114da0ea..6a0f25d3 100644 --- a/crates/data/src/hyperliquid_fills/tables/block.rs +++ b/crates/data/src/hyperliquid_fills/tables/block.rs @@ -1,7 +1,7 @@ -use crate::hyperliquid_fills::model::BlockHeader; -use sqd_array::builder::{StringBuilder, UInt64Builder, TimestampMillisecondBuilder}; +use sqd_array::builder::{StringBuilder, TimestampMillisecondBuilder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::hyperliquid_fills::model::BlockHeader; table_builder! { BlockBuilder { @@ -19,7 +19,6 @@ table_builder! { } } - impl BlockBuilder { pub fn push(&mut self, block: &BlockHeader) { self.number.append(block.number); diff --git a/crates/data/src/hyperliquid_fills/tables/fill.rs b/crates/data/src/hyperliquid_fills/tables/fill.rs index d08f972f..38e7d8c4 100644 --- a/crates/data/src/hyperliquid_fills/tables/fill.rs +++ b/crates/data/src/hyperliquid_fills/tables/fill.rs @@ -1,7 +1,9 @@ -use crate::hyperliquid_fills::model::{Block, Fill, Side}; -use sqd_array::builder::{BooleanBuilder, Float64Builder, StringBuilder, TimestampMillisecondBuilder, UInt32Builder, UInt64Builder}; +use sqd_array::builder::{ + BooleanBuilder, Float64Builder, StringBuilder, TimestampMillisecondBuilder, UInt32Builder, UInt64Builder +}; use sqd_data_core::table_builder; +use crate::hyperliquid_fills::model::{Block, Fill, Side}; table_builder! { FillBuilder { @@ -55,7 +57,6 @@ table_builder! { } } - impl FillBuilder { pub fn push(&mut self, block: &Block, fill: &Fill) -> anyhow::Result<()> { self.block_number.append(block.header.number); diff --git a/crates/data/src/hyperliquid_fills/tables/mod.rs b/crates/data/src/hyperliquid_fills/tables/mod.rs index 47faf319..ba41704b 100644 --- a/crates/data/src/hyperliquid_fills/tables/mod.rs +++ b/crates/data/src/hyperliquid_fills/tables/mod.rs @@ -3,10 +3,9 @@ mod fill; pub use block::*; pub use fill::*; - -use super::model::Block; use sqd_data_core::chunk_builder; +use super::model::Block; chunk_builder! { HyperliquidFillsChunkBuilder { @@ -15,7 +14,6 @@ chunk_builder! { } } - impl sqd_data_core::BlockChunkBuilder for HyperliquidFillsChunkBuilder { type Block = Block; @@ -29,4 +27,3 @@ impl sqd_data_core::BlockChunkBuilder for HyperliquidFillsChunkBuilder { Ok(()) } } - diff --git a/crates/data/src/hyperliquid_replica_cmds/model.rs b/crates/data/src/hyperliquid_replica_cmds/model.rs index 03307cca..7f5da604 100644 --- a/crates/data/src/hyperliquid_replica_cmds/model.rs +++ b/crates/data/src/hyperliquid_replica_cmds/model.rs @@ -1,38 +1,33 @@ use serde::{Deserialize, Serialize}; use sqd_primitives::{BlockNumber, ItemIndex}; - #[derive(Deserialize, Serialize)] pub struct Signature { pub r: String, pub s: String, - pub v: u64, + pub v: u64 } - #[derive(Deserialize, Serialize)] pub struct ActionData { pub r#type: String, #[serde(flatten)] - pub data: serde_json::Value, // type-specific data + pub data: serde_json::Value // type-specific data } - #[derive(Deserialize, Serialize)] #[serde(rename_all = "lowercase")] pub enum Status { Ok, - Err, + Err } - #[derive(Deserialize, Serialize)] pub struct HardforkInfo { pub version: u64, - pub round: u64, + pub round: u64 } - #[derive(Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct Action { @@ -43,10 +38,9 @@ pub struct Action { pub vault_address: Option, pub user: Option, pub status: Status, - pub response: serde_json::Value, + pub response: serde_json::Value } - #[derive(Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct BlockHeader { @@ -57,17 +51,15 @@ pub struct BlockHeader { pub parent_round: u64, pub proposer: String, pub timestamp: i64, - pub hardfork: HardforkInfo, + pub hardfork: HardforkInfo } - #[derive(Deserialize, Serialize)] pub struct Block { pub header: BlockHeader, - pub actions: Vec, + pub actions: Vec } - impl sqd_primitives::Block for Block { fn number(&self) -> BlockNumber { self.header.height diff --git a/crates/data/src/hyperliquid_replica_cmds/tables/action.rs b/crates/data/src/hyperliquid_replica_cmds/tables/action.rs index 2ebb185f..9406d8d2 100644 --- a/crates/data/src/hyperliquid_replica_cmds/tables/action.rs +++ b/crates/data/src/hyperliquid_replica_cmds/tables/action.rs @@ -1,14 +1,13 @@ -use crate::hyperliquid_replica_cmds::model::{Action, ActionData, Block, Status}; use anyhow::Context; use sqd_array::builder::{ListBuilder, StringBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::hyperliquid_replica_cmds::model::{Action, ActionData, Block, Status}; type JsonBuilder = StringBuilder; type AssetListBuilder = ListBuilder; type CloidListBuilder = ListBuilder; - table_builder! { ActionBuilder { block_number: UInt64Builder, @@ -53,7 +52,6 @@ table_builder! { } } - impl ActionBuilder { pub fn push(&mut self, block: &Block, action: &Action) -> anyhow::Result<()> { self.block_number.append(block.header.height); @@ -73,7 +71,7 @@ impl ActionBuilder { let status = match action.status { Status::Ok => "ok", - Status::Err => "err", + Status::Err => "err" }; self.status.append(status); diff --git a/crates/data/src/hyperliquid_replica_cmds/tables/block.rs b/crates/data/src/hyperliquid_replica_cmds/tables/block.rs index 70985dc8..d639ba93 100644 --- a/crates/data/src/hyperliquid_replica_cmds/tables/block.rs +++ b/crates/data/src/hyperliquid_replica_cmds/tables/block.rs @@ -1,11 +1,10 @@ -use crate::hyperliquid_replica_cmds::model::BlockHeader; -use sqd_array::builder::{StringBuilder, UInt64Builder, TimestampMillisecondBuilder}; +use sqd_array::builder::{StringBuilder, TimestampMillisecondBuilder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::hyperliquid_replica_cmds::model::BlockHeader; type JsonBuilder = StringBuilder; - table_builder! { BlockBuilder { number: UInt64Builder, @@ -26,7 +25,6 @@ table_builder! { } } - impl BlockBuilder { pub fn push(&mut self, block: &BlockHeader) -> anyhow::Result<()> { self.number.append(block.height); diff --git a/crates/data/src/hyperliquid_replica_cmds/tables/mod.rs b/crates/data/src/hyperliquid_replica_cmds/tables/mod.rs index 367a3a7b..c807dfc4 100644 --- a/crates/data/src/hyperliquid_replica_cmds/tables/mod.rs +++ b/crates/data/src/hyperliquid_replica_cmds/tables/mod.rs @@ -3,10 +3,9 @@ mod block; pub use action::*; pub use block::*; - -use super::model::Block; use sqd_data_core::chunk_builder; +use super::model::Block; chunk_builder! { HyperliquidReplicaCmdsChunkBuilder { @@ -15,7 +14,6 @@ chunk_builder! { } } - impl sqd_data_core::BlockChunkBuilder for HyperliquidReplicaCmdsChunkBuilder { type Block = Block; @@ -29,4 +27,3 @@ impl sqd_data_core::BlockChunkBuilder for HyperliquidReplicaCmdsChunkBuilder { Ok(()) } } - diff --git a/crates/data/src/solana/mod.rs b/crates/data/src/solana/mod.rs index 7e26db9b..a0c66c0a 100644 --- a/crates/data/src/solana/mod.rs +++ b/crates/data/src/solana/mod.rs @@ -1,2 +1,2 @@ pub mod model; -pub mod tables; \ No newline at end of file +pub mod tables; diff --git a/crates/data/src/solana/model.rs b/crates/data/src/solana/model.rs index 6ef656fa..ff129023 100644 --- a/crates/data/src/solana/model.rs +++ b/crates/data/src/solana/model.rs @@ -1,12 +1,11 @@ -use crate::types::{Base58Bytes, JsonValue}; use anyhow::anyhow; use serde::{Deserialize, Serialize}; use sqd_primitives::{BlockNumber, ItemIndex}; +use crate::types::{Base58Bytes, JsonValue}; pub type AccountIndex = u32; - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct BlockHeader { @@ -15,35 +14,31 @@ pub struct BlockHeader { pub parent_number: BlockNumber, pub parent_hash: Base58Bytes, pub height: BlockNumber, - pub timestamp: Option, + pub timestamp: Option } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct AddressTableLookup { pub account_key: AccountIndex, pub readonly_indexes: Vec, - pub writable_indexes: Vec, + pub writable_indexes: Vec } - #[derive(Deserialize)] pub struct LoadedAddresses { pub readonly: Vec, - pub writable: Vec, + pub writable: Vec } - #[derive(Deserialize)] #[serde(rename_all = "lowercase")] pub enum TransactionVersion { Legacy, #[serde(untagged)] - Other(u8), + Other(u8) } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Transaction { @@ -57,17 +52,16 @@ pub struct Transaction { pub recent_blockhash: Base58Bytes, pub signatures: Vec, pub err: Option, - #[serde(deserialize_with="sqd_data_core::serde::decode_string_option", default)] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub compute_units_consumed: Option, - #[serde(deserialize_with="sqd_data_core::serde::decode_string_option", default)] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub cost_units: Option, - #[serde(deserialize_with="sqd_data_core::serde::decode_string")] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string")] pub fee: u64, pub loaded_addresses: LoadedAddresses, - pub has_dropped_log_messages: bool, + pub has_dropped_log_messages: bool } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Instruction { @@ -76,24 +70,22 @@ pub struct Instruction { pub program_id: AccountIndex, pub accounts: Vec, pub data: Base58Bytes, - #[serde(deserialize_with="sqd_data_core::serde::decode_string_option", default)] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub compute_units_consumed: Option, #[serde(default)] pub error: Option, pub is_committed: bool, - pub has_dropped_log_messages: bool, + pub has_dropped_log_messages: bool } - #[derive(Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum LogMessageKind { Log, Data, - Other, + Other } - impl LogMessageKind { pub fn to_str(&self) -> &'static str { match self { @@ -104,7 +96,6 @@ impl LogMessageKind { } } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct LogMessage { @@ -113,22 +104,20 @@ pub struct LogMessage { pub instruction_address: Vec, pub program_id: AccountIndex, pub kind: LogMessageKind, - pub message: String, + pub message: String } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Balance { pub transaction_index: ItemIndex, pub account: AccountIndex, - #[serde(deserialize_with="sqd_data_core::serde::decode_string")] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string")] pub pre: u64, - #[serde(deserialize_with="sqd_data_core::serde::decode_string")] - pub post: u64, + #[serde(deserialize_with = "sqd_data_core::serde::decode_string")] + pub post: u64 } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct TokenBalance { @@ -150,28 +139,26 @@ pub struct TokenBalance { pub pre_owner: Option, #[serde(default)] pub post_owner: Option, - #[serde(deserialize_with="sqd_data_core::serde::decode_string_option", default)] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub pre_amount: Option, - #[serde(deserialize_with="sqd_data_core::serde::decode_string_option", default)] - pub post_amount: Option, + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] + pub post_amount: Option } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Reward { pub pubkey: AccountIndex, - #[serde(deserialize_with="sqd_data_core::serde::decode_string")] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string")] pub lamports: i64, - #[serde(deserialize_with="sqd_data_core::serde::decode_string")] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string")] pub post_balance: u64, #[serde(default)] pub reward_type: Option, #[serde(default)] - pub commission: Option, + pub commission: Option } - #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct Block { @@ -185,21 +172,19 @@ pub struct Block { pub accounts: Vec } - impl Block { pub fn get_account(&self, idx: AccountIndex) -> anyhow::Result<&str> { self.accounts.get(idx as usize).map(|s| s.as_str()).ok_or_else(|| { anyhow!( - "invalid account reference {} in block {}#{}", - idx, - self.header.number, + "invalid account reference {} in block {}#{}", + idx, + self.header.number, self.header.hash ) }) } } - impl sqd_primitives::Block for Block { fn number(&self) -> BlockNumber { self.header.number @@ -220,4 +205,4 @@ impl sqd_primitives::Block for Block { fn timestamp(&self) -> Option { self.header.timestamp.map(|seconds| seconds * 1000) } -} \ No newline at end of file +} diff --git a/crates/data/src/solana/tables/balance.rs b/crates/data/src/solana/tables/balance.rs index 265a2a4d..3edaa59a 100644 --- a/crates/data/src/solana/tables/balance.rs +++ b/crates/data/src/solana/tables/balance.rs @@ -1,8 +1,10 @@ -use crate::solana::model::{Balance, Block}; -use crate::solana::tables::common::Base58Builder; use sqd_array::builder::{UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::solana::{ + model::{Balance, Block}, + tables::common::Base58Builder +}; table_builder! { BalanceBuilder { @@ -24,7 +26,6 @@ table_builder! { } } - impl BalanceBuilder { pub fn push(&mut self, block: &Block, row: &Balance) -> anyhow::Result<()> { self.block_number.append(block.header.number); @@ -34,4 +35,4 @@ impl BalanceBuilder { self.post.append(row.post); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/data/src/solana/tables/block.rs b/crates/data/src/solana/tables/block.rs index f2480c07..82342ebd 100644 --- a/crates/data/src/solana/tables/block.rs +++ b/crates/data/src/solana/tables/block.rs @@ -1,8 +1,7 @@ -use crate::solana::model::BlockHeader; -use crate::solana::tables::common::*; use sqd_array::builder::{TimestampSecondBuilder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::solana::{model::BlockHeader, tables::common::*}; table_builder! { BlockBuilder { @@ -22,7 +21,6 @@ table_builder! { } } - impl BlockBuilder { pub fn push(&mut self, row: &BlockHeader) { self.number.append(row.number); diff --git a/crates/data/src/solana/tables/common.rs b/crates/data/src/solana/tables/common.rs index 2f065885..35921fbc 100644 --- a/crates/data/src/solana/tables/common.rs +++ b/crates/data/src/solana/tables/common.rs @@ -1,6 +1,5 @@ use sqd_array::builder::{ListBuilder, StringBuilder, UInt32Builder, UInt8Builder}; - pub type Base58Builder = StringBuilder; pub type JsonBuilder = StringBuilder; pub type AccountListBuilder = ListBuilder; diff --git a/crates/data/src/solana/tables/instruction.rs b/crates/data/src/solana/tables/instruction.rs index 4ec93380..4d93c041 100644 --- a/crates/data/src/solana/tables/instruction.rs +++ b/crates/data/src/solana/tables/instruction.rs @@ -1,15 +1,18 @@ -use crate::solana::model::{AccountIndex, Block, Instruction}; -use crate::solana::tables::common::{AccountListBuilder, Base58Builder, InstructionAddressListBuilder}; use anyhow::Context; -use sqd_array::builder::{BooleanBuilder, FixedSizeBinaryBuilder, StringBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder}; +use sqd_array::builder::{ + BooleanBuilder, FixedSizeBinaryBuilder, StringBuilder, UInt16Builder, UInt32Builder, UInt64Builder, UInt8Builder +}; use sqd_bloom_filter::BloomFilter; use sqd_data_core::table_builder; +use crate::solana::{ + model::{AccountIndex, Block, Instruction}, + tables::common::{AccountListBuilder, Base58Builder, InstructionAddressListBuilder} +}; const ACCOUNT_BLOOM_BYTES: usize = 64; const ACCOUNT_BLOOM_NUM_HASHES: usize = 7; - table_builder! { InstructionBuilder { block_number: UInt64Builder, @@ -113,7 +116,6 @@ table_builder! { } } - impl InstructionBuilder { pub fn push(&mut self, block: &Block, row: &Instruction) -> anyhow::Result<()> { self.block_number.append(block.header.number); @@ -127,22 +129,38 @@ impl InstructionBuilder { self.program_id.append(block.get_account(row.program_id)?); self.data.append(&row.data); self.data_size.append(row.data.len() as u64); - self.a0.append_option(row.accounts.first().map(|i| block.get_account(*i)).transpose()?); - self.a1.append_option(row.accounts.get(1).map(|i| block.get_account(*i)).transpose()?); - self.a2.append_option(row.accounts.get(2).map(|i| block.get_account(*i)).transpose()?); - self.a3.append_option(row.accounts.get(3).map(|i| block.get_account(*i)).transpose()?); - self.a4.append_option(row.accounts.get(4).map(|i| block.get_account(*i)).transpose()?); - self.a5.append_option(row.accounts.get(5).map(|i| block.get_account(*i)).transpose()?); - self.a6.append_option(row.accounts.get(6).map(|i| block.get_account(*i)).transpose()?); - self.a7.append_option(row.accounts.get(7).map(|i| block.get_account(*i)).transpose()?); - self.a8.append_option(row.accounts.get(8).map(|i| block.get_account(*i)).transpose()?); - self.a9.append_option(row.accounts.get(9).map(|i| block.get_account(*i)).transpose()?); - self.a10.append_option(row.accounts.get(10).map(|i| block.get_account(*i)).transpose()?); - self.a11.append_option(row.accounts.get(11).map(|i| block.get_account(*i)).transpose()?); - self.a12.append_option(row.accounts.get(12).map(|i| block.get_account(*i)).transpose()?); - self.a13.append_option(row.accounts.get(13).map(|i| block.get_account(*i)).transpose()?); - self.a14.append_option(row.accounts.get(14).map(|i| block.get_account(*i)).transpose()?); - self.a15.append_option(row.accounts.get(15).map(|i| block.get_account(*i)).transpose()?); + self.a0 + .append_option(row.accounts.first().map(|i| block.get_account(*i)).transpose()?); + self.a1 + .append_option(row.accounts.get(1).map(|i| block.get_account(*i)).transpose()?); + self.a2 + .append_option(row.accounts.get(2).map(|i| block.get_account(*i)).transpose()?); + self.a3 + .append_option(row.accounts.get(3).map(|i| block.get_account(*i)).transpose()?); + self.a4 + .append_option(row.accounts.get(4).map(|i| block.get_account(*i)).transpose()?); + self.a5 + .append_option(row.accounts.get(5).map(|i| block.get_account(*i)).transpose()?); + self.a6 + .append_option(row.accounts.get(6).map(|i| block.get_account(*i)).transpose()?); + self.a7 + .append_option(row.accounts.get(7).map(|i| block.get_account(*i)).transpose()?); + self.a8 + .append_option(row.accounts.get(8).map(|i| block.get_account(*i)).transpose()?); + self.a9 + .append_option(row.accounts.get(9).map(|i| block.get_account(*i)).transpose()?); + self.a10 + .append_option(row.accounts.get(10).map(|i| block.get_account(*i)).transpose()?); + self.a11 + .append_option(row.accounts.get(11).map(|i| block.get_account(*i)).transpose()?); + self.a12 + .append_option(row.accounts.get(12).map(|i| block.get_account(*i)).transpose()?); + self.a13 + .append_option(row.accounts.get(13).map(|i| block.get_account(*i)).transpose()?); + self.a14 + .append_option(row.accounts.get(14).map(|i| block.get_account(*i)).transpose()?); + self.a15 + .append_option(row.accounts.get(15).map(|i| block.get_account(*i)).transpose()?); if let Some(accounts) = row.accounts.get(16..) { for account in accounts { @@ -173,7 +191,8 @@ impl InstructionBuilder { macro_rules! desc { ($ty:ty) => { - data.get(..size_of::<$ty>()).map(|slice| <$ty>::from_be_bytes(slice.try_into().unwrap())) + data.get(..size_of::<$ty>()) + .map(|slice| <$ty>::from_be_bytes(slice.try_into().unwrap())) }; } diff --git a/crates/data/src/solana/tables/log_message.rs b/crates/data/src/solana/tables/log_message.rs index 8f4ace43..f4f92146 100644 --- a/crates/data/src/solana/tables/log_message.rs +++ b/crates/data/src/solana/tables/log_message.rs @@ -1,8 +1,11 @@ -use crate::solana::model::{Block, LogMessage}; -use crate::solana::tables::common::{Base58Builder, InstructionAddressListBuilder}; use sqd_array::builder::{StringBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::solana::{ + model::{Block, LogMessage}, + tables::common::{Base58Builder, InstructionAddressListBuilder} +}; + table_builder! { LogMessageBuilder { block_number: UInt64Builder, @@ -27,7 +30,6 @@ table_builder! { } } - impl LogMessageBuilder { pub fn push(&mut self, block: &Block, row: &LogMessage) -> anyhow::Result<()> { self.block_number.append(block.header.number); @@ -45,4 +47,4 @@ impl LogMessageBuilder { self.message_size.append(row.message.len() as u64); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/data/src/solana/tables/mod.rs b/crates/data/src/solana/tables/mod.rs index 8c43343b..28b451a3 100644 --- a/crates/data/src/solana/tables/mod.rs +++ b/crates/data/src/solana/tables/mod.rs @@ -1,24 +1,22 @@ -mod common; +mod balance; mod block; -mod transaction; +mod common; mod instruction; mod log_message; -mod balance; -mod token_balance; mod reward; - +mod token_balance; +mod transaction; pub use balance::*; pub use block::*; pub use instruction::*; pub use log_message::*; pub use reward::*; +use sqd_data_core::chunk_builder; pub use token_balance::*; pub use transaction::*; use super::model::Block; -use sqd_data_core::chunk_builder; - chunk_builder! { SolanaChunkBuilder { @@ -32,7 +30,6 @@ chunk_builder! { } } - impl sqd_data_core::BlockChunkBuilder for SolanaChunkBuilder { type Block = Block; @@ -62,7 +59,7 @@ impl sqd_data_core::BlockChunkBuilder for SolanaChunkBuilder { for row in block.rewards.iter() { self.rewards.push(block, row)? } - + Ok(()) } -} \ No newline at end of file +} diff --git a/crates/data/src/solana/tables/reward.rs b/crates/data/src/solana/tables/reward.rs index bdc700c6..8e63f525 100644 --- a/crates/data/src/solana/tables/reward.rs +++ b/crates/data/src/solana/tables/reward.rs @@ -1,8 +1,10 @@ -use crate::solana::model::{Block, Reward}; -use crate::solana::tables::common::Base58Builder; use sqd_array::builder::{Int64Builder, StringBuilder, UInt64Builder, UInt8Builder}; use sqd_data_core::table_builder; +use crate::solana::{ + model::{Block, Reward}, + tables::common::Base58Builder +}; table_builder! { RewardBuilder { @@ -24,7 +26,6 @@ table_builder! { } } - impl RewardBuilder { pub fn push(&mut self, block: &Block, row: &Reward) -> anyhow::Result<()> { self.block_number.append(block.header.number); @@ -35,4 +36,4 @@ impl RewardBuilder { self.commission.append_option(row.commission); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/data/src/solana/tables/token_balance.rs b/crates/data/src/solana/tables/token_balance.rs index 6ef4b322..55efeba7 100644 --- a/crates/data/src/solana/tables/token_balance.rs +++ b/crates/data/src/solana/tables/token_balance.rs @@ -1,8 +1,11 @@ -use crate::solana::model::{Block, TokenBalance}; -use crate::solana::tables::common::Base58Builder; use sqd_array::builder::{UInt16Builder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::solana::{ + model::{Block, TokenBalance}, + tables::common::Base58Builder +}; + table_builder! { TokenBalanceBuilder { block_number: UInt64Builder, @@ -49,22 +52,27 @@ table_builder! { } } - impl TokenBalanceBuilder { pub fn push(&mut self, block: &Block, row: &TokenBalance) -> anyhow::Result<()> { self.block_number.append(block.header.number); self.transaction_index.append(row.transaction_index); self.account.append(block.get_account(row.account)?); - self.pre_mint.append_option(row.pre_mint.map(|i| block.get_account(i)).transpose()?); - self.post_mint.append_option(row.post_mint.map(|i| block.get_account(i)).transpose()?); + self.pre_mint + .append_option(row.pre_mint.map(|i| block.get_account(i)).transpose()?); + self.post_mint + .append_option(row.post_mint.map(|i| block.get_account(i)).transpose()?); self.pre_decimals.append_option(row.pre_decimals); self.post_decimals.append_option(row.post_decimals); - self.pre_program_id.append_option(row.pre_program_id.map(|i| block.get_account(i)).transpose()?); - self.post_program_id.append_option(row.post_program_id.map(|i| block.get_account(i)).transpose()?); - self.pre_owner.append_option(row.pre_owner.map(|i| block.get_account(i)).transpose()?); - self.post_owner.append_option(row.post_owner.map(|i| block.get_account(i)).transpose()?); + self.pre_program_id + .append_option(row.pre_program_id.map(|i| block.get_account(i)).transpose()?); + self.post_program_id + .append_option(row.post_program_id.map(|i| block.get_account(i)).transpose()?); + self.pre_owner + .append_option(row.pre_owner.map(|i| block.get_account(i)).transpose()?); + self.post_owner + .append_option(row.post_owner.map(|i| block.get_account(i)).transpose()?); self.pre_amount.append_option(row.pre_amount); self.post_amount.append_option(row.post_amount); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/data/src/solana/tables/transaction.rs b/crates/data/src/solana/tables/transaction.rs index 9cc61f70..d2613a1a 100644 --- a/crates/data/src/solana/tables/transaction.rs +++ b/crates/data/src/solana/tables/transaction.rs @@ -1,9 +1,15 @@ -use crate::solana::model::{Block, Transaction, TransactionVersion}; -use crate::solana::tables::common::{AccountIndexList, AccountListBuilder, AddressListBuilder, Base58Builder, JsonBuilder, SignatureListBuilder}; -use sqd_array::builder::{BooleanBuilder, FixedSizeBinaryBuilder, Int16Builder, ListBuilder, UInt32Builder, UInt64Builder, UInt8Builder}; +use sqd_array::builder::{ + BooleanBuilder, FixedSizeBinaryBuilder, Int16Builder, ListBuilder, UInt32Builder, UInt64Builder, UInt8Builder +}; use sqd_bloom_filter::BloomFilter; use sqd_data_core::{struct_builder, table_builder}; +use crate::solana::{ + model::{Block, Transaction, TransactionVersion}, + tables::common::{ + AccountIndexList, AccountListBuilder, AddressListBuilder, Base58Builder, JsonBuilder, SignatureListBuilder + } +}; type AddressTableLookupListBuilder = ListBuilder; struct_builder! { @@ -14,7 +20,6 @@ struct_builder! { } } - struct_builder! { LoadedAddressBuilder { readonly: AddressListBuilder, @@ -22,11 +27,9 @@ struct_builder! { } } - const ACCOUNT_BLOOM_BYTES: usize = 64; const ACCOUNT_BLOOM_NUM_HASHES: usize = 7; - table_builder! { TransactionBuilder { block_number: UInt64Builder, @@ -68,7 +71,6 @@ table_builder! { } } - impl TransactionBuilder { pub fn push(&mut self, block: &Block, row: &Transaction) -> anyhow::Result<()> { self.block_number.append(block.header.number); @@ -80,9 +82,7 @@ impl TransactionBuilder { }); for account in row.account_keys.iter().copied() { - self.account_keys.values().append( - block.get_account(account)? - ) + self.account_keys.values().append(block.get_account(account)?) } self.account_keys.append(); @@ -101,8 +101,10 @@ impl TransactionBuilder { } self.address_table_lookups.append(); - self.num_readonly_signed_accounts.append(row.num_readonly_signed_accounts); - self.num_readonly_unsigned_accounts.append(row.num_readonly_unsigned_accounts); + self.num_readonly_signed_accounts + .append(row.num_readonly_signed_accounts); + self.num_readonly_unsigned_accounts + .append(row.num_readonly_unsigned_accounts); self.num_required_signatures.append(row.num_required_signatures); self.recent_blockhash.append(&row.recent_blockhash); @@ -115,21 +117,23 @@ impl TransactionBuilder { let err = row.err.as_ref().map(|val| val.to_string()); self.err.append_option(err.as_ref().map(|s| s.as_ref())); } - + self.compute_units_consumed.append_option(row.compute_units_consumed); self.cost_units.append_option(row.cost_units); self.fee.append(row.fee); for address in &row.loaded_addresses.readonly { - self.loaded_addresses.readonly.values().append( - block.get_account(*address)? - ); + self.loaded_addresses + .readonly + .values() + .append(block.get_account(*address)?); } self.loaded_addresses.readonly.append(); for address in &row.loaded_addresses.writable { - self.loaded_addresses.writable.values().append( - block.get_account(*address)? - ); + self.loaded_addresses + .writable + .values() + .append(block.get_account(*address)?); } self.loaded_addresses.writable.append(); self.loaded_addresses.append(true); @@ -140,10 +144,14 @@ impl TransactionBuilder { let account_keys_size = row.account_keys.len() * 44; self.account_keys_size.append(account_keys_size as u64); - let address_table_lookups_size = 44usize * row.address_table_lookups.iter() - .map(|l| 1 + l.readonly_indexes.len() + l.writable_indexes.len()) - .sum::(); - self.address_table_lookups_size.append(address_table_lookups_size as u64); + let address_table_lookups_size = 44usize + * row + .address_table_lookups + .iter() + .map(|l| 1 + l.readonly_indexes.len() + l.writable_indexes.len()) + .sum::(); + self.address_table_lookups_size + .append(address_table_lookups_size as u64); let signatures_size = row.signatures.iter().map(|val| val.len() as u64).sum(); self.signatures_size.append(signatures_size); diff --git a/crates/data/src/tron/model.rs b/crates/data/src/tron/model.rs index 0944fd17..03672493 100644 --- a/crates/data/src/tron/model.rs +++ b/crates/data/src/tron/model.rs @@ -1,7 +1,8 @@ -use crate::types::{HexBytes, JsonValue}; use serde::Deserialize; use sqd_primitives::{BlockNumber, ItemIndex}; +use crate::types::{HexBytes, JsonValue}; + #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub struct BlockHeader { @@ -12,13 +13,13 @@ pub struct BlockHeader { pub version: Option, pub timestamp: i64, pub witness_address: HexBytes, - pub witness_signature: Option, + pub witness_signature: Option } #[derive(Deserialize, serde::Serialize)] #[serde(rename_all = "camelCase")] pub struct TransactionResult { - pub contract_ret: Option, // eg. "SUCCESS", "REVERT", etc. + pub contract_ret: Option // eg. "SUCCESS", "REVERT", etc. } #[derive(Deserialize)] @@ -34,78 +35,39 @@ pub struct Transaction { pub permission_id: Option, pub ref_block_bytes: Option, pub ref_block_hash: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub fee_limit: Option, pub expiration: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub timestamp: Option, pub raw_data_hex: HexBytes, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub fee: Option, pub contract_result: Option, pub contract_address: Option, pub res_message: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub withdraw_amount: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub unfreeze_amount: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub withdraw_expire_amount: Option, pub cancel_unfreeze_v2_amount: Option, pub result: Option, // Result from receipt, eg. "SUCCESS", "REVERT", etc. - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub energy_fee: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub energy_usage: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub energy_usage_total: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub net_usage: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub net_fee: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub origin_energy_usage: Option, - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] - pub energy_penalty_total: Option, + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] + pub energy_penalty_total: Option } #[derive(Deserialize)] @@ -115,18 +77,15 @@ pub struct Log { pub log_index: ItemIndex, pub address: HexBytes, pub data: Option, - pub topics: Option>, + pub topics: Option> } #[derive(Deserialize, serde::Serialize)] #[serde(rename_all = "camelCase")] pub struct CallValueInfo { - #[serde( - deserialize_with = "sqd_data_core::serde::decode_string_option", - default - )] + #[serde(deserialize_with = "sqd_data_core::serde::decode_string_option", default)] pub call_value: Option, - pub token_id: Option, + pub token_id: Option } #[derive(Deserialize)] @@ -140,7 +99,7 @@ pub struct InternalTransaction { pub call_value_info: Vec, pub note: HexBytes, pub rejected: Option, - pub extra: Option, + pub extra: Option } #[derive(Deserialize)] @@ -149,7 +108,7 @@ pub struct Block { pub header: BlockHeader, pub transactions: Vec, pub logs: Vec, - pub internal_transactions: Vec, + pub internal_transactions: Vec } impl sqd_primitives::Block for Block { diff --git a/crates/data/src/tron/tables/block.rs b/crates/data/src/tron/tables/block.rs index d271b30d..5f51f838 100644 --- a/crates/data/src/tron/tables/block.rs +++ b/crates/data/src/tron/tables/block.rs @@ -1,8 +1,7 @@ -use crate::tron::model::BlockHeader; -use crate::tron::tables::common::*; use sqd_array::builder::{Int32Builder, TimestampMillisecondBuilder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::tron::{model::BlockHeader, tables::common::*}; table_builder! { BlockBuilder { @@ -24,7 +23,6 @@ table_builder! { } } - impl BlockBuilder { pub fn push(&mut self, row: &BlockHeader) { self.number.append(row.height); diff --git a/crates/data/src/tron/tables/common.rs b/crates/data/src/tron/tables/common.rs index 5fa2644c..50f20590 100644 --- a/crates/data/src/tron/tables/common.rs +++ b/crates/data/src/tron/tables/common.rs @@ -1,8 +1,8 @@ -use sqd_array::builder::{StringBuilder}; +use sqd_array::builder::StringBuilder; pub type HexBytesBuilder = StringBuilder; pub type JsonBuilder = StringBuilder; pub fn sighash(bytes: &str) -> Option<&str> { - (bytes.len() >= 8).then(|| { &bytes[0..8] }) -} \ No newline at end of file + (bytes.len() >= 8).then(|| &bytes[0..8]) +} diff --git a/crates/data/src/tron/tables/internal_transaction.rs b/crates/data/src/tron/tables/internal_transaction.rs index d15fbd2b..c70651af 100644 --- a/crates/data/src/tron/tables/internal_transaction.rs +++ b/crates/data/src/tron/tables/internal_transaction.rs @@ -1,8 +1,10 @@ -use crate::tron::model::{Block, InternalTransaction}; -use crate::tron::tables::common::*; use sqd_array::builder::{BooleanBuilder, UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::tron::{ + model::{Block, InternalTransaction}, + tables::common::* +}; table_builder! { InternalTransactionBuilder { @@ -37,7 +39,6 @@ table_builder! { } } - impl InternalTransactionBuilder { pub fn push(&mut self, block: &Block, row: &InternalTransaction) { self.block_number.append(block.header.height); @@ -45,7 +46,8 @@ impl InternalTransactionBuilder { self.internal_transaction_index.append(row.internal_transaction_index); self.hash.append(&row.hash); self.caller_address.append(&row.caller_address); - self.transfer_to_address.append_option(row.transfer_to_address.as_deref()); + self.transfer_to_address + .append_option(row.transfer_to_address.as_deref()); let call_value_info = serde_json::to_string(&row.call_value_info).unwrap(); self.call_value_info.append(&call_value_info); diff --git a/crates/data/src/tron/tables/log.rs b/crates/data/src/tron/tables/log.rs index ca4c1826..a0398f49 100644 --- a/crates/data/src/tron/tables/log.rs +++ b/crates/data/src/tron/tables/log.rs @@ -1,8 +1,10 @@ -use crate::tron::model::{Block, Log}; -use crate::tron::tables::common::HexBytesBuilder; use sqd_array::builder::{UInt32Builder, UInt64Builder}; use sqd_data_core::table_builder; +use crate::tron::{ + model::{Block, Log}, + tables::common::HexBytesBuilder +}; table_builder! { LogBuilder { @@ -33,7 +35,6 @@ table_builder! { } } - impl LogBuilder { pub fn push(&mut self, block: &Block, row: &Log) { self.block_number.append(block.header.height); diff --git a/crates/data/src/tron/tables/mod.rs b/crates/data/src/tron/tables/mod.rs index e7b575d6..fae22fc3 100644 --- a/crates/data/src/tron/tables/mod.rs +++ b/crates/data/src/tron/tables/mod.rs @@ -1,17 +1,16 @@ -mod common; mod block; -mod transaction; -mod log; +mod common; mod internal_transaction; +mod log; +mod transaction; pub use block::*; -pub use transaction::*; -pub use log::*; pub use internal_transaction::*; - -use super::model::Block; +pub use log::*; use sqd_data_core::chunk_builder; +pub use transaction::*; +use super::model::Block; chunk_builder! { TronChunkBuilder { @@ -22,7 +21,6 @@ chunk_builder! { } } - impl sqd_data_core::BlockChunkBuilder for TronChunkBuilder { type Block = Block; diff --git a/crates/data/src/tron/tables/transaction.rs b/crates/data/src/tron/tables/transaction.rs index 77945816..cea57808 100644 --- a/crates/data/src/tron/tables/transaction.rs +++ b/crates/data/src/tron/tables/transaction.rs @@ -1,12 +1,15 @@ -use crate::tron::model::{Block, Transaction}; -use crate::tron::tables::common::*; -use sqd_array::builder::{Int32Builder, Int64Builder, ListBuilder, StringBuilder, TimestampMillisecondBuilder, UInt32Builder, UInt64Builder}; +use sqd_array::builder::{ + Int32Builder, Int64Builder, ListBuilder, StringBuilder, TimestampMillisecondBuilder, UInt32Builder, UInt64Builder +}; use sqd_data_core::table_builder; +use crate::tron::{ + model::{Block, Transaction}, + tables::common::* +}; type SignatureListBuilder = ListBuilder; - table_builder! { TransactionBuilder { block_number: UInt64Builder, @@ -94,7 +97,6 @@ table_builder! { } } - impl TransactionBuilder { pub fn push(&mut self, block: &Block, row: &Transaction) { self.block_number.append(block.header.height); @@ -126,26 +128,36 @@ impl TransactionBuilder { self.contract_result.append_option(row.contract_result.as_deref()); self.contract_address.append_option(row.contract_address.as_deref()); self.res_message.append_option(row.res_message.as_deref()); - self.withdraw_amount.append_option(row.withdraw_amount.map(|v| v as i64)); - self.unfreeze_amount.append_option(row.unfreeze_amount.map(|v| v as i64)); - self.withdraw_expire_amount.append_option(row.withdraw_expire_amount.map(|v| v as i64)); - - let cancel = row.cancel_unfreeze_v2_amount.as_ref().map(|val| serde_json::to_string(val).unwrap()); + self.withdraw_amount + .append_option(row.withdraw_amount.map(|v| v as i64)); + self.unfreeze_amount + .append_option(row.unfreeze_amount.map(|v| v as i64)); + self.withdraw_expire_amount + .append_option(row.withdraw_expire_amount.map(|v| v as i64)); + + let cancel = row + .cancel_unfreeze_v2_amount + .as_ref() + .map(|val| serde_json::to_string(val).unwrap()); self.cancel_unfreeze_v2_amount.append_option(cancel.as_deref()); self.result.append_option(row.result.as_deref()); self.energy_fee.append_option(row.energy_fee.map(|v| v as i64)); self.energy_usage.append_option(row.energy_usage.map(|v| v as i64)); - self.energy_usage_total.append_option(row.energy_usage_total.map(|v| v as i64)); + self.energy_usage_total + .append_option(row.energy_usage_total.map(|v| v as i64)); self.net_usage.append_option(row.net_usage.map(|v| v as i64)); self.net_fee.append_option(row.net_fee.map(|v| v as i64)); - self.origin_energy_usage.append_option(row.origin_energy_usage.map(|v| v as i64)); - self.energy_penalty_total.append_option(row.energy_penalty_total.map(|v| v as i64)); + self.origin_energy_usage + .append_option(row.origin_energy_usage.map(|v| v as i64)); + self.energy_penalty_total + .append_option(row.energy_penalty_total.map(|v| v as i64)); let value = &row.parameter["value"]; if row.r#type == "TransferContract" { - self._transfer_contract_owner.append_option(value["owner_address"].as_str()); + self._transfer_contract_owner + .append_option(value["owner_address"].as_str()); self._transfer_contract_to.append_option(value["to_address"].as_str()); } else { self._transfer_contract_owner.append_null(); @@ -153,9 +165,12 @@ impl TransactionBuilder { } if row.r#type == "TransferAssetContract" { - self._transfer_asset_contract_owner.append_option(value["owner_address"].as_str()); - self._transfer_asset_contract_to.append_option(value["to_address"].as_str()); - self._transfer_asset_contract_asset.append_option(value["asset_name"].as_str()); + self._transfer_asset_contract_owner + .append_option(value["owner_address"].as_str()); + self._transfer_asset_contract_to + .append_option(value["to_address"].as_str()); + self._transfer_asset_contract_asset + .append_option(value["asset_name"].as_str()); } else { self._transfer_asset_contract_owner.append_null(); self._transfer_asset_contract_to.append_null(); @@ -163,11 +178,12 @@ impl TransactionBuilder { } if row.r#type == "TriggerSmartContract" { - self._trigger_smart_contract_owner.append_option(value["owner_address"].as_str()); - self._trigger_smart_contract_contract.append_option(value["contract_address"].as_str()); - self._trigger_smart_contract_sighash.append_option( - value["data"].as_str().and_then(sighash) - ); + self._trigger_smart_contract_owner + .append_option(value["owner_address"].as_str()); + self._trigger_smart_contract_contract + .append_option(value["contract_address"].as_str()); + self._trigger_smart_contract_sighash + .append_option(value["data"].as_str().and_then(sighash)); } else { self._trigger_smart_contract_owner.append_null(); self._trigger_smart_contract_contract.append_null(); diff --git a/crates/data/src/types.rs b/crates/data/src/types.rs index e70194c7..9b5df4c2 100644 --- a/crates/data/src/types.rs +++ b/crates/data/src/types.rs @@ -1,3 +1,3 @@ pub use serde_json::Value as JsonValue; pub type Base58Bytes = String; -pub type HexBytes = String; \ No newline at end of file +pub type HexBytes = String; diff --git a/crates/dataset/src/lib.rs b/crates/dataset/src/lib.rs index 3c8be946..e3414cc6 100644 --- a/crates/dataset/src/lib.rs +++ b/crates/dataset/src/lib.rs @@ -1,18 +1,14 @@ use std::collections::BTreeMap; - type Name = &'static str; - pub type DatasetDescriptionRef = std::sync::Arc; - #[derive(Clone, Debug, Default)] pub struct DatasetDescription { pub tables: BTreeMap } - #[derive(Clone, Debug, Default)] pub struct TableDescription { pub downcast: DowncastColumns, @@ -20,14 +16,12 @@ pub struct TableDescription { pub options: TableOptions } - #[derive(Clone, Debug, Default)] pub struct DowncastColumns { pub block_number: Vec, pub item_index: Vec } - #[derive(Clone, Debug)] pub struct TableOptions { pub column_options: BTreeMap, @@ -35,16 +29,15 @@ pub struct TableOptions { pub row_group_size: usize } - impl TableOptions { pub fn has_stats(&self, name: &str) -> bool { self.column_options.get(name).map_or(false, |c| c.stats_enable) } - + pub fn get_stats_partition(&self, name: &str) -> Option { self.column_options.get(name).map(|c| c.stats_partition) } - + pub fn add_stats(&mut self, name: Name) { let options = self.column_options.entry(name).or_default(); options.stats_enable = true @@ -56,7 +49,6 @@ impl TableOptions { } } - impl Default for TableOptions { fn default() -> Self { Self { @@ -67,21 +59,19 @@ impl Default for TableOptions { } } - #[derive(Clone, Debug)] pub struct ColumnOptions { pub stats_enable: bool, pub stats_partition: usize, - pub dictionary_encoding: bool, + pub dictionary_encoding: bool } - impl Default for ColumnOptions { fn default() -> Self { Self { stats_enable: false, stats_partition: 4096, - dictionary_encoding: false, + dictionary_encoding: false } } -} \ No newline at end of file +} diff --git a/crates/hotblocks-retain/src/cli.rs b/crates/hotblocks-retain/src/cli.rs index 1da7ecad..cf509ddc 100644 --- a/crates/hotblocks-retain/src/cli.rs +++ b/crates/hotblocks-retain/src/cli.rs @@ -28,5 +28,5 @@ pub struct Cli { /// Additional delay in seconds after effective_from before applying retention #[arg(long, default_value = "0")] - pub retain_delay_secs: u64, + pub retain_delay_secs: u64 } diff --git a/crates/hotblocks-retain/src/datasets.rs b/crates/hotblocks-retain/src/datasets.rs index df5109c7..97f91e31 100644 --- a/crates/hotblocks-retain/src/datasets.rs +++ b/crates/hotblocks-retain/src/datasets.rs @@ -1,25 +1,24 @@ -use crate::types::DatasetId; -use serde::Deserialize; use std::collections::HashMap; + +use serde::Deserialize; use url::Url; +use crate::types::DatasetId; + #[derive(Deserialize)] struct Dataset { name: String, - id: DatasetId, + id: DatasetId } #[derive(Deserialize)] struct DatasetsFile { #[serde(rename = "sqd-network-datasets")] - sqd_network_datasets: Vec, + sqd_network_datasets: Vec } /// Downloads the datasets manifest and returns a map from dataset name to dataset ID. -pub async fn get_name_to_id( - client: &reqwest::Client, - url: &Url, -) -> anyhow::Result> { +pub async fn get_name_to_id(client: &reqwest::Client, url: &Url) -> anyhow::Result> { let bytes = client .get(url.as_str()) .send() @@ -30,11 +29,7 @@ pub async fn get_name_to_id( let file: DatasetsFile = serde_yaml::from_slice(&bytes)?; - let map = file - .sqd_network_datasets - .into_iter() - .map(|d| (d.name, d.id)) - .collect(); + let map = file.sqd_network_datasets.into_iter().map(|d| (d.name, d.id)).collect(); Ok(map) } diff --git a/crates/hotblocks-retain/src/hotblocks.rs b/crates/hotblocks-retain/src/hotblocks.rs index 75acbef7..8e15ab2e 100644 --- a/crates/hotblocks-retain/src/hotblocks.rs +++ b/crates/hotblocks-retain/src/hotblocks.rs @@ -6,7 +6,7 @@ pub async fn set_retention( client: &Client, base_url: &Url, dataset: &str, - from_block: BlockNumber, + from_block: BlockNumber ) -> anyhow::Result<()> { let retention_url = base_url.join(&format!("/datasets/{dataset}/retention"))?; diff --git a/crates/hotblocks-retain/src/main.rs b/crates/hotblocks-retain/src/main.rs index 0b7e3514..7eceeba4 100644 --- a/crates/hotblocks-retain/src/main.rs +++ b/crates/hotblocks-retain/src/main.rs @@ -4,11 +4,14 @@ mod hotblocks; mod status; mod types; +use std::{ + collections::HashMap, + time::{Duration, SystemTime, UNIX_EPOCH} +}; + use anyhow::Context; use clap::Parser; use cli::Cli; -use std::collections::HashMap; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::time::Instant; use types::{DatasetId, DatasetsConfig}; use url::Url; @@ -35,9 +38,9 @@ fn main() -> anyhow::Result<()> { args.datasets_url, datasets, Duration::from_secs(args.datasets_update_interval_secs), - Duration::from_secs(args.retain_delay_secs), + Duration::from_secs(args.retain_delay_secs) ) - .run(), + .run() )?; Ok(()) @@ -53,7 +56,7 @@ struct HotblocksRetain { retain_delay: Duration, name_to_id: HashMap, last_datasets_refresh: Instant, - last_effective_from: Option, + last_effective_from: Option } impl HotblocksRetain { @@ -63,7 +66,7 @@ impl HotblocksRetain { datasets_url: Url, datasets: DatasetsConfig, datasets_update_interval: Duration, - retain_delay: Duration, + retain_delay: Duration ) -> Self { Self { client: reqwest::Client::new(), @@ -75,7 +78,7 @@ impl HotblocksRetain { retain_delay, name_to_id: HashMap::new(), last_datasets_refresh: Instant::now() - datasets_update_interval, - last_effective_from: None, + last_effective_from: None } } @@ -98,10 +101,7 @@ impl HotblocksRetain { continue; } - let now = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs(); + let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs(); let apply_at = status.effective_from + self.retain_delay.as_secs(); if now < apply_at { @@ -161,11 +161,7 @@ impl HotblocksRetain { match self.name_to_id.get(network_dataset) { Some(id) => id.as_str(), None => { - tracing::warn!( - dataset, - network_dataset, - "dataset not found in manifest, skipping" - ); + tracing::warn!(dataset, network_dataset, "dataset not found in manifest, skipping"); continue; } } @@ -173,14 +169,7 @@ impl HotblocksRetain { match statuses.get(dataset_id) { Some(Some(height)) => { - match hotblocks::set_retention( - &self.client, - &self.hotblocks_url, - dataset, - *height, - ) - .await - { + match hotblocks::set_retention(&self.client, &self.hotblocks_url, dataset, *height).await { Ok(()) => { tracing::info!(dataset, height, "applied retention policy"); } @@ -206,15 +195,11 @@ impl HotblocksRetain { fn init_tracing() { use std::io::IsTerminal; - let env_filter = tracing_subscriber::EnvFilter::builder().parse_lossy( - std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV).unwrap_or("info".to_string()), - ); + let env_filter = tracing_subscriber::EnvFilter::builder() + .parse_lossy(std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV).unwrap_or("info".to_string())); if std::io::stdout().is_terminal() { - tracing_subscriber::fmt() - .with_env_filter(env_filter) - .compact() - .init(); + tracing_subscriber::fmt().with_env_filter(env_filter).compact().init(); } else { tracing_subscriber::fmt() .with_env_filter(env_filter) diff --git a/crates/hotblocks-retain/src/status.rs b/crates/hotblocks-retain/src/status.rs index ecab3512..077d4d0b 100644 --- a/crates/hotblocks-retain/src/status.rs +++ b/crates/hotblocks-retain/src/status.rs @@ -1,17 +1,18 @@ -use crate::types::DatasetId; use serde::Deserialize; use sqd_primitives::BlockNumber; +use crate::types::DatasetId; + #[derive(Debug, Deserialize)] pub struct SchedulingStatus { pub datasets: Vec, - pub effective_from: u64, + pub effective_from: u64 } #[derive(Debug, Deserialize)] pub struct DatasetStatus { pub id: DatasetId, - pub height: Option, + pub height: Option } pub async fn get_status(client: &reqwest::Client, url: &str) -> anyhow::Result { diff --git a/crates/hotblocks-retain/src/types.rs b/crates/hotblocks-retain/src/types.rs index a6ce02a1..ebe18fcf 100644 --- a/crates/hotblocks-retain/src/types.rs +++ b/crates/hotblocks-retain/src/types.rs @@ -1,13 +1,14 @@ -use serde::Deserialize; use std::collections::HashMap; +use serde::Deserialize; + pub type DatasetId = String; // s3:// #[derive(Debug, Clone, Deserialize)] pub struct DatasetProps { pub id: Option, #[serde(rename = "name", alias = "network_dataset")] - pub network_dataset: Option, + pub network_dataset: Option } pub type DatasetsConfig = HashMap>; diff --git a/crates/hotblocks/src/api.rs b/crates/hotblocks/src/api.rs index a0f96265..c5cb4fef 100644 --- a/crates/hotblocks/src/api.rs +++ b/crates/hotblocks/src/api.rs @@ -1,30 +1,33 @@ -use crate::cli::App; -use crate::dataset_controller::DatasetController; -use crate::errors::{ - BlockItemIsNotAvailable, BlockRangeMissing, Busy, QueryIsAboveTheHead, QueryKindMismatch, - UnknownDataset, -}; -use crate::query::QueryResponse; -use crate::types::{ClientId, RetentionStrategy}; +use std::{sync::Arc, time::Instant}; + use anyhow::bail; use async_stream::try_stream; -use axum::body::{Body, Bytes}; -use axum::extract::{Path, Request}; -use axum::http::StatusCode; -use axum::http::Uri; -use axum::response::{IntoResponse, Response}; -use axum::routing::{get, post}; -use axum::{BoxError, Extension, Json, Router}; +use axum::{ + BoxError, Extension, Json, Router, + body::{Body, Bytes}, + extract::{Path, Request}, + http::{StatusCode, Uri}, + response::{IntoResponse, Response}, + routing::{get, post} +}; use futures::TryStream; use serde::Serialize; use sqd_primitives::BlockRef; use sqd_query::{Query, UnexpectedBaseBlock}; use sqd_storage::db::DatasetId; -use std::sync::Arc; -use std::time::Instant; use tower_http::request_id::{MakeRequestUuid, RequestId, SetRequestIdLayer}; use tracing::{Instrument, error}; +use crate::{ + cli::App, + dataset_controller::DatasetController, + errors::{ + BlockItemIsNotAvailable, BlockRangeMissing, Busy, QueryIsAboveTheHead, QueryKindMismatch, UnknownDataset + }, + query::QueryResponse, + types::{ClientId, RetentionStrategy} +}; + const DEFAULT_CLIENT_ID: &str = "unknown"; macro_rules! json_ok { @@ -43,7 +46,7 @@ macro_rules! get_dataset { ($app:expr, $dataset_id:expr) => { match $app.data_service.get_dataset($dataset_id) { Ok(ds) => ds, - Err(err) => return text!(StatusCode::NOT_FOUND, "{}", err), + Err(err) => return text!(StatusCode::NOT_FOUND, "{}", err) } }; } @@ -52,18 +55,12 @@ type AppRef = Arc; pub fn build_api(app: App) -> Router { Router::new() - .route( - "/", - get(|| async { "Welcome to SQD hot block data service!" }), - ) + .route("/", get(|| async { "Welcome to SQD hot block data service!" })) .route("/datasets/{id}/stream", post(stream)) .route("/datasets/{id}/finalized-stream", post(finalized_stream)) .route("/datasets/{id}/head", get(get_head)) .route("/datasets/{id}/finalized-head", get(get_finalized_head)) - .route( - "/datasets/{id}/retention", - get(get_retention).post(set_retention), - ) + .route("/datasets/{id}/retention", get(get_retention).post(set_retention)) .route("/datasets/{id}/status", get(get_status)) .route("/datasets/{id}/metadata", get(get_metadata)) .route("/metrics", get(get_metrics)) @@ -81,10 +78,7 @@ pub async fn middleware(mut req: Request, next: axum::middleware::Next) -> impl let version = req.version(); let start = Instant::now(); - let app = req - .extensions() - .get::>() - .expect("App extension should be set"); + let app = req.extensions().get::>().expect("App extension should be set"); let client_id = req .headers() @@ -137,21 +131,19 @@ pub struct Labels(Vec<(&'static str, String)>); pub struct ResponseWithMetadata { pub labels: Labels, - pub response: Option, + pub response: Option } impl ResponseWithMetadata { fn new() -> Self { Self { labels: Labels(vec![]), - response: None, + response: None } } pub fn with_client_id(mut self, client_id: &ClientId) -> Self { - self.labels - .0 - .push(("client_id", client_id.as_str().to_owned())); + self.labels.0.push(("client_id", client_id.as_str().to_owned())); self } @@ -167,7 +159,7 @@ impl ResponseWithMetadata { pub fn with_response(mut self, clause: F) -> Self where - F: FnOnce() -> Response, + F: FnOnce() -> Response { self.response = Some(clause()); self @@ -186,7 +178,7 @@ async fn stream( Extension(app): Extension, Extension(client_id): Extension, Path(dataset_id): Path, - body: Bytes, + body: Bytes ) -> impl IntoResponse { let response = stream_internal(app, dataset_id, body, false, client_id.clone()).await; ResponseWithMetadata::new() @@ -200,7 +192,7 @@ async fn finalized_stream( Extension(app): Extension, Extension(client_id): Extension, Path(dataset_id): Path, - body: Bytes, + body: Bytes ) -> impl IntoResponse { let response = stream_internal(app, dataset_id, body, true, client_id.clone()).await; ResponseWithMetadata::new() @@ -215,13 +207,13 @@ async fn stream_internal( dataset_id: DatasetId, body: Bytes, finalized: bool, - client_id: ClientId, + client_id: ClientId ) -> Response { let dataset = get_dataset!(app, dataset_id); let query: Query = match Json::::from_bytes(&body) { Ok(Json(q)) => q, - Err(rejection) => return rejection.into_response(), + Err(rejection) => return rejection.into_response() }; if let Err(err) = query.validate() { @@ -229,9 +221,7 @@ async fn stream_internal( } let query_result = if finalized { - app.query_service - .query_finalized(&dataset, query, client_id) - .await + app.query_service.query_finalized(&dataset, query, client_id).await } else { app.query_service.query(&dataset, query, client_id).await }; @@ -248,9 +238,7 @@ async fn stream_internal( // For finalized stream, use the finalized head as the head res = res.header("x-sqd-head-number", finalized_head.number); } else { - let head_block = finalized_head - .number - .max(dataset.get_head_block_number().unwrap_or(0)); + let head_block = finalized_head.number.max(dataset.get_head_block_number().unwrap_or(0)); res = res.header("x-sqd-head-number", head_block); } res = res.header("x-sqd-finalized-head-number", finalized_head.number); @@ -263,13 +251,11 @@ async fn stream_internal( res.body(body).unwrap() } - Err(err) => error_to_response(err, &body), + Err(err) => error_to_response(err, &body) } } -fn stream_query_response( - mut stream: QueryResponse, -) -> impl TryStream { +fn stream_query_response(mut stream: QueryResponse) -> impl TryStream { try_stream! { while let Some(pack_result) = stream.next_data_pack().await.transpose() { match pack_result { @@ -304,8 +290,8 @@ fn error_to_response(err: anyhow::Error, body: &Bytes) -> Response { return ( StatusCode::CONFLICT, Json(BaseBlockConflict { - previous_blocks: &fork.prev_blocks, - }), + previous_blocks: &fork.prev_blocks + }) ) .into_response(); } @@ -341,13 +327,13 @@ fn error_to_response(err: anyhow::Error, body: &Bytes) -> Response { #[derive(Serialize)] #[serde(rename_all = "camelCase")] struct BaseBlockConflict<'a> { - previous_blocks: &'a [BlockRef], + previous_blocks: &'a [BlockRef] } async fn get_finalized_head( Extension(app): Extension, Extension(client_id): Extension, - Path(dataset_id): Path, + Path(dataset_id): Path ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -363,7 +349,7 @@ async fn get_finalized_head( async fn get_head( Extension(app): Extension, Extension(client_id): Extension, - Path(dataset_id): Path, + Path(dataset_id): Path ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -379,7 +365,7 @@ async fn get_head( async fn get_retention( Extension(app): Extension, Extension(client_id): Extension, - Path(dataset_id): Path, + Path(dataset_id): Path ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -396,7 +382,7 @@ async fn set_retention( Extension(app): Extension, Extension(client_id): Extension, Path(dataset_id): Path, - Json(strategy): Json, + Json(strategy): Json ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -420,7 +406,7 @@ async fn set_retention( async fn get_status( Extension(app): Extension, Extension(client_id): Extension, - Path(dataset_id): Path, + Path(dataset_id): Path ) -> impl IntoResponse { let read_status = |ctl: Arc| -> anyhow::Result<_> { let db = app.db.snapshot(); @@ -462,7 +448,7 @@ async fn get_status( let ctl = get_dataset!(app, dataset_id); match read_status(ctl) { Ok(status) => json_ok!(status), - Err(err) => text!(StatusCode::INTERNAL_SERVER_ERROR, "{:?}", err), + Err(err) => text!(StatusCode::INTERNAL_SERVER_ERROR, "{:?}", err) } }) } @@ -470,7 +456,7 @@ async fn get_status( async fn get_metadata( Extension(app): Extension, Extension(client_id): Extension, - Path(dataset_id): Path, + Path(dataset_id): Path ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -483,7 +469,7 @@ async fn get_metadata( let first_chunk = match db.get_first_chunk(dataset_id) { Ok(chunk) => chunk, - Err(err) => return text!(StatusCode::INTERNAL_SERVER_ERROR, "{:?}", err), + Err(err) => return text!(StatusCode::INTERNAL_SERVER_ERROR, "{:?}", err) }; json_ok!(serde_json::json! {{ @@ -497,12 +483,11 @@ async fn get_metadata( async fn get_metrics( Extension(app): Extension, - Extension(client_id): Extension, + Extension(client_id): Extension ) -> impl IntoResponse { let mut metrics = String::new(); - prometheus_client::encoding::text::encode(&mut metrics, &app.metrics_registry) - .expect("String IO is infallible"); + prometheus_client::encoding::text::encode(&mut metrics, &app.metrics_registry).expect("String IO is infallible"); ResponseWithMetadata::new() .with_client_id(&client_id) @@ -512,7 +497,7 @@ async fn get_metrics( async fn get_rocks_stats( Extension(app): Extension, - Extension(client_id): Extension, + Extension(client_id): Extension ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -521,10 +506,7 @@ async fn get_rocks_stats( if let Some(stats) = app.db.get_statistics() { stats.into_response() } else { - text!( - StatusCode::INTERNAL_SERVER_ERROR, - "rocksdb stats are not enabled" - ) + text!(StatusCode::INTERNAL_SERVER_ERROR, "rocksdb stats are not enabled") } }) } @@ -532,7 +514,7 @@ async fn get_rocks_stats( async fn get_rocks_prop( Extension(app): Extension, Extension(client_id): Extension, - Path((cf, name)): Path<(String, String)>, + Path((cf, name)): Path<(String, String)> ) -> impl IntoResponse { ResponseWithMetadata::new() .with_client_id(&client_id) @@ -540,7 +522,7 @@ async fn get_rocks_prop( .with_response(|| match app.db.get_property(&cf, &name) { Ok(Some(s)) => s.into_response(), Ok(None) => text!(StatusCode::NOT_FOUND, "property not found"), - Err(err) => text!(StatusCode::INTERNAL_SERVER_ERROR, "{}", err), + Err(err) => text!(StatusCode::INTERNAL_SERVER_ERROR, "{}", err) }) } diff --git a/crates/hotblocks/src/cli.rs b/crates/hotblocks/src/cli.rs index 80687984..94fea87a 100644 --- a/crates/hotblocks/src/cli.rs +++ b/crates/hotblocks/src/cli.rs @@ -1,13 +1,19 @@ -use crate::data_service::{DataService, DataServiceRef}; -use crate::dataset_config::{DatasetConfig, RetentionConfig}; -use crate::metrics::DatasetMetricsCollector; -use crate::query::{QueryService, QueryServiceRef}; -use crate::types::DBRef; +use std::{ + collections::{BTreeSet, HashSet}, + sync::Arc +}; + use anyhow::Context; use clap::Parser; use sqd_storage::db::{DatabaseSettings, DatasetId}; -use std::collections::{BTreeSet, HashSet}; -use std::sync::Arc; + +use crate::{ + data_service::{DataService, DataServiceRef}, + dataset_config::{DatasetConfig, RetentionConfig}, + metrics::DatasetMetricsCollector, + query::{QueryService, QueryServiceRef}, + types::DBRef +}; #[derive(Parser, Debug)] #[command(version, about, long_about = None)] @@ -50,7 +56,7 @@ pub struct CLI { /// Known client IDs for metrics labeling. Client IDs not in this list /// will be reported as "unknown" to prevent metrics cardinality abuse. #[arg(long = "known-client", value_name = "ID")] - pub known_clients: Vec, + pub known_clients: Vec } pub struct App { @@ -59,13 +65,12 @@ pub struct App { pub query_service: QueryServiceRef, pub api_controlled_datasets: BTreeSet, pub metrics_registry: prometheus_client::registry::Registry, - pub known_clients: HashSet, + pub known_clients: HashSet } impl CLI { pub async fn build_app(&self) -> anyhow::Result { - let datasets = DatasetConfig::read_config_file(&self.datasets) - .context("failed to read datasets config")?; + let datasets = DatasetConfig::read_config_file(&self.datasets).context("failed to read datasets config")?; let db = DatabaseSettings::default() .with_data_cache_size(self.data_cache_size) @@ -77,8 +82,8 @@ impl CLI { let mut metrics_registry = crate::metrics::build_metrics_registry(); metrics_registry.register_collector(Box::new(DatasetMetricsCollector { - db: db.clone(), - datasets: datasets.keys().copied().collect(), + db: db.clone(), + datasets: datasets.keys().copied().collect() })); let api_controlled_datasets = datasets @@ -86,9 +91,7 @@ impl CLI { .filter_map(|(id, cfg)| (cfg.retention_strategy == RetentionConfig::Api).then_some(*id)) .collect(); - let data_service = DataService::start(db.clone(), datasets) - .await - .map(Arc::new)?; + let data_service = DataService::start(db.clone(), datasets).await.map(Arc::new)?; let query_service = { let mut builder = QueryService::builder(db.clone()); @@ -115,7 +118,7 @@ impl CLI { query_service, api_controlled_datasets, metrics_registry, - known_clients, + known_clients }) } } diff --git a/crates/hotblocks/src/data_service.rs b/crates/hotblocks/src/data_service.rs index db3e2f3c..a95256c1 100644 --- a/crates/hotblocks/src/data_service.rs +++ b/crates/hotblocks/src/data_service.rs @@ -1,25 +1,27 @@ -use crate::dataset_config::{DatasetConfig, RetentionConfig}; -use crate::dataset_controller::DatasetController; -use crate::errors::UnknownDataset; -use crate::types::{DBRef, RetentionStrategy}; -use anyhow::{anyhow, Context}; -use futures::FutureExt; -use futures::{StreamExt, TryStreamExt}; +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc +}; + +use anyhow::{Context, anyhow}; +use futures::{FutureExt, StreamExt, TryStreamExt}; use sqd_data_client::reqwest::ReqwestDataClient; use sqd_storage::db::DatasetId; use tracing::{error, info}; -use std::collections::{BTreeMap, HashMap}; -use std::sync::Arc; +use crate::{ + dataset_config::{DatasetConfig, RetentionConfig}, + dataset_controller::DatasetController, + errors::UnknownDataset, + types::{DBRef, RetentionStrategy} +}; pub type DataServiceRef = Arc; - pub struct DataService { - datasets: HashMap>, + datasets: HashMap> } - impl DataService { pub async fn start(db: DBRef, datasets: BTreeMap) -> anyhow::Result { let all_datasets = db.get_all_datasets()?; @@ -38,35 +40,27 @@ impl DataService { let http_client = sqd_data_client::reqwest::default_http_client(); - let data_sources = cfg.data_sources.into_iter() + let data_sources = cfg + .data_sources + .into_iter() .map(|url| ReqwestDataClient::new(http_client.clone(), url)) .collect(); let retention = match cfg.retention_strategy { - RetentionConfig::FromBlock { number, parent_hash } => RetentionStrategy::FromBlock { - number , - parent_hash - }, + RetentionConfig::FromBlock { number, parent_hash } => { + RetentionStrategy::FromBlock { number, parent_hash } + } RetentionConfig::Head(n) => RetentionStrategy::Head(n), RetentionConfig::Api | RetentionConfig::None => RetentionStrategy::None }; tokio::task::spawn_blocking(move || { - DatasetController::new( - db, - dataset_id, - cfg.kind, - retention, - data_sources - ).map(|c| { + DatasetController::new(db, dataset_id, cfg.kind, retention, data_sources).map(|c| { c.enable_compaction(!cfg.disable_compaction); Arc::new(c) }) - }).map(move |res| { - res.with_context(|| { - anyhow!("failed to initialize dataset {}", dataset_id) - }) }) + .map(move |res| res.with_context(|| anyhow!("failed to initialize dataset {}", dataset_id))) }) .buffered(5); @@ -76,16 +70,13 @@ impl DataService { datasets.insert(ctl.dataset_id(), ctl); } - Ok(Self { - datasets - }) + Ok(Self { datasets }) } pub fn get_dataset(&self, dataset_id: DatasetId) -> Result, UnknownDataset> { - self.datasets.get(&dataset_id) + self.datasets + .get(&dataset_id) .map(Arc::clone) - .ok_or(UnknownDataset { - dataset_id - }) + .ok_or(UnknownDataset { dataset_id }) } -} \ No newline at end of file +} diff --git a/crates/hotblocks/src/dataset_config.rs b/crates/hotblocks/src/dataset_config.rs index 204e73b2..5ae1723e 100644 --- a/crates/hotblocks/src/dataset_config.rs +++ b/crates/hotblocks/src/dataset_config.rs @@ -1,26 +1,26 @@ -use crate::types::DatasetKind; +use std::collections::BTreeMap; + use serde::{Deserialize, Serialize}; use sqd_query::BlockNumber; use sqd_storage::db::DatasetId; -use std::collections::BTreeMap; use url::Url; +use crate::types::DatasetKind; #[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] pub enum RetentionConfig { // Fixed, starting from the block number FromBlock { number: BlockNumber, - parent_hash: Option, + parent_hash: Option }, // Moving window that keeps up to N blocks Head(u64), // Retention is set dynamically from the portal Api, - None, + None } - #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct DatasetConfig { @@ -28,10 +28,9 @@ pub struct DatasetConfig { pub retention_strategy: RetentionConfig, #[serde(default)] pub disable_compaction: bool, - pub data_sources: Vec, + pub data_sources: Vec } - impl DatasetConfig { pub fn read_config_file(file: &str) -> anyhow::Result> { let reader = std::io::BufReader::new(std::fs::File::open(file)?); diff --git a/crates/hotblocks/src/dataset_controller/dataset_controller.rs b/crates/hotblocks/src/dataset_controller/dataset_controller.rs index a55a5f9f..8568b79a 100644 --- a/crates/hotblocks/src/dataset_controller/dataset_controller.rs +++ b/crates/hotblocks/src/dataset_controller/dataset_controller.rs @@ -1,22 +1,22 @@ -use crate::dataset_controller::ingest::ingest; -use crate::dataset_controller::ingest_generic::{IngestMessage, NewChunk}; -use crate::dataset_controller::write_controller::WriteController; -use crate::types::{DBRef, DatasetKind, RetentionStrategy}; +use std::{collections::BTreeMap, ops::Add, time::Duration}; + use anyhow::{Context, anyhow}; -use futures::future::BoxFuture; -use futures::stream::FuturesUnordered; -use futures::{FutureExt, StreamExt}; +use futures::{FutureExt, StreamExt, future::BoxFuture, stream::FuturesUnordered}; use sqd_data_client::reqwest::ReqwestDataClient; use sqd_primitives::{BlockNumber, BlockRef}; use sqd_storage::db::{Chunk, CompactionStatus, DatasetId}; -use std::collections::BTreeMap; -use std::ops::Add; -use std::time::Duration; -use tokio::select; -use tokio::task::JoinHandle; -use tokio::time::Instant; +use tokio::{select, task::JoinHandle, time::Instant}; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; +use crate::{ + dataset_controller::{ + ingest::ingest, + ingest_generic::{IngestMessage, NewChunk}, + write_controller::WriteController + }, + types::{DBRef, DatasetKind, RetentionStrategy} +}; + pub struct DatasetController { dataset_id: DatasetId, dataset_kind: DatasetKind, @@ -25,7 +25,7 @@ pub struct DatasetController { finalized_head_receiver: tokio::sync::watch::Receiver>, compaction_enabled_sender: tokio::sync::watch::Sender, task: JoinHandle<()>, - compaction_task: JoinHandle<()>, + compaction_task: JoinHandle<()> } impl Drop for DatasetController { @@ -42,23 +42,18 @@ impl DatasetController { dataset_id: DatasetId, dataset_kind: DatasetKind, retention: RetentionStrategy, - data_sources: Vec, + data_sources: Vec ) -> anyhow::Result { let mut write = WriteController::new(db.clone(), dataset_id, dataset_kind)?; - if let RetentionStrategy::FromBlock { - number, - parent_hash, - } = &retention - { + if let RetentionStrategy::FromBlock { number, parent_hash } = &retention { write.init_retention(*number, parent_hash.clone())?; } let (retention_sender, retention_recv) = tokio::sync::watch::channel(retention); let (head_sender, head_receiver) = tokio::sync::watch::channel(None); let (finalized_head_sender, finalized_head_receiver) = tokio::sync::watch::channel(None); - let (compaction_enabled_sender, compaction_enabled_receiver) = - tokio::sync::watch::channel(false); + let (compaction_enabled_sender, compaction_enabled_receiver) = tokio::sync::watch::channel(false); let _ = head_sender.send(write.head().cloned()); let _ = finalized_head_sender.send(write.finalized_head().cloned()); @@ -70,14 +65,13 @@ impl DatasetController { data_sources, retention_recv, head_sender, - finalized_head_sender, + finalized_head_sender }; let task = tokio::spawn(ctl.run(write).in_current_span()); - let compaction_task = tokio::spawn( - compaction_loop(db, dataset_id, compaction_enabled_receiver).in_current_span(), - ); + let compaction_task = + tokio::spawn(compaction_loop(db, dataset_id, compaction_enabled_receiver).in_current_span()); Ok(Self { dataset_id, @@ -87,7 +81,7 @@ impl DatasetController { finalized_head_receiver, compaction_enabled_sender, task, - compaction_task, + compaction_task }) } @@ -152,7 +146,7 @@ struct WriteCtx { db: DBRef, write: WriteController, head_sender: tokio::sync::watch::Sender>, - finalized_head_sender: tokio::sync::watch::Sender>, + finalized_head_sender: tokio::sync::watch::Sender> } impl WriteCtx { @@ -179,7 +173,7 @@ impl WriteCtx { } IngestMessage::Fork { prev_blocks, - rollback_sender, + rollback_sender } => { self.write.compute_rollback(&prev_blocks).map(|rollback| { let _ = rollback_sender.send(rollback); @@ -216,18 +210,13 @@ impl WriteCtx { last_block_hash: new_chunk.last_block_hash, first_block_time: new_chunk.first_block_time, last_block_time: new_chunk.last_block_time, - tables, + tables }; - self.write - .new_chunk(new_chunk.finalized_head.as_ref(), &chunk) + self.write.new_chunk(new_chunk.finalized_head.as_ref(), &chunk) } - fn retain( - &mut self, - from_block: BlockNumber, - parent_hash: Option, - ) -> anyhow::Result<()> { + fn retain(&mut self, from_block: BlockNumber, parent_hash: Option) -> anyhow::Result<()> { self.write.retain(from_block, parent_hash)?; self.notify_finalized_head(); self.notify_head(); @@ -239,10 +228,7 @@ impl WriteCtx { } fn notify_finalized_head(&self) { - send_if_new( - &self.finalized_head_sender, - self.write.finalized_head().cloned(), - ) + send_if_new(&self.finalized_head_sender, self.write.finalized_head().cloned()) } fn starts_at(&self, block_number: BlockNumber, parent_hash: &Option) -> bool { @@ -265,25 +251,25 @@ fn send_if_new(sender: &tokio::sync::watch::Sender, value: T) { enum State { Idle, Init { - head: Option, + head: Option }, HeadProbe { future: BoxFuture<'static, BlockNumber>, - head: u64, + head: u64 }, Ingest { handle: IngestHandle, - head: Option, + head: Option }, IngestPause { until: Instant, - head: Option, - }, + head: Option + } } struct IngestHandle { msg_recv: tokio::sync::mpsc::Receiver, - task: JoinHandle>, + task: JoinHandle> } impl Drop for IngestHandle { @@ -299,7 +285,7 @@ struct Ctl { data_sources: Vec, retention_recv: tokio::sync::watch::Receiver, head_sender: tokio::sync::watch::Sender>, - finalized_head_sender: tokio::sync::watch::Sender>, + finalized_head_sender: tokio::sync::watch::Sender> } macro_rules! warn_on_tx_restart { @@ -356,10 +342,7 @@ impl Ctl { // need this variable to please the compiler let retention = self.retention_recv.borrow_and_update().clone(); let mut state = match retention { - RetentionStrategy::FromBlock { - number, - parent_hash, - } => { + RetentionStrategy::FromBlock { number, parent_hash } => { if !write.starts_at(number, &parent_hash) { blocking! { write.retain(number, parent_hash) @@ -386,12 +369,12 @@ impl Ctl { state = if let Some(n) = head { State::HeadProbe { future: fetch_chain_top(self.data_sources.clone()).boxed(), - head: *n, + head: *n } } else { State::Ingest { handle: self.spawn_ingest(&write), - head: None, + head: None } } } @@ -473,32 +456,25 @@ impl Ctl { } } - async fn handle_retention_change( - &mut self, - state: &mut State, - mut write: WriteCtx, - ) -> anyhow::Result { + async fn handle_retention_change(&mut self, state: &mut State, mut write: WriteCtx) -> anyhow::Result { // need this variable to please the compiler let retention = self.retention_recv.borrow_and_update().clone(); match retention { - RetentionStrategy::FromBlock { - number, - parent_hash, - } => { + RetentionStrategy::FromBlock { number, parent_hash } => { let will_erase_head = write.write.head().map_or(false, |h| h.number < number) || // FromBlock is greater than current head, so everything is cleared write.write.start_block() > number; // FromBlock is less than current front, dropping everything by design blocking_write!(write, write.retain(number, parent_hash))?; match state { State::Ingest { .. } if !will_erase_head => {} // Keep ingesting, head is valid - _ => *state = State::Init { head: None }, // New ingest needed + _ => *state = State::Init { head: None } // New ingest needed } } RetentionStrategy::Head(n) => match state { State::HeadProbe { head, .. } => *head = n, State::Ingest { head, .. } if head.is_some() => *head = Some(n), - _ => *state = State::Init { head: Some(n) }, + _ => *state = State::Init { head: Some(n) } }, - RetentionStrategy::None => *state = State::Idle, + RetentionStrategy::None => *state = State::Idle } Ok(write) } @@ -514,18 +490,15 @@ impl Ctl { self.data_sources.clone(), self.dataset_kind, write.write.next_block(), - write.write.head_hash(), + write.write.head_hash() ) - .instrument(ingest_span), + .instrument(ingest_span) ); IngestHandle { msg_recv, task } } - async fn new_write_ctx( - &self, - maybe_write: Option, - ) -> anyhow::Result { + async fn new_write_ctx(&self, maybe_write: Option) -> anyhow::Result { let db = self.db.clone(); let dataset_id = self.dataset_id; let dataset_kind = self.dataset_kind; @@ -546,15 +519,13 @@ impl Ctl { db: self.db.clone(), write, head_sender: self.head_sender.clone(), - finalized_head_sender: self.finalized_head_sender.clone(), + finalized_head_sender: self.finalized_head_sender.clone() }) } } async fn fetch_chain_top(clients: Vec) -> BlockNumber { - let mut calls: FuturesUnordered<_> = (0..clients.len()) - .map(|i| call_client(&clients, i, false)) - .collect(); + let mut calls: FuturesUnordered<_> = (0..clients.len()).map(|i| call_client(&clients, i, false)).collect(); let mut completed = 0; let mut last_seen = 0; @@ -605,7 +576,7 @@ async fn fetch_chain_top(clients: Vec) -> BlockNumber { async fn call_client( clients: &[ReqwestDataClient], idx: usize, - backoff: bool, + backoff: bool ) -> (anyhow::Result, usize) { if backoff { tokio::time::sleep(Duration::from_secs(5)).await; @@ -621,11 +592,7 @@ async fn fetch_chain_top(clients: Vec) -> BlockNumber { } #[instrument(name = "compaction", skip_all)] -async fn compaction_loop( - db: DBRef, - dataset_id: DatasetId, - mut enabled: tokio::sync::watch::Receiver, -) { +async fn compaction_loop(db: DBRef, dataset_id: DatasetId, mut enabled: tokio::sync::watch::Receiver) { let mut skips = 0; let skip_pause = [1, 2, 3, 4, 5, 10, 20, 30, 60]; loop { @@ -642,7 +609,7 @@ async fn compaction_loop( .await { Ok(res) => res, - Err(err) => Err(anyhow!("failed to await compaction task - {}", err)), + Err(err) => Err(anyhow!("failed to await compaction task - {}", err)) }; match result { diff --git a/crates/hotblocks/src/dataset_controller/ingest.rs b/crates/hotblocks/src/dataset_controller/ingest.rs index 8a1dd5c4..7c9ac7ff 100644 --- a/crates/hotblocks/src/dataset_controller/ingest.rs +++ b/crates/hotblocks/src/dataset_controller/ingest.rs @@ -1,13 +1,14 @@ -use crate::dataset_controller::ingest_generic::{IngestGeneric, IngestMessage}; -use crate::types::DatasetKind; use bytes::Bytes; -use futures::future::BoxFuture; -use futures::FutureExt; +use futures::{FutureExt, future::BoxFuture}; use serde::de::DeserializeOwned; use sqd_data_client::reqwest::ReqwestDataClient; use sqd_data_source::{DataSource, StandardDataSource}; use sqd_primitives::BlockNumber; +use crate::{ + dataset_controller::ingest_generic::{IngestGeneric, IngestMessage}, + types::DatasetKind +}; pub fn ingest<'a, 'b>( message_sender: tokio::sync::mpsc::Sender, @@ -15,30 +16,25 @@ pub fn ingest<'a, 'b>( dataset_kind: DatasetKind, first_block: BlockNumber, parent_block_hash: Option<&'a str> -) -> BoxFuture<'b, anyhow::Result<()>> -{ +) -> BoxFuture<'b, anyhow::Result<()>> { macro_rules! run { ($builder:expr) => {{ let mut data_source = StandardDataSource::new(sources, from_json_bytes); data_source.set_position(first_block, parent_block_hash); - IngestGeneric::new( - data_source, - $builder, - message_sender - ).run().boxed() + IngestGeneric::new(data_source, $builder, message_sender).run().boxed() }}; } match dataset_kind { DatasetKind::Evm => { run!(sqd_data::evm::tables::EvmChunkBuilder::new()) - }, + } DatasetKind::Solana => { run!(sqd_data::solana::tables::SolanaChunkBuilder::new()) - }, + } DatasetKind::Bitcoin => { run!(sqd_data::bitcoin::tables::BitcoinChunkBuilder::new()) - }, + } DatasetKind::HyperliquidFills => { run!(sqd_data::hyperliquid_fills::tables::HyperliquidFillsChunkBuilder::new()) } @@ -51,7 +47,6 @@ pub fn ingest<'a, 'b>( } } - fn from_json_bytes(bytes: Bytes) -> anyhow::Result { serde_json::from_slice(&bytes).map_err(|err| err.into()) -} \ No newline at end of file +} diff --git a/crates/hotblocks/src/dataset_controller/ingest_generic.rs b/crates/hotblocks/src/dataset_controller/ingest_generic.rs index b64cf74a..dbc7f863 100644 --- a/crates/hotblocks/src/dataset_controller/ingest_generic.rs +++ b/crates/hotblocks/src/dataset_controller/ingest_generic.rs @@ -1,14 +1,14 @@ -use crate::dataset_controller::write_controller::Rollback; +use std::fmt::{Display, Formatter}; + use anyhow::{anyhow, ensure}; use chrono::{DateTime, Utc}; use futures::StreamExt; use sqd_data_core::{BlockChunkBuilder, ChunkProcessor, PreparedChunk}; use sqd_data_source::{DataEvent, DataSource}; use sqd_primitives::{Block, BlockNumber, BlockRef, DataMask, DisplayBlockRefOption}; -use std::fmt::{Display, Formatter}; -use tracing::field::valuable; -use tracing::info; +use tracing::{field::valuable, info}; +use crate::dataset_controller::write_controller::Rollback; pub enum IngestMessage { FinalizedHead(BlockRef), @@ -19,7 +19,6 @@ pub enum IngestMessage { } } - pub struct NewChunk { pub finalized_head: Option, pub parent_block_hash: String, @@ -31,27 +30,24 @@ pub struct NewChunk { pub tables: PreparedChunk } - impl Display for NewChunk { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( - f, + f, "{}-{}-{} with finalized_head = {}", - self.first_block, - self.last_block, + self.first_block, + self.last_block, &self.last_block_hash, DisplayBlockRefOption(self.finalized_head.as_ref()) ) } } - struct DataBuilder { builder: CB, processor: Option } - impl DataBuilder { pub fn new(builder: CB) -> Self { Self { @@ -93,7 +89,6 @@ impl DataBuilder { } } - pub struct IngestGeneric { message_sender: tokio::sync::mpsc::Sender, data_source: DC, @@ -109,18 +104,12 @@ pub struct IngestGeneric { data_mask: DataMask } - impl IngestGeneric where DS: DataSource, CB: BlockChunkBuilder + Send + 'static { - pub fn new( - data_source: DS, - chunk_builder: CB, - message_sender: tokio::sync::mpsc::Sender - ) -> Self - { + pub fn new(data_source: DS, chunk_builder: CB, message_sender: tokio::sync::mpsc::Sender) -> Self { let first_block = data_source.get_next_block(); Self { message_sender, @@ -144,11 +133,9 @@ where DataEvent::FinalizedHead(head) => { self.set_finalized_head(head.number, &head.hash); if head.number < self.first_block { - self.message_sender.send( - IngestMessage::FinalizedHead(head) - ).await?; + self.message_sender.send(IngestMessage::FinalizedHead(head)).await?; } - }, + } DataEvent::Block { block, is_final } => { let data_mask = block.data_availability_mask(); if self.data_mask != data_mask { @@ -157,13 +144,9 @@ where } self.push_block(block, is_final)?; self.maybe_flush().await? - }, - DataEvent::Fork(prev_blocks) => { - self.handle_fork(prev_blocks).await? - }, - DataEvent::MaybeOnHead => { - self.flush().await? } + DataEvent::Fork(prev_blocks) => self.handle_fork(prev_blocks).await?, + DataEvent::MaybeOnHead => self.flush().await? } } Ok(()) @@ -174,14 +157,16 @@ where let (rollback_sender, rollback_recv) = tokio::sync::oneshot::channel(); - self.message_sender.send(IngestMessage::Fork { - prev_blocks, - rollback_sender - }).await?; + self.message_sender + .send(IngestMessage::Fork { + prev_blocks, + rollback_sender + }) + .await?; self.with_blocking_builder(|b| b.clear()).await; let rollback = rollback_recv.await?; - + info!( block_number = rollback.first_block, parent_block_hash =? rollback.parent_block_hash, @@ -191,8 +176,9 @@ where self.buffered_blocks = 0; self.finalized_head = None; self.first_block = rollback.first_block; - self.data_source.set_position(rollback.first_block, rollback.parent_block_hash.as_deref()); - + self.data_source + .set_position(rollback.first_block, rollback.parent_block_hash.as_deref()); + Ok(()) } @@ -233,7 +219,7 @@ where async fn flush(&mut self) -> anyhow::Result<()> { if self.buffered_blocks == 0 { - return Ok(()) + return Ok(()); } let parent_block_hash = self.parent_block_hash.clone(); @@ -241,11 +227,8 @@ where let last_block = self.last_block; let last_block_hash = self.last_block_hash.clone(); - let last_block_time = DateTime::::from_timestamp_millis( - self.last_block_time.unwrap_or(0) - ).ok_or_else(|| { - anyhow!("block time is out of range") - })?; + let last_block_time = DateTime::::from_timestamp_millis(self.last_block_time.unwrap_or(0)) + .ok_or_else(|| anyhow!("block time is out of range"))?; info!( first_block = first_block, @@ -263,16 +246,18 @@ where self.buffered_blocks = 0; self.first_block = last_block + 1; - self.message_sender.send(IngestMessage::NewChunk(NewChunk { - finalized_head: self.finalized_head.clone(), - parent_block_hash, - first_block, - last_block, - last_block_hash, - first_block_time: self.first_block_time, - last_block_time: self.last_block_time, - tables - })).await?; + self.message_sender + .send(IngestMessage::NewChunk(NewChunk { + finalized_head: self.finalized_head.clone(), + parent_block_hash, + first_block, + last_block, + last_block_hash, + first_block_time: self.first_block_time, + last_block_time: self.last_block_time, + tables + })) + .await?; Ok(()) } @@ -287,7 +272,9 @@ where let (result, builder) = tokio::task::spawn_blocking(move || { let result = cb(&mut builder); (result, builder) - }).await.unwrap(); + }) + .await + .unwrap(); self.builder = Some(builder); @@ -310,4 +297,4 @@ where }) } } -} \ No newline at end of file +} diff --git a/crates/hotblocks/src/dataset_controller/mod.rs b/crates/hotblocks/src/dataset_controller/mod.rs index 4833368b..9fe3502a 100644 --- a/crates/hotblocks/src/dataset_controller/mod.rs +++ b/crates/hotblocks/src/dataset_controller/mod.rs @@ -3,5 +3,4 @@ mod ingest; mod ingest_generic; mod write_controller; - pub use dataset_controller::DatasetController; diff --git a/crates/hotblocks/src/dataset_controller/write_controller.rs b/crates/hotblocks/src/dataset_controller/write_controller.rs index c54d4e33..1d74f5c6 100644 --- a/crates/hotblocks/src/dataset_controller/write_controller.rs +++ b/crates/hotblocks/src/dataset_controller/write_controller.rs @@ -1,10 +1,9 @@ -use crate::types::{DBRef, DatasetKind}; use anyhow::{anyhow, bail, ensure}; use sqd_primitives::{BlockNumber, BlockRef}; use sqd_storage::db::{Chunk as StorageChunk, Chunk, DatasetId}; -use tracing::field::valuable; -use tracing::{info, instrument, warn}; +use tracing::{field::valuable, info, instrument, warn}; +use crate::types::{DBRef, DatasetKind}; #[derive(Debug)] pub struct Rollback { @@ -12,7 +11,6 @@ pub struct Rollback { pub parent_block_hash: Option } - #[derive(Debug)] pub struct WriteController { db: DBRef, @@ -25,21 +23,15 @@ pub struct WriteController { finalized_head: Option } - impl WriteController { - pub fn new( - db: DBRef, - dataset_id: DatasetId, - dataset_kind: DatasetKind - ) -> anyhow::Result - { + pub fn new(db: DBRef, dataset_id: DatasetId, dataset_kind: DatasetKind) -> anyhow::Result { db.create_dataset_if_not_exists(dataset_id, dataset_kind.storage_kind())?; let snapshot = db.snapshot(); let label = snapshot.get_label(dataset_id)?; let first_chunk = snapshot.get_first_chunk(dataset_id)?; let last_chunk = snapshot.get_last_chunk(dataset_id)?; - + Ok(Self { db: db.clone(), dataset_id, @@ -51,11 +43,11 @@ impl WriteController { finalized_head: label.and_then(|l| l.finalized_head().cloned()) }) } - + pub fn dataset_kind(&self) -> DatasetKind { self.dataset_kind } - + pub fn start_block(&self) -> BlockNumber { self.first_block } @@ -69,7 +61,8 @@ impl WriteController { } pub fn head_hash(&self) -> Option<&str> { - self.head.as_ref() + self.head + .as_ref() .map(|h| h.hash.as_str()) .or_else(|| self.start_block_parent_hash()) } @@ -77,7 +70,7 @@ impl WriteController { pub fn head(&self) -> Option<&BlockRef> { self.head.as_ref() } - + pub fn finalized_head(&self) -> Option<&BlockRef> { self.finalized_head.as_ref() } @@ -85,7 +78,7 @@ impl WriteController { pub fn first_chunk_head(&self) -> Option<&BlockRef> { self.first_chunk_head.as_ref() } - + pub fn compute_rollback(&self, mut prev: &[BlockRef]) -> anyhow::Result { // FIXME: self.first_block rollback limit ensure!(!prev.is_empty(), "no previous blocks where provided"); @@ -95,11 +88,11 @@ impl WriteController { ); let snapshot = self.db.snapshot(); - - let label = snapshot.get_label(self.dataset_id)?.ok_or_else(|| { - anyhow!("dataset {} no longer exists", self.dataset_id) - })?; - + + let label = snapshot + .get_label(self.dataset_id)? + .ok_or_else(|| anyhow!("dataset {} no longer exists", self.dataset_id))?; + if let Some(finalized_head) = label.finalized_head() { let pos = match prev.iter().position(|b| b.number >= finalized_head.number) { Some(pos) => pos, @@ -111,37 +104,35 @@ impl WriteController { prev = &prev[pos..] } - let existing_chunks = snapshot.list_chunks( - self.dataset_id, - 0, - Some(prev.last().unwrap().number) - ).into_reversed(); + let existing_chunks = snapshot + .list_chunks(self.dataset_id, 0, Some(prev.last().unwrap().number)) + .into_reversed(); let mut prev_blocks = prev.iter().rev().peekable(); for chunk_result in existing_chunks { let head = chunk_result?; - + if prev_blocks.peek().map_or(false, |b| b.number < head.last_block()) { - continue + continue; } - + while prev_blocks.peek().map_or(false, |b| b.number > head.last_block()) { prev_blocks.next(); } - + if let Some(&b) = prev_blocks.peek() { if b.number == head.last_block() && b.hash == head.last_block_hash() { return Ok(Rollback { first_block: b.number + 1, parent_block_hash: Some(b.hash.clone()) - }) + }); } } else { return Ok(Rollback { first_block: head.last_block() + 1, parent_block_hash: Some(head.last_block_hash().to_string()) - }) + }); } } @@ -157,8 +148,7 @@ impl WriteController { from_block: BlockNumber, parent_block_hash: Option, delete_mismatch: bool - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { #[derive(Eq, PartialEq)] enum Status { Range { @@ -210,12 +200,14 @@ impl WriteController { ); } } else { - let head = tx.list_chunks(0, None) + let head = tx + .list_chunks(0, None) .into_reversed() .next() .expect("bottom chunk can't exist without head chunk")?; - let finalized_head = tx.label() + let finalized_head = tx + .label() .finalized_head() .filter(|h| chunk.first_block() <= h.number) .cloned(); @@ -228,7 +220,7 @@ impl WriteController { first_chunk: chunk, head, finalized_head - }) + }); } } } @@ -250,25 +242,24 @@ impl WriteController { first_chunk.first_block(), head.last_block() ); - }, + } Status::HashMismatch => { self.clear_heads(); warn!("cleared dataset due to parent block hash mismatch") - }, + } Status::Gap(existed) => { self.clear_heads(); warn!( "cleared dataset, because there was a gap between first requested block {} and already existed {}", - from_block, - existed + from_block, existed ) - }, + } Status::Clear => { self.clear_heads(); info!("dataset was cleared") } } - + self.first_block = from_block; self.parent_block_hash = parent_block_hash; Ok(()) @@ -293,10 +284,8 @@ impl WriteController { block_hash = %new_finalized_head.hash ))] pub fn finalize(&mut self, new_finalized_head: &BlockRef) -> anyhow::Result<()> { - let Some(head) = self.head.as_ref() else { - return Ok(()) - }; - + let Some(head) = self.head.as_ref() else { return Ok(()) }; + let update = self.db.update_dataset(self.dataset_id, |tx| { ensure!( tx.label().finalized_head() == self.finalized_head.as_ref(), @@ -305,18 +294,15 @@ impl WriteController { if let Some(current) = tx.label().finalized_head() { if current.number > new_finalized_head.number { - return Ok(None) + return Ok(None); } if current.number == new_finalized_head.number { ensure!(current.hash == new_finalized_head.hash); - return Ok(None) + return Ok(None); } } - let maybe_head_chunk = tx.list_chunks(0, None) - .into_reversed() - .next() - .transpose()?; + let maybe_head_chunk = tx.list_chunks(0, None).into_reversed().next().transpose()?; let head_chunk = match maybe_head_chunk { Some(c) if c.last_block_hash() == head.hash => c, @@ -331,14 +317,18 @@ impl WriteController { } else { new_finalized_head.clone() }; - + tx.set_finalized_head(new_finalized_head.clone()); - + Ok(Some(new_finalized_head)) })?; if let Some(new_head) = update { - info!(block_number = new_head.number, block_hash = new_head.hash, "saved new finalized head"); + info!( + block_number = new_head.number, + block_hash = new_head.hash, + "saved new finalized head" + ); self.finalized_head = Some(new_head); } else { info!("finalized head was ignored") @@ -353,24 +343,13 @@ impl WriteController { last_block_hash = %chunk.last_block_hash(), finalized_head = valuable(&finalized_head), ))] - pub fn new_chunk( - &mut self, - finalized_head: Option<&BlockRef>, - chunk: &StorageChunk - ) -> anyhow::Result<()> - { + pub fn new_chunk(&mut self, finalized_head: Option<&BlockRef>, chunk: &StorageChunk) -> anyhow::Result<()> { // FIXME: accept self.first_block rollback limit let finalized_head = self.db.update_dataset(self.dataset_id, |tx| { let new_finalized_head = match (finalized_head, tx.label().finalized_head()) { - (Some(new), None) => { - Some(new) - }, - (Some(new), Some(current)) if new.number >= current.number => { - Some(new) - }, - (_, Some(current)) if current.number < chunk.first_block() => { - Some(current) - }, + (Some(new), None) => Some(new), + (Some(new), Some(current)) if new.number >= current.number => Some(new), + (_, Some(current)) if current.number < chunk.first_block() => Some(current), (_, Some(_)) => bail!( "can't fork safely, because fork base is below the current finalized head \ and finalized head of the data pack is below the current" @@ -391,25 +370,25 @@ impl WriteController { Ok(new_finalized_head) })?; - info!( - finalized_head = valuable(&finalized_head), - "saved new chunk" - ); + info!(finalized_head = valuable(&finalized_head), "saved new chunk"); self.finalized_head = finalized_head; self.head = Some(get_chunk_head(&chunk)); - if self.first_chunk_head.as_ref().map_or(true, |h| chunk.first_block() <= h.number) { + if self + .first_chunk_head + .as_ref() + .map_or(true, |h| chunk.first_block() <= h.number) + { self.first_chunk_head = self.head.clone(); } - + Ok(()) } } - fn get_chunk_head(chunk: &Chunk) -> BlockRef { BlockRef { number: chunk.last_block(), hash: chunk.last_block_hash().to_string() } -} \ No newline at end of file +} diff --git a/crates/hotblocks/src/errors.rs b/crates/hotblocks/src/errors.rs index 66cb2adc..91621064 100644 --- a/crates/hotblocks/src/errors.rs +++ b/crates/hotblocks/src/errors.rs @@ -1,92 +1,84 @@ -use sqd_primitives::{BlockNumber, BlockRef}; -use sqd_storage::db::DatasetId; use std::fmt::{Display, Formatter}; +use sqd_primitives::{BlockNumber, BlockRef}; +use sqd_storage::db::DatasetId; #[derive(Debug)] pub struct Busy; - impl Display for Busy { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "service is busy") } } - impl std::error::Error for Busy {} - #[derive(Debug)] pub struct UnknownDataset { pub dataset_id: DatasetId } - impl Display for UnknownDataset { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "dataset {} does not exist", self.dataset_id) } } - impl std::error::Error for UnknownDataset {} - #[derive(Debug)] pub struct BlockRangeMissing { pub first_block: BlockNumber, pub last_block: BlockNumber } - impl Display for BlockRangeMissing { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( - f, "blocks from {} to {} are not available in the dataset", - self.first_block, - self.last_block + f, + "blocks from {} to {} are not available in the dataset", + self.first_block, self.last_block ) } } - impl std::error::Error for BlockRangeMissing {} - #[derive(Debug)] pub struct QueryIsAboveTheHead { pub finalized_head: Option } - impl Display for QueryIsAboveTheHead { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "first block requested by the query is above the current dataset head") + write!( + f, + "first block requested by the query is above the current dataset head" + ) } } - impl std::error::Error for QueryIsAboveTheHead {} - #[derive(Debug)] pub struct QueryKindMismatch { pub query_kind: sqd_storage::db::DatasetKind, pub dataset_kind: sqd_storage::db::DatasetKind } - impl Display for QueryKindMismatch { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "{} query was issued against {} dataset", self.query_kind, self.dataset_kind) + write!( + f, + "{} query was issued against {} dataset", + self.query_kind, self.dataset_kind + ) } } - impl std::error::Error for QueryKindMismatch {} - #[derive(Debug)] pub struct BlockItemIsNotAvailable { pub item_name: &'static str, @@ -94,18 +86,14 @@ pub struct BlockItemIsNotAvailable { pub last_block: BlockNumber } - impl Display for BlockItemIsNotAvailable { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( f, "'{}' data is not available for blocks {}..{}", - self.item_name, - self.first_block, - self.last_block + self.item_name, self.first_block, self.last_block ) } } - -impl std::error::Error for BlockItemIsNotAvailable {} \ No newline at end of file +impl std::error::Error for BlockItemIsNotAvailable {} diff --git a/crates/hotblocks/src/main.rs b/crates/hotblocks/src/main.rs index 3f802ca1..409ffecd 100644 --- a/crates/hotblocks/src/main.rs +++ b/crates/hotblocks/src/main.rs @@ -8,22 +8,20 @@ mod metrics; mod query; mod types; +use std::time::Duration; use api::build_api; use clap::Parser; use cli::CLI; -use std::time::Duration; use tracing::{debug, error, instrument}; use types::DBRef; - #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; - fn main() -> anyhow::Result<()> { let args = CLI::parse(); - + if let Some(n_threads) = args.query_threads { unsafe { sqd_polars::set_polars_thread_pool_size(n_threads); @@ -31,7 +29,7 @@ fn main() -> anyhow::Result<()> { } init_tracing(); - + tokio::runtime::Builder::new_multi_thread() .enable_all() .build()? @@ -52,20 +50,14 @@ fn main() -> anyhow::Result<()> { }) } - fn init_tracing() { use std::io::IsTerminal; - let env_filter = tracing_subscriber::EnvFilter::builder().parse_lossy( - std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV) - .unwrap_or("info".to_string()), - ); + let env_filter = tracing_subscriber::EnvFilter::builder() + .parse_lossy(std::env::var(tracing_subscriber::EnvFilter::DEFAULT_ENV).unwrap_or("info".to_string())); if std::io::stdout().is_terminal() { - tracing_subscriber::fmt() - .with_env_filter(env_filter) - .compact() - .init(); + tracing_subscriber::fmt().with_env_filter(env_filter).compact().init(); } else { tracing_subscriber::fmt() .with_env_filter(env_filter) @@ -75,12 +67,9 @@ fn init_tracing() { } } - async fn shutdown_signal() { let ctrl_c = async { - tokio::signal::ctrl_c() - .await - .expect("failed to install Ctrl+C handler"); + tokio::signal::ctrl_c().await.expect("failed to install Ctrl+C handler"); }; #[cfg(unix)] @@ -100,7 +89,6 @@ async fn shutdown_signal() { } } - #[instrument(name = "db_cleanup", skip_all)] async fn db_cleanup_task(db: DBRef) { tokio::time::sleep(Duration::from_secs(10)).await; @@ -116,9 +104,9 @@ async fn db_cleanup_task(db: DBRef) { debug!("nothing to purge, pausing cleanup for 10 seconds"); tokio::time::sleep(Duration::from_secs(10)).await; } - }, + } Ok(Err(err)) => error!(error =? err, "database cleanup task failed"), Err(_) => error!("database cleanup task panicked") } } -} \ No newline at end of file +} diff --git a/crates/hotblocks/src/metrics.rs b/crates/hotblocks/src/metrics.rs index 5c80a6f0..d5e08c0b 100644 --- a/crates/hotblocks/src/metrics.rs +++ b/crates/hotblocks/src/metrics.rs @@ -1,26 +1,25 @@ -use crate::query::QueryExecutorCollector; -use crate::types::DBRef; +use std::{fmt::Write, sync::LazyLock, time::Duration}; + use anyhow::bail; -use prometheus_client::collector::Collector; -use prometheus_client::encoding::{ - DescriptorEncoder, EncodeLabelSet, EncodeLabelValue, LabelValueEncoder, -}; -use prometheus_client::metrics::{ - MetricType, - counter::Counter, - family::Family, - histogram::{Histogram, exponential_buckets}, +use prometheus_client::{ + collector::Collector, + encoding::{DescriptorEncoder, EncodeLabelSet, EncodeLabelValue, LabelValueEncoder}, + metrics::{ + MetricType, + counter::Counter, + family::Family, + histogram::{Histogram, exponential_buckets} + }, + registry::Registry }; -use prometheus_client::registry::Registry; use sqd_storage::db::{DatasetId, ReadSnapshot}; -use std::fmt::Write; -use std::sync::LazyLock; -use std::time::Duration; use tracing::error; +use crate::{query::QueryExecutorCollector, types::DBRef}; + #[derive(Copy, Clone, Hash, Debug, Default, Ord, PartialOrd, Eq, PartialEq, EncodeLabelSet)] struct DatasetLabel { - dataset: DatasetValue, + dataset: DatasetValue } #[derive(Copy, Clone, Hash, Debug, Default, Ord, PartialOrd, Eq, PartialEq)] @@ -35,7 +34,7 @@ impl EncodeLabelValue for DatasetValue { macro_rules! dataset_label { ($dataset_id:expr) => { DatasetLabel { - dataset: DatasetValue($dataset_id), + dataset: DatasetValue($dataset_id) } }; } @@ -57,27 +56,21 @@ pub static QUERY_ERROR_TOO_MANY_DATA_WAITERS: LazyLock = LazyLock::new( pub static COMPLETED_QUERIES: LazyLock = LazyLock::new(Default::default); -pub static STREAM_DURATIONS: LazyLock> = LazyLock::new(|| { - Family::new_with_constructor(|| Histogram::new(exponential_buckets(0.01, 2.0, 20))) -}); -pub static STREAM_BYTES: LazyLock> = LazyLock::new(|| { - Family::new_with_constructor(|| Histogram::new(exponential_buckets(1000., 2.0, 20))) -}); -pub static STREAM_BLOCKS: LazyLock> = LazyLock::new(|| { - Family::new_with_constructor(|| Histogram::new(exponential_buckets(1., 2.0, 30))) -}); +pub static STREAM_DURATIONS: LazyLock> = + LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(exponential_buckets(0.01, 2.0, 20)))); +pub static STREAM_BYTES: LazyLock> = + LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(exponential_buckets(1000., 2.0, 20)))); +pub static STREAM_BLOCKS: LazyLock> = + LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(exponential_buckets(1., 2.0, 30)))); pub static STREAM_CHUNKS: LazyLock> = LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(buckets(1., 20)))); -pub static STREAM_BYTES_PER_SECOND: LazyLock> = LazyLock::new(|| { - Family::new_with_constructor(|| Histogram::new(exponential_buckets(100., 3.0, 20))) -}); -pub static STREAM_BLOCKS_PER_SECOND: LazyLock> = LazyLock::new(|| { - Family::new_with_constructor(|| Histogram::new(exponential_buckets(1., 3.0, 20))) -}); - -pub static QUERIED_BLOCKS: LazyLock> = LazyLock::new(|| { - Family::new_with_constructor(|| Histogram::new(exponential_buckets(1., 2.0, 30))) -}); +pub static STREAM_BYTES_PER_SECOND: LazyLock> = + LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(exponential_buckets(100., 3.0, 20)))); +pub static STREAM_BLOCKS_PER_SECOND: LazyLock> = + LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(exponential_buckets(1., 3.0, 20)))); + +pub static QUERIED_BLOCKS: LazyLock> = + LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(exponential_buckets(1., 2.0, 30)))); pub static QUERIED_CHUNKS: LazyLock> = LazyLock::new(|| Family::new_with_constructor(|| Histogram::new(buckets(1., 20)))); @@ -91,15 +84,13 @@ pub fn report_query_too_many_data_waiters_error() { pub fn report_http_response(labels: &Vec<(&'static str, String)>, to_first_byte: Duration) { HTTP_STATUS.get_or_create(&labels).inc(); - HTTP_TTFB - .get_or_create(&labels) - .observe(to_first_byte.as_secs_f64()); + HTTP_TTFB.get_or_create(&labels).observe(to_first_byte.as_secs_f64()); } #[derive(Debug)] pub struct DatasetMetricsCollector { pub db: DBRef, - pub datasets: Vec, + pub datasets: Vec } impl Collector for DatasetMetricsCollector { @@ -130,7 +121,7 @@ impl Collector for DatasetMetricsCollector { fn collect_dataset_metrics( encoder: &mut DescriptorEncoder, db: &ReadSnapshot, - dataset_id: DatasetId, + dataset_id: DatasetId ) -> anyhow::Result<()> { let Some(label) = db.get_label(dataset_id)? else { return Ok(()); @@ -145,22 +136,12 @@ fn collect_dataset_metrics( }; encoder - .encode_descriptor( - "hotblocks_first_block", - "First block", - None, - MetricType::Gauge, - )? + .encode_descriptor("hotblocks_first_block", "First block", None, MetricType::Gauge)? .encode_family(&dataset_label!(dataset_id))? .encode_gauge(&first_chunk.first_block())?; encoder - .encode_descriptor( - "hotblocks_last_block", - "Last block", - None, - MetricType::Gauge, - )? + .encode_descriptor("hotblocks_last_block", "Last block", None, MetricType::Gauge)? .encode_family(&dataset_label!(dataset_id))? .encode_gauge(&last_chunk.last_block())?; @@ -169,7 +150,7 @@ fn collect_dataset_metrics( "hotblocks_last_block_timestamp_ms", "Timestamp of the last block", None, - MetricType::Gauge, + MetricType::Gauge )? .encode_family(&dataset_label!(dataset_id))? .encode_gauge(&last_chunk.last_block_time().unwrap_or(0))?; @@ -179,7 +160,7 @@ fn collect_dataset_metrics( "hotblocks_last_finalized_block", "Last finalized block", None, - MetricType::Gauge, + MetricType::Gauge )? .encode_family(&dataset_label!(dataset_id))? .encode_gauge(&label.finalized_head().map_or(0, |h| h.number))?; @@ -194,7 +175,7 @@ pub fn build_metrics_registry() -> Registry { registry.register( "query_error_too_many_tasks", "Number of query tasks rejected due to task queue overflow", - QUERY_ERROR_TOO_MANY_TASKS.clone(), + QUERY_ERROR_TOO_MANY_TASKS.clone() ); registry.register( @@ -203,61 +184,45 @@ pub fn build_metrics_registry() -> Registry { QUERY_ERROR_TOO_MANY_DATA_WAITERS.clone() ); - registry.register( - "http_status", - "Number of sent HTTP responses", - HTTP_STATUS.clone(), - ); + registry.register("http_status", "Number of sent HTTP responses", HTTP_STATUS.clone()); registry.register( "http_seconds_to_first_byte", "Time to first byte of HTTP responses", - HTTP_TTFB.clone(), + HTTP_TTFB.clone() ); - registry.register( - "stream_bytes", - "Number of bytes per stream", - STREAM_BYTES.clone(), - ); - registry.register( - "stream_blocks", - "Number of blocks per stream", - STREAM_BLOCKS.clone(), - ); - registry.register( - "stream_chunks", - "Number of chunks per stream", - STREAM_CHUNKS.clone(), - ); + registry.register("stream_bytes", "Number of bytes per stream", STREAM_BYTES.clone()); + registry.register("stream_blocks", "Number of blocks per stream", STREAM_BLOCKS.clone()); + registry.register("stream_chunks", "Number of chunks per stream", STREAM_CHUNKS.clone()); registry.register( "stream_bytes_per_second", "Completed streams bandwidth", - STREAM_BYTES_PER_SECOND.clone(), + STREAM_BYTES_PER_SECOND.clone() ); registry.register( "stream_blocks_per_second", "Completed streams speed in blocks", - STREAM_BLOCKS_PER_SECOND.clone(), + STREAM_BLOCKS_PER_SECOND.clone() ); registry.register( "stream_duration_seconds", "Durations of completed streams", - STREAM_DURATIONS.clone(), + STREAM_DURATIONS.clone() ); registry.register( "queried_blocks", "Number of blocks per running query", - QUERIED_BLOCKS.clone(), + QUERIED_BLOCKS.clone() ); registry.register( "queried_chunks", "Number of chunks per running query", - QUERIED_CHUNKS.clone(), + QUERIED_CHUNKS.clone() ); registry.register( "completed_queries", "Number of completed queries", - COMPLETED_QUERIES.clone(), + COMPLETED_QUERIES.clone() ); top_registry @@ -272,7 +237,7 @@ impl Collector for QueryExecutorCollector { "hotblocks_active_queries", "Number of currently active queries", None, - MetricType::Gauge, + MetricType::Gauge )? .encode_gauge(&active_queries)?; Ok(()) diff --git a/crates/hotblocks/src/query/executor.rs b/crates/hotblocks/src/query/executor.rs index d4d534db..b8cff53c 100644 --- a/crates/hotblocks/src/query/executor.rs +++ b/crates/hotblocks/src/query/executor.rs @@ -1,6 +1,9 @@ +use std::sync::{ + Arc, + atomic::{AtomicUsize, Ordering} +}; + use crate::metrics::{COMPLETED_QUERIES, report_query_too_many_tasks_error}; -use std::sync::Arc; -use std::sync::atomic::{AtomicUsize, Ordering}; #[derive(Clone)] pub struct QueryExecutor { @@ -8,7 +11,7 @@ pub struct QueryExecutor { in_flight: Arc, // limit for concurrent queries max_pending_tasks: usize, - urgency: usize, + urgency: usize } impl QueryExecutor { @@ -16,7 +19,7 @@ impl QueryExecutor { Self { in_flight: Arc::new(AtomicUsize::new(0)), max_pending_tasks, - urgency, + urgency } } @@ -25,7 +28,7 @@ impl QueryExecutor { if active_queries < self.max_pending_tasks { Some(QuerySlot { in_flight: self.in_flight.clone(), - urgency: self.urgency, + urgency: self.urgency }) } else { self.in_flight.fetch_sub(1, Ordering::SeqCst); @@ -41,7 +44,7 @@ impl QueryExecutor { pub struct QuerySlot { in_flight: Arc, - urgency: usize, + urgency: usize } impl Drop for QuerySlot { @@ -64,7 +67,7 @@ impl QuerySlot { pub async fn run(self, task: F) -> R where F: FnOnce(&Self) -> R + Send + 'static, - R: Send + 'static, + R: Send + 'static { let (tx, rx) = tokio::sync::oneshot::channel(); @@ -80,7 +83,7 @@ impl QuerySlot { #[derive(Debug)] pub struct QueryExecutorCollector { - in_flight: Arc, + in_flight: Arc } impl QueryExecutorCollector { diff --git a/crates/hotblocks/src/query/response.rs b/crates/hotblocks/src/query/response.rs index 1ce9c8e6..92ae4804 100644 --- a/crates/hotblocks/src/query/response.rs +++ b/crates/hotblocks/src/query/response.rs @@ -1,19 +1,22 @@ -use super::executor::{QueryExecutor, QuerySlot}; -use super::running::{RunningQuery, RunningQueryStats}; -use crate::errors::Busy; -use crate::metrics::{ - STREAM_BLOCKS, STREAM_BLOCKS_PER_SECOND, STREAM_BYTES, STREAM_BYTES_PER_SECOND, STREAM_CHUNKS, - STREAM_DURATIONS, -}; -use crate::types::ClientId; -use crate::types::DBRef; +use std::time::{Duration, Instant}; + use anyhow::bail; use bytes::Bytes; use sqd_primitives::BlockRef; use sqd_query::Query; use sqd_storage::db::DatasetId; -use std::time::Duration; -use std::time::Instant; + +use super::{ + executor::{QueryExecutor, QuerySlot}, + running::{RunningQuery, RunningQueryStats} +}; +use crate::{ + errors::Busy, + metrics::{ + STREAM_BLOCKS, STREAM_BLOCKS_PER_SECOND, STREAM_BYTES, STREAM_BYTES_PER_SECOND, STREAM_CHUNKS, STREAM_DURATIONS + }, + types::{ClientId, DBRef} +}; const DEFAULT_QUERY_LIMIT: Duration = Duration::from_secs(10); @@ -24,14 +27,14 @@ pub struct QueryResponse { dataset_id: DatasetId, client_id: ClientId, stats: QueryStreamStats, - time_limit: Duration, + time_limit: Duration } pub struct QueryStreamStats { response_chunks: u64, response_blocks: u64, response_bytes: u64, - start_time: Instant, + start_time: Instant } impl QueryStreamStats { @@ -40,18 +43,14 @@ impl QueryStreamStats { response_chunks: 0, response_blocks: 0, response_bytes: 0, - start_time: Instant::now(), + start_time: Instant::now() } } pub fn add_running_stats(&mut self, running_stats: &RunningQueryStats) { // We only count chunks/blocks that were actually written in the buffer - self.response_chunks = self - .response_chunks - .saturating_add(running_stats.chunks_returned); - self.response_blocks = self - .response_blocks - .saturating_add(running_stats.blocks_returned); + self.response_chunks = self.response_chunks.saturating_add(running_stats.chunks_returned); + self.response_blocks = self.response_blocks.saturating_add(running_stats.blocks_returned); } fn report_metrics(&self, dataset_id: &DatasetId, client_id: &ClientId) { @@ -70,9 +69,7 @@ impl QueryStreamStats { STREAM_BLOCKS.get_or_create(&labels).observe(blocks); STREAM_CHUNKS.get_or_create(&labels).observe(chunks); if duration > 0.0 { - STREAM_BYTES_PER_SECOND - .get_or_create(&labels) - .observe(bytes / duration); + STREAM_BYTES_PER_SECOND.get_or_create(&labels).observe(bytes / duration); STREAM_BLOCKS_PER_SECOND .get_or_create(&labels) .observe(blocks / duration); @@ -88,17 +85,14 @@ impl QueryResponse { query: Query, only_finalized: bool, time_limit: Option, - client_id: ClientId, + client_id: ClientId ) -> anyhow::Result { - let Some(slot) = executor.get_slot() else { - bail!(Busy) - }; + let Some(slot) = executor.get_slot() else { bail!(Busy) }; let stats = QueryStreamStats::new(); let mut runner = slot .run(move |slot| -> anyhow::Result<_> { - let mut runner = - RunningQuery::new(db, dataset_id, &query, only_finalized).map(Box::new)?; + let mut runner = RunningQuery::new(db, dataset_id, &query, only_finalized).map(Box::new)?; next_run(&mut runner, slot)?; Ok(runner) }) @@ -112,7 +106,7 @@ impl QueryResponse { stats, dataset_id, client_id, - time_limit, + time_limit }; Ok(response) @@ -138,8 +132,7 @@ impl QueryResponse { if runner.buffered_bytes() > 0 { let bytes = runner.take_buffered_bytes(); - self.stats.response_bytes = - self.stats.response_bytes.saturating_add(bytes.len() as u64); + self.stats.response_bytes = self.stats.response_bytes.saturating_add(bytes.len() as u64); self.runner = Some(runner); return Ok(Some(bytes)); } @@ -164,8 +157,7 @@ impl QueryResponse { if runner.has_next_chunk() { let bytes = runner.take_buffered_bytes(); - self.stats.response_bytes = - self.stats.response_bytes.saturating_add(bytes.len() as u64); + self.stats.response_bytes = self.stats.response_bytes.saturating_add(bytes.len() as u64); self.runner = Some(runner); Ok(Some(bytes)) } else { diff --git a/crates/hotblocks/src/query/running.rs b/crates/hotblocks/src/query/running.rs index 3914258b..ab06b030 100644 --- a/crates/hotblocks/src/query/running.rs +++ b/crates/hotblocks/src/query/running.rs @@ -1,20 +1,22 @@ -use crate::errors::{BlockItemIsNotAvailable, QueryKindMismatch}; -use crate::errors::{BlockRangeMissing, QueryIsAboveTheHead}; -use crate::metrics::{QUERIED_BLOCKS, QUERIED_CHUNKS}; -use crate::query::static_snapshot::{StaticChunkIterator, StaticChunkReader, StaticSnapshot}; -use crate::types::{ClientId, DBRef, DatasetKind}; +use std::io::Write; + use anyhow::{anyhow, bail, ensure}; use bytes::{BufMut, Bytes, BytesMut}; -use flate2::Compression; -use flate2::write::GzEncoder; +use flate2::{Compression, write::GzEncoder}; use sqd_primitives::{BlockNumber, BlockRef}; use sqd_query::{JsonLinesWriter, Plan, Query}; use sqd_storage::db::{Chunk as StorageChunk, DatasetId}; -use std::io::Write; + +use crate::{ + errors::{BlockItemIsNotAvailable, BlockRangeMissing, QueryIsAboveTheHead, QueryKindMismatch}, + metrics::{QUERIED_BLOCKS, QUERIED_CHUNKS}, + query::static_snapshot::{StaticChunkIterator, StaticChunkReader, StaticSnapshot}, + types::{ClientId, DBRef, DatasetKind} +}; struct LeftOver { chunk: StaticChunkReader, - next_block: BlockNumber, + next_block: BlockNumber } pub struct RunningQueryStats { @@ -40,12 +42,8 @@ impl RunningQueryStats { ("dataset_id", dataset_id.as_str().to_owned()), ]; - QUERIED_BLOCKS - .get_or_create(&labels) - .observe(self.blocks_read as f64); - QUERIED_CHUNKS - .get_or_create(&labels) - .observe(self.chunks_read as f64); + QUERIED_BLOCKS.get_or_create(&labels).observe(self.blocks_read as f64); + QUERIED_CHUNKS.get_or_create(&labels).observe(self.chunks_read as f64); // blocks_returned and chunks_returned are reported by the streaming part } @@ -59,16 +57,11 @@ pub struct RunningQuery { chunk_iterator: StaticChunkIterator, finalized_head: Option, buf: GzEncoder>, - stats: RunningQueryStats, + stats: RunningQueryStats } impl RunningQuery { - pub fn new( - db: DBRef, - dataset_id: DatasetId, - query: &Query, - only_finalized: bool, - ) -> anyhow::Result { + pub fn new(db: DBRef, dataset_id: DatasetId, query: &Query, only_finalized: bool) -> anyhow::Result { let snapshot = StaticSnapshot::new(db); let finalized_head = match snapshot.get_label(dataset_id)? { @@ -86,14 +79,11 @@ impl RunningQuery { } }; - let mut chunk_iterator = - StaticChunkIterator::new(snapshot, dataset_id, query.first_block(), None); + let mut chunk_iterator = StaticChunkIterator::new(snapshot, dataset_id, query.first_block(), None); let mut stats = RunningQueryStats::new(); let Some(first_chunk) = chunk_iterator.next().transpose()? else { - bail!(QueryIsAboveTheHead { - finalized_head: None - }) + bail!(QueryIsAboveTheHead { finalized_head: None }) }; stats.chunks_read += 1; stats.blocks_read += first_chunk.last_block() - first_chunk.first_block() + 1; @@ -150,7 +140,7 @@ impl RunningQuery { chunk_iterator, finalized_head, buf: GzEncoder::new(BytesMut::new().writer(), Compression::fast()), - stats, + stats }) } @@ -192,35 +182,27 @@ impl RunningQuery { (left_over.chunk, false) } else { let storage_chunk = self.next_chunk()?; - let chunk = self - .chunk_iterator - .snapshot() - .create_chunk_reader(storage_chunk); + let chunk = self.chunk_iterator.snapshot().create_chunk_reader(storage_chunk); (chunk, true) }; - if self - .last_block - .map_or(false, |end| end < chunk.last_block()) - { + if self.last_block.map_or(false, |end| end < chunk.last_block()) { let last_block = self.last_block; self.plan.set_last_block(last_block); } else { self.plan.set_last_block(None); } - let query_result = chunk - .with_reader(|reader| self.plan.execute(reader)) - .map_err(|err| { - if let Some(err) = err.downcast_ref::() { - return anyhow!(BlockItemIsNotAvailable { - item_name: err.table_name, - first_block: chunk.first_block(), - last_block: chunk.last_block() - }); - } - err - }); + let query_result = chunk.with_reader(|reader| self.plan.execute(reader)).map_err(|err| { + if let Some(err) = err.downcast_ref::() { + return anyhow!(BlockItemIsNotAvailable { + item_name: err.table_name, + first_block: chunk.first_block(), + last_block: chunk.last_block() + }); + } + err + }); // no matter what, we are moving to the next chunk self.plan.set_first_block(None); @@ -234,15 +216,13 @@ impl RunningQuery { self.stats.chunks_returned += 1; } self.stats.blocks_returned += block_writer.num_blocks() as u64; - + if chunk.last_block() > block_writer.last_block() - && self - .last_block - .map_or(true, |end| end > block_writer.last_block()) + && self.last_block.map_or(true, |end| end > block_writer.last_block()) { self.left_over = Some(LeftOver { chunk, - next_block: block_writer.last_block() + 1, + next_block: block_writer.last_block() + 1 }) } @@ -252,9 +232,7 @@ impl RunningQuery { .write_blocks(&mut block_writer) .expect("IO errors are not possible"); - json_lines_writer - .finish() - .expect("IO errors are not possible"); + json_lines_writer.finish().expect("IO errors are not possible"); self.buf.flush().expect("IO errors are not possible"); @@ -274,11 +252,9 @@ impl RunningQuery { let next_chunk = maybe_next_chunk?; self.stats.chunks_read += 1; self.stats.blocks_read += chunk.last_block() - chunk.first_block() + 1; - + let is_continuous = chunk.last_block() + 1 == next_chunk.first_block(); - let is_requested = self - .last_block - .map_or(true, |end| next_chunk.first_block() <= end); + let is_requested = self.last_block.map_or(true, |end| next_chunk.first_block() <= end); if is_continuous && is_requested { Some(next_chunk) } else { diff --git a/crates/hotblocks/src/query/service.rs b/crates/hotblocks/src/query/service.rs index a32a99a3..8edd1bad 100644 --- a/crates/hotblocks/src/query/service.rs +++ b/crates/hotblocks/src/query/service.rs @@ -1,27 +1,31 @@ -use super::executor::QueryExecutor; -use super::response::QueryResponse; -use crate::dataset_controller::DatasetController; -use crate::errors::{Busy, QueryIsAboveTheHead, QueryKindMismatch}; -use crate::types::{ClientId, DBRef, DatasetKind}; -use crate::query::QueryExecutorCollector; +use std::{ + sync::{ + Arc, + atomic::{AtomicUsize, Ordering} + }, + time::Duration +}; + use anyhow::{bail, ensure}; use sqd_query::Query; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use std::time::Duration; +use super::{executor::QueryExecutor, response::QueryResponse}; +use crate::{ + dataset_controller::DatasetController, + errors::{Busy, QueryIsAboveTheHead, QueryKindMismatch}, + query::QueryExecutorCollector, + types::{ClientId, DBRef, DatasetKind} +}; pub type QueryServiceRef = Arc; - pub struct QueryServiceBuilder { db: DBRef, max_data_waiters: usize, max_pending_tasks: usize, - urgency: usize, + urgency: usize } - impl QueryServiceBuilder { pub fn new(db: DBRef) -> Self { Self { @@ -65,14 +69,12 @@ impl QueryServiceBuilder { } } - pub struct QueryService { db: DBRef, executor: QueryExecutor, wait_slots: WaitSlots } - impl QueryService { pub fn builder(db: DBRef) -> QueryServiceBuilder { QueryServiceBuilder::new(db) @@ -82,7 +84,7 @@ impl QueryService { &self, dataset: &DatasetController, query: Query, - client_id: ClientId, + client_id: ClientId ) -> anyhow::Result { self.query_internal(dataset, query, false, client_id).await } @@ -91,7 +93,7 @@ impl QueryService { &self, dataset: &DatasetController, query: Query, - client_id: ClientId, + client_id: ClientId ) -> anyhow::Result { self.query_internal(dataset, query, true, client_id).await } @@ -101,7 +103,7 @@ impl QueryService { dataset: &DatasetController, query: Query, finalized: bool, - client_id: ClientId, + client_id: ClientId ) -> anyhow::Result { ensure!( dataset.dataset_kind() == DatasetKind::from_query(&query), @@ -130,7 +132,7 @@ impl QueryService { ); } true - }, + } Some(_) | None => true }; @@ -146,9 +148,7 @@ impl QueryService { } }) .await - .map_err(|_| QueryIsAboveTheHead { - finalized_head: None - })?; + .map_err(|_| QueryIsAboveTheHead { finalized_head: None })?; } QueryResponse::new( @@ -158,8 +158,9 @@ impl QueryService { query, finalized, None, - client_id, - ).await + client_id + ) + .await } pub fn metrics_collector(&self) -> QueryExecutorCollector { @@ -167,19 +168,15 @@ impl QueryService { } } - struct WaitSlots { waiters: AtomicUsize, limit: usize } - impl WaitSlots { fn get(&self) -> Option> { let previously_waiting = self.waiters.fetch_add(1, Ordering::SeqCst); - let slot = WaitingSlot { - waiters: &self.waiters - }; + let slot = WaitingSlot { waiters: &self.waiters }; if previously_waiting < self.limit { Some(slot) } else { @@ -189,12 +186,10 @@ impl WaitSlots { } } - struct WaitingSlot<'a> { waiters: &'a AtomicUsize } - impl<'a> Drop for WaitingSlot<'a> { fn drop(&mut self) { self.waiters.fetch_sub(1, Ordering::SeqCst); diff --git a/crates/hotblocks/src/query/static_snapshot.rs b/crates/hotblocks/src/query/static_snapshot.rs index f79eeccc..e7e16740 100644 --- a/crates/hotblocks/src/query/static_snapshot.rs +++ b/crates/hotblocks/src/query/static_snapshot.rs @@ -1,34 +1,32 @@ -use crate::types::DBRef; +use std::sync::Arc; + use ouroboros::self_referencing; use sqd_primitives::BlockNumber; use sqd_storage::db::{Chunk, ChunkReader, DatasetId, DatasetLabel, ReadSnapshot, ReadSnapshotChunkIterator}; -use std::sync::Arc; +use crate::types::DBRef; #[self_referencing] struct StaticSnapshotInner { db: DBRef, #[borrows(db)] #[covariant] - snapshot: ReadSnapshot<'this>, + snapshot: ReadSnapshot<'this> } - #[derive(Clone)] pub struct StaticSnapshot { inner: Arc } - impl StaticSnapshot { pub fn new(db: DBRef) -> Self { let inner = StaticSnapshotInnerBuilder { db, snapshot_builder: |db: &DBRef| db.snapshot() - }.build(); - Self { - inner: Arc::new(inner) } + .build(); + Self { inner: Arc::new(inner) } } pub fn snapshot(&self) -> &ReadSnapshot<'_> { @@ -44,7 +42,6 @@ impl StaticSnapshot { } } - #[self_referencing] struct StaticChunkIteratorInner { snapshot: StaticSnapshot, @@ -53,27 +50,23 @@ struct StaticChunkIteratorInner { iter: ReadSnapshotChunkIterator<'this> } - pub struct StaticChunkIterator { inner: StaticChunkIteratorInner } - impl StaticChunkIterator { pub fn new( snapshot: StaticSnapshot, dataset_id: DatasetId, from_block: BlockNumber, to_block: Option - ) -> Self - { + ) -> Self { let inner = StaticChunkIteratorInnerBuilder { snapshot, iter_builder: |snapshot| snapshot.snapshot().list_chunks(dataset_id, from_block, to_block) - }.build(); - Self { - inner } + .build(); + Self { inner } } pub fn snapshot(&self) -> &StaticSnapshot { @@ -81,7 +74,6 @@ impl StaticChunkIterator { } } - impl Iterator for StaticChunkIterator { type Item = anyhow::Result; @@ -90,7 +82,6 @@ impl Iterator for StaticChunkIterator { } } - #[self_referencing] struct StaticChunkReaderInner { snapshot: StaticSnapshot, @@ -99,22 +90,19 @@ struct StaticChunkReaderInner { reader: ChunkReader<'this> } - #[derive(Clone)] pub struct StaticChunkReader { inner: Arc } - impl StaticChunkReader { pub fn new(snapshot: StaticSnapshot, chunk: Chunk) -> Self { let inner = StaticChunkReaderInnerBuilder { snapshot, reader_builder: |snapshot| snapshot.snapshot().create_chunk_reader(chunk) - }.build(); - Self { - inner: Arc::new(inner) } + .build(); + Self { inner: Arc::new(inner) } } pub fn with_reader(&self, cb: F) -> R @@ -131,4 +119,4 @@ impl StaticChunkReader { pub fn last_block(&self) -> BlockNumber { self.inner.with_reader(|r| r.last_block()) } -} \ No newline at end of file +} diff --git a/crates/hotblocks/src/types.rs b/crates/hotblocks/src/types.rs index 2e7b89ef..ea0eac95 100644 --- a/crates/hotblocks/src/types.rs +++ b/crates/hotblocks/src/types.rs @@ -1,13 +1,12 @@ +use std::sync::Arc; + use serde::{Deserialize, Serialize}; use sqd_dataset::DatasetDescriptionRef; use sqd_query::{BlockNumber, Query}; use sqd_storage::db::Database; -use std::sync::Arc; - pub type DBRef = Arc; - #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum DatasetKind { #[serde(rename = "evm")] @@ -24,7 +23,6 @@ pub enum DatasetKind { Tron } - impl DatasetKind { pub fn storage_kind(&self) -> sqd_storage::db::DatasetKind { sqd_storage::db::DatasetKind::from_str(self.as_str()) @@ -37,21 +35,25 @@ impl DatasetKind { DatasetKind::Bitcoin => "bitcoin", DatasetKind::HyperliquidFills => "hl-fills", DatasetKind::HyperliquidReplicaCmds => "hl-replica-cmds", - DatasetKind::Tron => "tron", + DatasetKind::Tron => "tron" } } - + pub fn dataset_description(&self) -> DatasetDescriptionRef { match self { DatasetKind::Evm => sqd_data::evm::tables::EvmChunkBuilder::dataset_description(), DatasetKind::Solana => sqd_data::solana::tables::SolanaChunkBuilder::dataset_description(), DatasetKind::Bitcoin => sqd_data::bitcoin::tables::BitcoinChunkBuilder::dataset_description(), - DatasetKind::HyperliquidFills => sqd_data::hyperliquid_fills::tables::HyperliquidFillsChunkBuilder::dataset_description(), - DatasetKind::HyperliquidReplicaCmds => sqd_data::hyperliquid_replica_cmds::tables::HyperliquidReplicaCmdsChunkBuilder::dataset_description(), - DatasetKind::Tron => sqd_data::tron::tables::TronChunkBuilder::dataset_description(), + DatasetKind::HyperliquidFills => { + sqd_data::hyperliquid_fills::tables::HyperliquidFillsChunkBuilder::dataset_description() + } + DatasetKind::HyperliquidReplicaCmds => { + sqd_data::hyperliquid_replica_cmds::tables::HyperliquidReplicaCmdsChunkBuilder::dataset_description() + } + DatasetKind::Tron => sqd_data::tron::tables::TronChunkBuilder::dataset_description() } } - + pub fn from_query(query: &Query) -> Self { match query { Query::Eth(_) => Self::Evm, @@ -65,7 +67,6 @@ impl DatasetKind { } } - #[derive(Debug, Clone, Serialize, Deserialize)] pub enum RetentionStrategy { FromBlock { @@ -76,7 +77,6 @@ pub enum RetentionStrategy { None } - #[derive(Clone, Debug)] pub struct ClientId(String); diff --git a/crates/polars/src/arrow.rs b/crates/polars/src/arrow.rs index b9e16f49..5536c3bf 100644 --- a/crates/polars/src/arrow.rs +++ b/crates/polars/src/arrow.rs @@ -3,29 +3,20 @@ use polars::prelude::{DataFrame, IntoLazy, LazyFrame, Series, UnionArgs}; use polars_arrow::array::to_data; use polars_core::prelude::{BooleanChunked, CompatLevel, SortMultipleOptions}; - pub fn array_series(name: &str, arr: &dyn Array) -> anyhow::Result { - let s = Series::from_arrow( - name.into(), - Box::::from(arr), - )?; + let s = Series::from_arrow(name.into(), Box::::from(arr))?; Ok(s) } - pub fn record_batch_to_polars_df(batch: &RecordBatch) -> anyhow::Result { let schema = batch.schema(); let mut columns = Vec::with_capacity(batch.num_columns()); for (i, column) in batch.columns().iter().enumerate() { - columns.push(array_series( - schema.fields().get(i).unwrap().name(), - column - )?); + columns.push(array_series(schema.fields().get(i).unwrap().name(), column)?); } Ok(DataFrame::from_iter(columns)) } - pub fn record_batch_vec_to_lazy_polars_df(batch_vec: &[RecordBatch]) -> anyhow::Result { Ok(match batch_vec.len() { 0 => DataFrame::empty().lazy(), @@ -33,27 +24,25 @@ pub fn record_batch_vec_to_lazy_polars_df(batch_vec: &[RecordBatch]) -> anyhow:: let b = &batch_vec[0]; let df = record_batch_to_polars_df(b)?; df.lazy() - }, + } _ => { - let batches = batch_vec.iter().map(|record_batch| { - let df = record_batch_to_polars_df(record_batch)?; - Ok(df.lazy()) - }).collect::>>()?; + let batches = batch_vec + .iter() + .map(|record_batch| { + let df = record_batch_to_polars_df(record_batch)?; + Ok(df.lazy()) + }) + .collect::>>()?; - polars::prelude::concat( - batches.as_slice(), - UnionArgs::default() - )? + polars::prelude::concat(batches.as_slice(), UnionArgs::default())? } }) } - pub fn polars_series_to_row_index_iter(series: &Series) -> impl Iterator + '_ { series.u32().unwrap().into_no_null_iter() } - pub fn polars_series_to_arrow_array(series: &Series) -> ArrayRef { let series = series.rechunk(); assert_eq!(series.chunks().len(), 1); @@ -61,7 +50,6 @@ pub fn polars_series_to_arrow_array(series: &Series) -> ArrayRef { ArrayRef::from(polars_array) } - pub fn polars_boolean_to_arrow_boolean(values: &BooleanChunked) -> BooleanArray { let chunks = values.chunks(); assert_eq!(chunks.len(), 1); @@ -70,27 +58,27 @@ pub fn polars_boolean_to_arrow_boolean(values: &BooleanChunked) -> BooleanArray BooleanArray::from(array_data) } - pub fn sort_record_batch(record_batch: &RecordBatch, by: Vec) -> anyhow::Result { let df = record_batch_to_polars_df(record_batch)?; - - let sorted_df = df.sort( - by, - SortMultipleOptions::default().with_multithreaded(false) - )?; - + + let sorted_df = df.sort(by, SortMultipleOptions::default().with_multithreaded(false))?; + let schema = record_batch.schema(); - - let columns: Vec = sorted_df.iter().enumerate().map(|(i, s)| { - let array = polars_series_to_arrow_array(s); - if array.data_type() == schema.field(i).data_type() { - array - } else { - arrow::compute::cast(&array, schema.field(i).data_type()).unwrap() - } - }).collect(); - + + let columns: Vec = sorted_df + .iter() + .enumerate() + .map(|(i, s)| { + let array = polars_series_to_arrow_array(s); + if array.data_type() == schema.field(i).data_type() { + array + } else { + arrow::compute::cast(&array, schema.field(i).data_type()).unwrap() + } + }) + .collect(); + let sorted_batch = RecordBatch::try_new(schema, columns)?; - + Ok(sorted_batch) -} \ No newline at end of file +} diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 6cd60385..4281a957 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -1,10 +1,8 @@ -pub mod prelude; pub mod arrow; - +pub mod prelude; pub use polars_core::POOL; - /// Safety: Call it in the main at the very beginning. /// See https://doc.rust-lang.org/std/env/fn.set_var.html for more details pub unsafe fn set_polars_thread_pool_size(n_threads: usize) { diff --git a/crates/polars/src/prelude.rs b/crates/polars/src/prelude.rs index 6703623b..2bac2b00 100644 --- a/crates/polars/src/prelude.rs +++ b/crates/polars/src/prelude.rs @@ -1,2 +1 @@ - -pub use polars::prelude::*; \ No newline at end of file +pub use polars::prelude::*; diff --git a/crates/primitives/src/lib.rs b/crates/primitives/src/lib.rs index 33a6522f..bcf41694 100644 --- a/crates/primitives/src/lib.rs +++ b/crates/primitives/src/lib.rs @@ -4,4 +4,4 @@ pub mod range; pub mod sid; mod types; -pub use types::*; \ No newline at end of file +pub use types::*; diff --git a/crates/primitives/src/range/arith.rs b/crates/primitives/src/range/arith.rs index db27b114..3b5d9ed3 100644 --- a/crates/primitives/src/range/arith.rs +++ b/crates/primitives/src/range/arith.rs @@ -1,79 +1,68 @@ -use std::cmp::{max, min, Ordering}; -use std::ops::Range; +use std::{ + cmp::{max, min, Ordering}, + ops::Range +}; - -pub fn seal(ranges: L) -> impl Iterator> +pub fn seal(ranges: L) -> impl Iterator> where I: Ord, - L: IntoIterator>, + L: IntoIterator> { let mut list = ranges.into_iter().peekable(); - std::iter::from_fn(move || { - loop { - match (list.next(), list.peek_mut()) { - (Some(head), Some(next)) => { - match head.end.cmp(&next.start) { - Ordering::Less => { - return Some(head) - } - Ordering::Equal => { - next.start = head.start - } - Ordering::Greater => { - panic!("unordered or intersecting ranges found in range list") - } - } + std::iter::from_fn(move || loop { + match (list.next(), list.peek_mut()) { + (Some(head), Some(next)) => match head.end.cmp(&next.start) { + Ordering::Less => return Some(head), + Ordering::Equal => next.start = head.start, + Ordering::Greater => { + panic!("unordered or intersecting ranges found in range list") } - (head, _) => return head - } + }, + (head, _) => return head } }) } - -pub fn intersection(a: L1, b: L2) -> impl Iterator> +pub fn intersection(a: L1, b: L2) -> impl Iterator> where - L1: IntoIterator>, - L2: IntoIterator>, - I: Ord + Clone, + L1: IntoIterator>, + L2: IntoIterator>, + I: Ord + Clone { let mut list1 = a.into_iter().peekable(); let mut list2 = b.into_iter().peekable(); - std::iter::from_fn(move || { - loop { - match (list1.peek().cloned(), list2.peek().cloned()) { - (Some(h1), Some(h2)) => { - let start = max(h1.start, h2.start); - let end = match h1.end.cmp(&h2.end) { - Ordering::Less => { - list1.next(); - h1.end - } - Ordering::Equal => { - list1.next(); - list2.next(); - h1.end - } - Ordering::Greater => { - list2.next(); - h2.end - } - }; - if start < end { - return Some(start..end) + std::iter::from_fn(move || loop { + match (list1.peek().cloned(), list2.peek().cloned()) { + (Some(h1), Some(h2)) => { + let start = max(h1.start, h2.start); + let end = match h1.end.cmp(&h2.end) { + Ordering::Less => { + list1.next(); + h1.end } - }, - (_, None) | (None, _) => return None + Ordering::Equal => { + list1.next(); + list2.next(); + h1.end + } + Ordering::Greater => { + list2.next(); + h2.end + } + }; + if start < end { + return Some(start..end); + } } + (_, None) | (None, _) => return None } }) } - -pub fn union(a: L1, b: L2) -> impl Iterator> +pub fn union(a: L1, b: L2) -> impl Iterator> where - L1: IntoIterator>, - L2: IntoIterator>, + L1: IntoIterator>, + L2: IntoIterator>, I: Ord + Copy { let mut list1 = a.into_iter().peekable(); @@ -94,9 +83,9 @@ where list2.peek_mut().unwrap().start = end; } Some(start..end) - }, + } (_, None) => list1.next(), (None, _) => list2.next() } })) -} \ No newline at end of file +} diff --git a/crates/primitives/src/range/mod.rs b/crates/primitives/src/range/mod.rs index 6de8174b..b14d45f2 100644 --- a/crates/primitives/src/range/mod.rs +++ b/crates/primitives/src/range/mod.rs @@ -1,50 +1,46 @@ -use std::cmp::{max, min, Ordering}; -use std::fmt::Debug; -use std::ops::{Range, Sub}; - -use crate::ItemIndex; -use crate::range::arith::{intersection, seal, union}; +use std::{ + cmp::{max, min, Ordering}, + fmt::Debug, + ops::{Range, Sub} +}; +use crate::{ + range::arith::{intersection, seal, union}, + ItemIndex +}; mod arith; - #[derive(Debug)] pub struct RangeList { ranges: Vec> } - -impl TryFrom>> for RangeList { +impl TryFrom>> for RangeList { type Error = &'static str; fn try_from(ranges: Vec>) -> Result { if ranges.iter().any(|r| r.is_empty()) { - return Err("range list can only contain non-empty ranges") + return Err("range list can only contain non-empty ranges"); } for i in 1..ranges.len() { let current = &ranges[i]; - let prev = &ranges[i-1]; + let prev = &ranges[i - 1]; if prev.end > current.start { - return Err("found unordered or overlapping ranges") + return Err("found unordered or overlapping ranges"); } } - Ok(Self { - ranges - }) + Ok(Self { ranges }) } } - -impl RangeList { +impl RangeList { pub fn new(ranges: Vec>) -> Self { Self::try_from(ranges).unwrap() } - + pub unsafe fn new_unchecked(ranges: Vec>) -> Self { - Self { - ranges - } + Self { ranges } } pub fn seal>>(list: I) -> Self { @@ -57,10 +53,10 @@ impl RangeList { self.ranges.len() == 0 } - pub fn iter(&self) -> impl Iterator> + '_ { + pub fn iter(&self) -> impl Iterator> + '_ { self.ranges.iter().cloned() } - + pub fn as_slice(&self) -> &[Range] { &self.ranges } @@ -76,18 +72,17 @@ impl RangeList { ranges: intersection(self.iter(), other.iter()).collect() } } - + pub fn end(&self) -> Idx { self.ranges.last().map(|r| r.end).unwrap_or_default() } - + pub fn len(&self) -> usize { self.ranges.len() } } - -impl + Debug> RangeList { +impl + Debug> RangeList { pub fn paginate<'a>(&'a self, page_offsets: &'a [Idx]) -> impl Iterator)> + 'a { let mut ranges = self.iter().peekable(); let mut i = 0; @@ -105,39 +100,43 @@ impl + Debug> RangeList { if intersection == page { i += 1; - return Some((pix, None)) + return Some((pix, None)); } - + if intersection.end > intersection.start { - included_ranges.push( - intersection.start - page.start..intersection.end - page.start - ); + included_ranges.push(intersection.start - page.start..intersection.end - page.start); } - + if intersection.end == range.end { ranges.next(); } if page.end == intersection.end { - break + break; } } else { return if included_ranges.len() > 0 { - Some((pix, Some(RangeList { - ranges: included_ranges - }))) + Some(( + pix, + Some(RangeList { + ranges: included_ranges + }) + )) } else { None - } + }; } } i += 1; if included_ranges.len() > 0 { - return Some((pix, Some(RangeList { - ranges: std::mem::take(&mut included_ranges) - }))) + return Some(( + pix, + Some(RangeList { + ranges: std::mem::take(&mut included_ranges) + }) + )); } } None @@ -145,7 +144,6 @@ impl + Debug> RangeList { } } - impl RangeList { pub fn from_sorted_indexes>(indexes: I) -> Self { let mut iter = indexes.into_iter(); @@ -159,10 +157,10 @@ impl RangeList { ranges.push(beg..end); beg = idx; end = idx + 1; - }, + } Ordering::Equal => { end = idx + 1; - }, + } Ordering::Greater => { panic!("index list was unsorted") } @@ -170,8 +168,6 @@ impl RangeList { } ranges.push(beg..end) } - Self { - ranges - } + Self { ranges } } -} \ No newline at end of file +} diff --git a/crates/primitives/src/sid.rs b/crates/primitives/src/sid.rs index 20d94fb8..42d3b154 100644 --- a/crates/primitives/src/sid.rs +++ b/crates/primitives/src/sid.rs @@ -1,25 +1,25 @@ -use std::{fmt::{Debug, Display, Formatter}, str::FromStr}; - +use std::{ + fmt::{Debug, Display, Formatter}, + str::FromStr +}; #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] pub struct SID { bytes: [u8; N] } - -impl AsRef<[u8]> for SID { +impl AsRef<[u8]> for SID { fn as_ref(&self) -> &[u8] { &self.bytes } } - -impl TryFrom<&[u8]> for SID { +impl TryFrom<&[u8]> for SID { type Error = &'static str; fn try_from(value: &[u8]) -> Result { if value.len() > N { - return Err("binary string is too long") + return Err("binary string is too long"); } let mut bytes = [0; N]; bytes[..value.len()].copy_from_slice(value); @@ -27,29 +27,26 @@ impl TryFrom<&[u8]> for SID { } } - -impl TryFrom<&str> for SID { +impl TryFrom<&str> for SID { type Error = &'static str; fn try_from(value: &str) -> Result { if value.len() > N { - return Err("string is too long") + return Err("string is too long"); } if !value.as_bytes().iter().copied().all(Self::is_valid_byte) { - return Err("only ascii alphanumeric, '-' and '_' characters are allowed in short id strings") + return Err("only ascii alphanumeric, '-' and '_' characters are allowed in short id strings"); } let mut bytes = [0; N]; bytes[..value.len()].copy_from_slice(value.as_bytes()); - Ok(Self { - bytes - }) + Ok(Self { bytes }) } } -impl FromStr for SID { +impl FromStr for SID { type Err = &'static str; fn from_str(s: &str) -> Result { @@ -57,25 +54,21 @@ impl FromStr for SID { } } - -impl Default for SID { +impl Default for SID { fn default() -> Self { - Self { - bytes: [0; N] - } + Self { bytes: [0; N] } } } - -impl SID { +impl SID { pub fn from_str(s: &str) -> Self { - s.try_into().unwrap() + s.try_into().unwrap() } - + pub fn try_new(bytes: [u8; N]) -> Result { let slice = if let Some(end) = bytes.iter().position(|b| *b == 0) { if !bytes[end..].iter().all(|b| *b == 0) { - return Err("only trailing 0 bytes are allowed in SID") + return Err("only trailing 0 bytes are allowed in SID"); } &bytes[0..end] } else { @@ -83,12 +76,10 @@ impl SID { }; if !slice.iter().copied().all(Self::is_valid_byte) { - return Err("only ascii alphanumeric, '-' and '_' characters are allowed in SID strings") + return Err("only ascii alphanumeric, '-' and '_' characters are allowed in SID strings"); } - Ok(Self { - bytes - }) + Ok(Self { bytes }) } fn is_valid_byte(b: u8) -> bool { @@ -96,72 +87,55 @@ impl SID { } pub fn as_str(&self) -> &str { - std::str::from_utf8( - if let Some(end) = self.bytes.iter().position(|b| *b == 0) { - &self.bytes[0..end] - } else { - &self.bytes - } - ).unwrap() + std::str::from_utf8(if let Some(end) = self.bytes.iter().position(|b| *b == 0) { + &self.bytes[0..end] + } else { + &self.bytes + }) + .unwrap() } } - -impl <'a, const N: usize> From<&'a SID> for &'a str { +impl<'a, const N: usize> From<&'a SID> for &'a str { fn from(val: &'a SID) -> Self { val.as_str() } } - -impl Display for SID { +impl Display for SID { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str(self.as_str()) } } - -impl Debug for SID { +impl Debug for SID { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str(self.as_str()) } } - #[cfg(feature = "borsh")] -impl borsh::BorshSerialize for SID { +impl borsh::BorshSerialize for SID { fn serialize(&self, writer: &mut W) -> std::io::Result<()> { self.bytes.serialize(writer) } } - #[cfg(feature = "borsh")] -impl borsh::BorshDeserialize for SID { +impl borsh::BorshDeserialize for SID { fn deserialize(buf: &mut &[u8]) -> std::io::Result { let bytes = <[u8; N]>::deserialize(buf)?; - Self::try_new(bytes).map_err(|err| { - std::io::Error::new( - std::io::ErrorKind::InvalidData, - err - ) - }) + Self::try_new(bytes).map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err)) } - + fn deserialize_reader(reader: &mut R) -> std::io::Result { let bytes = <[u8; N]>::deserialize_reader(reader)?; - Self::try_new(bytes).map_err(|err| { - std::io::Error::new( - std::io::ErrorKind::InvalidData, - err - ) - }) + Self::try_new(bytes).map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err)) } } - #[cfg(feature = "serde")] -impl serde::ser::Serialize for SID { +impl serde::ser::Serialize for SID { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer @@ -170,9 +144,8 @@ impl serde::ser::Serialize for SID { } } - #[cfg(feature = "serde")] -impl <'de, const N: usize> serde::de::Deserialize<'de> for SID { +impl<'de, const N: usize> serde::de::Deserialize<'de> for SID { fn deserialize(deserializer: D) -> Result where D: serde::de::Deserializer<'de> @@ -181,16 +154,13 @@ impl <'de, const N: usize> serde::de::Deserialize<'de> for SID { } } - #[cfg(feature = "serde")] mod serde_visitor { use super::SID; - pub struct SIDVisitor; - - impl <'de, const N: usize> serde::de::Visitor<'de> for SIDVisitor { + impl<'de, const N: usize> serde::de::Visitor<'de> for SIDVisitor { type Value = SID; fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -199,13 +169,14 @@ mod serde_visitor { fn visit_str(self, v: &str) -> Result where - E: serde::de::Error, + E: serde::de::Error { SID::try_from(v).map_err(|msg| { - serde::de::Error::custom( - format!("failed to deserialize `{}` as {}-byte short id string: {}", v, N, msg) - ) + serde::de::Error::custom(format!( + "failed to deserialize `{}` as {}-byte short id string: {}", + v, N, msg + )) }) } } -} \ No newline at end of file +} diff --git a/crates/primitives/src/types.rs b/crates/primitives/src/types.rs index 8c154c61..2fc60259 100644 --- a/crates/primitives/src/types.rs +++ b/crates/primitives/src/types.rs @@ -1,12 +1,12 @@ -use std::fmt::{Debug, Display, Formatter}; -use std::sync::Arc; - +use std::{ + fmt::{Debug, Display, Formatter}, + sync::Arc +}; pub type Name = &'static str; pub type BlockNumber = u64; pub type ItemIndex = u32; - #[cfg_attr(feature = "borsh", derive(borsh::BorshSerialize, borsh::BorshDeserialize))] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[cfg_attr(feature = "valuable", derive(valuable::Valuable))] @@ -16,13 +16,12 @@ pub struct BlockRef { pub hash: String } - impl BlockRef { pub fn set_hash(&mut self, hash: &str) { self.hash.clear(); self.hash.push_str(hash) } - + pub fn set_ptr(&mut self, ptr: BlockPtr) { self.number = ptr.number; self.set_hash(ptr.hash) @@ -36,7 +35,6 @@ impl BlockRef { } } - #[cfg_attr(feature = "valuable", derive(valuable::Valuable))] #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub struct BlockPtr<'a> { @@ -44,7 +42,6 @@ pub struct BlockPtr<'a> { pub hash: &'a str } - impl<'a> BlockPtr<'a> { pub fn to_ref(&self) -> BlockRef { BlockRef { @@ -54,24 +51,20 @@ impl<'a> BlockPtr<'a> { } } - impl Display for BlockRef { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}#{}", self.number, self.hash) } } - impl<'a> Display for BlockPtr<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}#{}", self.number, self.hash) } } - pub struct DisplayBlockRefOption<'a>(pub Option<&'a BlockRef>); - impl<'a> Display for DisplayBlockRefOption<'a> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { if let Some(r) = self.0 { @@ -82,7 +75,6 @@ impl<'a> Display for DisplayBlockRefOption<'a> { } } - pub trait Block { fn number(&self) -> BlockNumber; @@ -121,11 +113,9 @@ pub trait Block { } } - #[derive(Copy, Clone, Debug, Default, Eq, PartialEq)] pub struct DataMask(u32); - impl DataMask { pub fn get(&self, i: usize) -> bool { debug_assert!(i < 32); @@ -138,7 +128,6 @@ impl DataMask { } } - impl<'a, T: Block> Block for &'a T { #[inline] fn number(&self) -> BlockNumber { @@ -174,8 +163,7 @@ impl<'a, T: Block> Block for &'a T { fn parent_ptr(&self) -> BlockPtr<'_> { (*self).parent_ptr() } -} - +} impl Block for Arc { #[inline] @@ -214,12 +202,10 @@ impl Block for Arc { } } - pub trait AsBlockPtr { fn as_block_ptr(&self) -> BlockPtr<'_>; } - impl AsBlockPtr for BlockRef { #[inline] fn as_block_ptr(&self) -> BlockPtr<'_> { @@ -227,7 +213,6 @@ impl AsBlockPtr for BlockRef { } } - impl AsBlockPtr for B { #[inline] fn as_block_ptr(&self) -> BlockPtr<'_> { @@ -235,10 +220,9 @@ impl AsBlockPtr for B { } } - impl<'a> AsBlockPtr for BlockPtr<'a> { #[inline] fn as_block_ptr(&self) -> BlockPtr<'_> { *self } -} \ No newline at end of file +} diff --git a/crates/query-example/src/main.rs b/crates/query-example/src/main.rs index 0fdfca94..b03ae98f 100644 --- a/crates/query-example/src/main.rs +++ b/crates/query-example/src/main.rs @@ -1,11 +1,8 @@ +use std::{fs::File, io::Write, time::Instant}; + use anyhow::{bail, Context}; +use flate2::{write::GzEncoder, Compression}; use sqd_query::{ParquetChunk, Query}; -use std::fs::File; -use std::io::Write; -use std::time::Instant; -use flate2::Compression; -use flate2::write::GzEncoder; - fn main() -> anyhow::Result<()> { unsafe { diff --git a/crates/query/benches/main.rs b/crates/query/benches/main.rs index b184437c..e3ff37a6 100644 --- a/crates/query/benches/main.rs +++ b/crates/query/benches/main.rs @@ -6,9 +6,8 @@ mod query; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; - fn main() { // Workaround for scan benchmarks to appear let _ = sqd_query::Query::from_json_value(serde_json::json!({})).map(|q| q.compile()); divan::main() -} \ No newline at end of file +} diff --git a/crates/query/benches/query/mod.rs b/crates/query/benches/query/mod.rs index e551d8b3..9e2df721 100644 --- a/crates/query/benches/query/mod.rs +++ b/crates/query/benches/query/mod.rs @@ -1,6 +1,5 @@ mod util; - #[divan::bench_group(sample_size = 20)] mod whirlpool_swap { use super::util::{bench_solana_parquet_query, bench_solana_storage_query, query}; @@ -46,7 +45,6 @@ mod whirlpool_swap { } } - #[divan::bench_group(sample_size = 5, sample_count = 20)] mod solana_hard { use crate::query::util::{bench_parquet_query, query}; @@ -139,4 +137,4 @@ mod solana_hard { fn large(bench: divan::Bencher) { bench_parquet_query(bench, "benches/data/solana/large", &QUERY) } -} \ No newline at end of file +} diff --git a/crates/query/benches/query/util.rs b/crates/query/benches/query/util.rs index 939996a6..74971bb5 100644 --- a/crates/query/benches/query/util.rs +++ b/crates/query/benches/query/util.rs @@ -1,6 +1,5 @@ use sqd_query::{Chunk, JsonLinesWriter, Plan, Query}; - macro_rules! query { ($name:ident, $($json:tt)+) => { static $name: std::sync::LazyLock = std::sync::LazyLock::new(|| { @@ -11,17 +10,14 @@ macro_rules! query { } pub(crate) use query; - pub fn bench_solana_parquet_query(bench: divan::Bencher, query: &Query) { bench_parquet_query(bench, "benches/data/solana/200", query) } - pub fn bench_solana_storage_query(bench: divan::Bencher, query: &Query) { storage::bench_solana_query(bench, query) } - pub fn bench_parquet_query(bench: divan::Bencher, chunk_path: &str, query: &Query) { let chunk = sqd_query::ParquetChunk::new( std::path::Path::new(env!("CARGO_MANIFEST_DIR")) @@ -32,15 +28,11 @@ pub fn bench_parquet_query(bench: divan::Bencher, chunk_path: &str, query: &Quer bench_query(bench, &chunk, query) } - fn bench_query(bench: divan::Bencher, chunk: &dyn Chunk, query: &Query) { let plan = query.compile(); - bench.bench_local(|| { - perform_query(&plan, chunk).unwrap() - }) + bench.bench_local(|| perform_query(&plan, chunk).unwrap()) } - fn perform_query(plan: &Plan, chunk: &dyn Chunk) -> anyhow::Result> { sqd_polars::POOL.install(|| { let mut json_writer = JsonLinesWriter::new(Vec::new()); @@ -51,22 +43,18 @@ fn perform_query(plan: &Plan, chunk: &dyn Chunk) -> anyhow::Result> { }) } - mod storage { - use arrow::array::RecordBatchReader; - use arrow::datatypes::Schema; + use std::{collections::BTreeMap, fs::File, path::Path, sync::LazyLock}; + + use arrow::{array::RecordBatchReader, datatypes::Schema}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use sqd_data::solana::tables::SolanaChunkBuilder; use sqd_dataset::DatasetDescription; + use sqd_query::Query; use sqd_storage::db::{Chunk, Database, DatabaseSettings, DatasetId, DatasetKind}; - use std::collections::BTreeMap; - use std::fs::File; - use std::path::Path; - use std::sync::LazyLock; use tempfile::TempDir; - use sqd_query::Query; - use crate::query::util::bench_query; + use crate::query::util::bench_query; pub fn bench_solana_query(bench: divan::Bencher, query: &Query) { let db = DatabaseSettings::default() @@ -82,21 +70,18 @@ mod storage { bench_query(bench, &chunk_reader, query) } - static DB_DIR: LazyLock = LazyLock::new(|| { let dir = tempfile::tempdir().unwrap(); prepare_database(&dir).unwrap(); dir }); - fn prepare_database(dir: &TempDir) -> anyhow::Result<()> { let db = DatabaseSettings::default().open(dir.path())?; prepare_solana_chunk(&db)?; Ok(()) } - fn prepare_solana_chunk(db: &Database) -> anyhow::Result<()> { let dataset_id = DatasetId::try_from("solana").unwrap(); let dataset_kind = DatasetKind::try_from("solana").unwrap(); @@ -105,8 +90,7 @@ mod storage { let mut tables = BTreeMap::new(); - let parquet_chunk_path = Path::new(env!("CARGO_MANIFEST_DIR")) - .join("benches/data/solana/200"); + let parquet_chunk_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("benches/data/solana/200"); for item_result in std::fs::read_dir(&parquet_chunk_path)? { let item = item_result?.file_name(); @@ -121,13 +105,11 @@ mod storage { let mut builder = db.new_table_builder(parquet_reader.schema()); - builder.set_stats( - get_columns_with_stats( - &SolanaChunkBuilder::dataset_description(), - table, - &parquet_reader.schema() - ) - )?; + builder.set_stats(get_columns_with_stats( + &SolanaChunkBuilder::dataset_description(), + table, + &parquet_reader.schema() + ))?; while let Some(record_batch) = parquet_reader.next().transpose()? { builder.write_record_batch(&record_batch)?; @@ -137,29 +119,30 @@ mod storage { } } - db.insert_chunk(dataset_id, &Chunk::V0 { - first_block: 200000000, - last_block: 200000899, - last_block_hash: "hello".to_string(), - parent_block_hash: "".to_string(), - tables - })?; + db.insert_chunk( + dataset_id, + &Chunk::V0 { + first_block: 200000000, + last_block: 200000899, + last_block_hash: "hello".to_string(), + parent_block_hash: "".to_string(), + tables + } + )?; Ok(()) } - fn get_columns_with_stats(d: &DatasetDescription, name: &str, schema: &Schema) -> Vec { if let Some(table_desc) = d.tables.get(name) { - table_desc.options.column_options.iter() - .filter_map(|(&name, opts)| { - opts.stats_enable.then(|| { - schema.index_of(name).unwrap() - }) - }) + table_desc + .options + .column_options + .iter() + .filter_map(|(&name, opts)| opts.stats_enable.then(|| schema.index_of(name).unwrap())) .collect() } else { Vec::new() } } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/binary.rs b/crates/query/src/json/encoder/binary.rs index 68271732..11264792 100644 --- a/crates/query/src/json/encoder/binary.rs +++ b/crates/query/src/json/encoder/binary.rs @@ -1,21 +1,17 @@ -use crate::json::encoder::Encoder; use arrow::array::{BinaryArray, FixedSizeBinaryArray}; +use crate::json::encoder::Encoder; pub struct BinaryEncoder { array: BinaryArray } - impl BinaryEncoder { pub fn new(array: BinaryArray) -> Self { - Self { - array - } + Self { array } } } - impl Encoder for BinaryEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let bytes = self.array.value(idx); @@ -23,21 +19,16 @@ impl Encoder for BinaryEncoder { } } - pub struct FixedSizedBinaryEncoder { array: FixedSizeBinaryArray } - impl FixedSizedBinaryEncoder { pub fn new(array: FixedSizeBinaryArray) -> Self { - Self { - array - } + Self { array } } } - impl Encoder for FixedSizedBinaryEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let bytes = self.array.value(idx); @@ -45,7 +36,6 @@ impl Encoder for FixedSizedBinaryEncoder { } } - fn write_hex(bytes: &[u8], out: &mut Vec) { let offset = out.len(); let len = bytes.len() + 4; @@ -56,4 +46,4 @@ fn write_hex(bytes: &[u8], out: &mut Vec) { dst[2] = b'x'; faster_hex::hex_encode(bytes, &mut dst[3..len - 1]).unwrap(); dst[len - 1] = b'"'; -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/boolean.rs b/crates/query/src/json/encoder/boolean.rs index 98013a1c..259e4a1a 100644 --- a/crates/query/src/json/encoder/boolean.rs +++ b/crates/query/src/json/encoder/boolean.rs @@ -1,30 +1,22 @@ use arrow::buffer::BooleanBuffer; -use crate::json::encoder::Encoder; +use crate::json::encoder::Encoder; pub struct BooleanEncoder { values: BooleanBuffer } - impl BooleanEncoder { pub fn new(values: BooleanBuffer) -> Self { - Self { - values - } + Self { values } } } - impl Encoder for BooleanEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { match self.values.value(idx) { - true => { - out.extend_from_slice(b"true") - } - false => { - out.extend_from_slice(b"false") - } + true => out.extend_from_slice(b"true"), + false => out.extend_from_slice(b"false") } } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/factory.rs b/crates/query/src/json/encoder/factory.rs index 6ab66377..6886ed08 100644 --- a/crates/query/src/json/encoder/factory.rs +++ b/crates/query/src/json/encoder/factory.rs @@ -1,9 +1,14 @@ +use arrow::{ + array::{ + Array, AsArray, BinaryArray, BooleanArray, FixedSizeBinaryArray, GenericListArray, GenericStringArray, + PrimitiveArray, StructArray + }, + buffer::NullBuffer, + datatypes::* +}; + use super::*; use crate::primitives::SchemaError; -use arrow::array::{Array, AsArray, BinaryArray, BooleanArray, FixedSizeBinaryArray, GenericListArray, GenericStringArray, PrimitiveArray, StructArray}; -use arrow::buffer::NullBuffer; -use arrow::datatypes::*; - macro_rules! _ok_box { ($value:expr) => { @@ -11,7 +16,6 @@ macro_rules! _ok_box { }; } - macro_rules! _nullable_encoder { ($maybe_nulls:expr, $encoder:expr, $constructor:path) => { if let Some(nulls) = $maybe_nulls { @@ -22,7 +26,6 @@ macro_rules! _nullable_encoder { }; } - macro_rules! _primitive_encoder { ($array:expr, $t:ty, $constructor:path) => {{ let array = $array.as_primitive::<$t>(); @@ -31,7 +34,6 @@ macro_rules! _primitive_encoder { }}; } - macro_rules! _make_non_list_encoder { ($array:expr, $constructor:path) => {{ let array = $array; @@ -48,47 +50,47 @@ macro_rules! _make_non_list_encoder { DataType::Float64 => _primitive_encoder!(array, Float64Type, $constructor), DataType::Timestamp(TimeUnit::Second, _) => { _primitive_encoder!(array, TimestampSecondType, $constructor) - }, + } DataType::Timestamp(TimeUnit::Millisecond, _) => { _primitive_encoder!(array, TimestampMillisecondType, $constructor) - }, + } DataType::Timestamp(TimeUnit::Microsecond, _) => { _primitive_encoder!(array, TimestampMicrosecondType, $constructor) - }, + } DataType::Boolean => { let array = array.as_boolean(); let (values, nulls) = array.clone().into_parts(); _nullable_encoder!(nulls, BooleanEncoder::new(values), $constructor) - }, + } DataType::FixedSizeBinary(_) => { let array = array.as_fixed_size_binary().clone(); let nulls = array.nulls().cloned(); _nullable_encoder!(nulls, FixedSizedBinaryEncoder::new(array), $constructor) - }, + } DataType::Binary => { let array = array.as_binary::().clone(); let nulls = array.nulls().cloned(); _nullable_encoder!(nulls, BinaryEncoder::new(array), $constructor) - }, + } DataType::Utf8 => { let array = array.as_string::(); let nulls = array.nulls().cloned(); _nullable_encoder!(nulls, StringEncoder::new(array.clone()), $constructor) - }, + } DataType::Struct(_) => { let array = array.as_struct(); let encoder = make_struct_encoder(array)?; let nulls = array.nulls().cloned(); _nullable_encoder!(nulls, encoder, $constructor) - }, - _ => Err(SchemaError::new( - format!("unsupported arrow type - {}", array.data_type()) - )) - }} - }; + } + _ => Err(SchemaError::new(format!( + "unsupported arrow type - {}", + array.data_type() + ))) + } + }}; } - pub fn make_encoder(array: &dyn Array) -> Result { match array.data_type() { DataType::List(_) => { @@ -98,7 +100,7 @@ pub fn make_encoder(array: &dyn Array) -> Result { DataType::List(_) => { let item_encoder = make_encoder(&values)?; _nullable_encoder!(nulls, ListEncoder::new(item_encoder, offsets), _ok_box) - }, + } _ => { macro_rules! make_list_encoder { ($item_encoder:expr) => { @@ -107,13 +109,13 @@ pub fn make_encoder(array: &dyn Array) -> Result { } _make_non_list_encoder!(values, make_list_encoder) } - }.map_err(|err| err.at("item")) - }, + } + .map_err(|err| err.at("item")) + } _ => _make_non_list_encoder!(array, _ok_box) } } - pub fn make_struct_encoder(array: &StructArray) -> Result { let mut fields = Vec::with_capacity(array.columns().len()); for (array, &name) in array.columns().iter().zip(array.column_names().iter()) { @@ -124,7 +126,6 @@ pub fn make_struct_encoder(array: &StructArray) -> Result(encoder: E, maybe_nulls: Option) -> EncoderObject { if let Some(nulls) = maybe_nulls { @@ -134,10 +135,9 @@ pub fn make_nullable_encoder(encoder: E, maybe_nulls: Opti } } - pub fn extract_nulls(array: &dyn Array) -> Result<(Option>, Option), SchemaError> { if array.nulls().is_none() { - return Ok((None, None)) + return Ok((None, None)); } macro_rules! ok { @@ -169,40 +169,38 @@ pub fn extract_nulls(array: &dyn Array) -> Result<(Option>, Optio let array = array.as_boolean(); let (values, nulls) = array.clone().into_parts(); ok!(BooleanArray::new(values, None), nulls) - }, + } DataType::FixedSizeBinary(_) => { let array = array.as_fixed_size_binary(); let (size, values, nulls) = array.clone().into_parts(); ok!(FixedSizeBinaryArray::new(size, values, None), nulls) - }, + } DataType::Binary => { let array = array.as_binary::(); let (offsets, values, nulls) = array.clone().into_parts(); - ok!(unsafe { - BinaryArray::new_unchecked(offsets, values, None) - }, nulls) - }, + ok!(unsafe { BinaryArray::new_unchecked(offsets, values, None) }, nulls) + } DataType::Utf8 => { let array = array.as_string::(); let (offsets, values, nulls) = array.clone().into_parts(); - ok!(unsafe { - GenericStringArray::::new_unchecked(offsets, values, None) - }, nulls) - }, + ok!( + unsafe { GenericStringArray::::new_unchecked(offsets, values, None) }, + nulls + ) + } DataType::List(_) => { let array = array.as_list(); let (field, offsets, values, nulls) = array.clone().into_parts(); ok!(GenericListArray::::new(field, offsets, values, None), nulls) - }, + } DataType::Struct(_) => { let array = array.as_struct(); let (fields, columns, nulls) = array.clone().into_parts(); - ok!(unsafe { - StructArray::new_unchecked(fields, columns, None) - }, nulls) - }, - _ => Err(SchemaError::new( - format!("unsupported arrow type - {}", array.data_type()) - )) + ok!(unsafe { StructArray::new_unchecked(fields, columns, None) }, nulls) + } + _ => Err(SchemaError::new(format!( + "unsupported arrow type - {}", + array.data_type() + ))) } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/hex.rs b/crates/query/src/json/encoder/hex.rs index 79e6966b..6f68c51d 100644 --- a/crates/query/src/json/encoder/hex.rs +++ b/crates/query/src/json/encoder/hex.rs @@ -1,11 +1,9 @@ -use crate::json::encoder::Encoder; -use arrow::buffer::ScalarBuffer; -use arrow::datatypes::ArrowNativeType; +use arrow::{buffer::ScalarBuffer, datatypes::ArrowNativeType}; +use crate::json::encoder::Encoder; static TABLE: &[u8] = b"0123456789abcdef"; - pub trait HexEncode: ArrowNativeType + Send { type Buffer: Send + AsRef<[u8]>; @@ -14,7 +12,6 @@ pub trait HexEncode: ArrowNativeType + Send { fn encode(self, buf: &mut Self::Buffer); } - macro_rules! hex_encode { ($($t:ty),*) => { $( @@ -46,14 +43,12 @@ macro_rules! hex_encode { } hex_encode!(u8, u16, u32, u64, u128); - pub struct HexEncoder { values: ScalarBuffer, - buffer: N::Buffer, + buffer: N::Buffer } - -impl HexEncoder { +impl HexEncoder { pub fn new(values: ScalarBuffer) -> Self { Self { values, @@ -62,7 +57,6 @@ impl HexEncoder { } } - impl Encoder for HexEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { self.values[idx].encode(&mut self.buffer); @@ -70,12 +64,10 @@ impl Encoder for HexEncoder { } } - #[cfg(test)] mod tests { use super::HexEncode; - #[test] fn test_hex_write() { let mut buf = u16::init_buffer(); diff --git a/crates/query/src/json/encoder/json.rs b/crates/query/src/json/encoder/json.rs index cf3f58e0..73c6b6bb 100644 --- a/crates/query/src/json/encoder/json.rs +++ b/crates/query/src/json/encoder/json.rs @@ -1,28 +1,25 @@ -use arrow::buffer::{Buffer, OffsetBuffer}; -use arrow::datatypes::ArrowNativeType; -use crate::json::encoder::Encoder; +use arrow::{ + buffer::{Buffer, OffsetBuffer}, + datatypes::ArrowNativeType +}; +use crate::json::encoder::Encoder; pub struct JsonEncoder { buffer: Buffer, offsets: OffsetBuffer } - impl JsonEncoder { pub fn new(buffer: Buffer, offsets: OffsetBuffer) -> Self { - Self { - buffer, - offsets - } + Self { buffer, offsets } } } - impl Encoder for JsonEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let start = self.offsets[idx].as_usize(); let end = self.offsets[idx + 1].as_usize(); out.extend_from_slice(&self.buffer[start..end]) } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/list.rs b/crates/query/src/json/encoder/list.rs index a1108e00..4a904136 100644 --- a/crates/query/src/json/encoder/list.rs +++ b/crates/query/src/json/encoder/list.rs @@ -1,15 +1,12 @@ -use crate::json::encoder::util::json_close; -use crate::json::encoder::Encoder; -use arrow::buffer::OffsetBuffer; -use arrow::datatypes::ArrowNativeType; +use arrow::{buffer::OffsetBuffer, datatypes::ArrowNativeType}; +use crate::json::encoder::{util::json_close, Encoder}; pub struct ListEncoder { spread: ListSpreadEncoder } - -impl ListEncoder { +impl ListEncoder { pub fn new(encoder: E, offsets: OffsetBuffer) -> Self { Self { spread: ListSpreadEncoder::new(encoder, offsets) @@ -17,8 +14,7 @@ impl ListEncoder { } } - -impl Encoder for ListEncoder { +impl Encoder for ListEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { out.push(b'['); self.spread.encode(idx, out); @@ -26,24 +22,18 @@ impl Encoder for ListEncoder { } } - pub struct ListSpreadEncoder { encoder: E, offsets: OffsetBuffer } - -impl ListSpreadEncoder { +impl ListSpreadEncoder { pub fn new(encoder: E, offsets: OffsetBuffer) -> Self { - Self { - encoder, - offsets - } + Self { encoder, offsets } } } - -impl Encoder for ListSpreadEncoder { +impl Encoder for ListSpreadEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let start = self.offsets[idx].as_usize(); let end = self.offsets[idx + 1].as_usize(); @@ -52,4 +42,4 @@ impl Encoder for ListSpreadEncoder { out.push(b','); } } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/mod.rs b/crates/query/src/json/encoder/mod.rs index 92f767ac..b15ca436 100644 --- a/crates/query/src/json/encoder/mod.rs +++ b/crates/query/src/json/encoder/mod.rs @@ -6,11 +6,10 @@ mod json; mod list; mod nullable; mod primitive; -mod r#struct; mod string; +mod r#struct; pub mod util; - pub use binary::*; pub use boolean::*; pub use hex::*; @@ -21,15 +20,12 @@ pub use primitive::*; pub use r#struct::*; pub use string::*; - pub trait Encoder: Send { fn encode(&mut self, idx: usize, out: &mut Vec); } - pub type EncoderObject = Box; - impl Encoder for Box { fn encode(&mut self, idx: usize, out: &mut Vec) { self.as_mut().encode(idx, out) diff --git a/crates/query/src/json/encoder/nullable.rs b/crates/query/src/json/encoder/nullable.rs index 41b6cbbc..5b9e6264 100644 --- a/crates/query/src/json/encoder/nullable.rs +++ b/crates/query/src/json/encoder/nullable.rs @@ -1,24 +1,19 @@ use arrow::buffer::NullBuffer; -use crate::json::encoder::Encoder; +use crate::json::encoder::Encoder; pub struct NullableEncoder { encoder: E, nulls: NullBuffer } - -impl NullableEncoder { +impl NullableEncoder { pub fn new(encoder: E, nulls: NullBuffer) -> Self { - Self { - encoder, - nulls - } + Self { encoder, nulls } } } - -impl Encoder for NullableEncoder { +impl Encoder for NullableEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { if self.nulls.is_null(idx) { out.extend_from_slice(b"null") @@ -27,4 +22,3 @@ impl Encoder for NullableEncoder { } } } - diff --git a/crates/query/src/json/encoder/primitive.rs b/crates/query/src/json/encoder/primitive.rs index 27b74025..f0a5b490 100644 --- a/crates/query/src/json/encoder/primitive.rs +++ b/crates/query/src/json/encoder/primitive.rs @@ -1,8 +1,7 @@ -use arrow::buffer::ScalarBuffer; -use arrow::datatypes::ArrowNativeType; +use arrow::{buffer::ScalarBuffer, datatypes::ArrowNativeType}; use lexical_core::FormattedSize; -use crate::json::encoder::Encoder; +use crate::json::encoder::Encoder; pub trait PrimitiveEncode: ArrowNativeType + Send { type Buffer: Send; @@ -16,7 +15,6 @@ pub trait PrimitiveEncode: ArrowNativeType + Send { fn encode(self, buf: &mut Self::Buffer) -> &[u8]; } - macro_rules! integer_encode { ($($t:ty),*) => { $( @@ -36,7 +34,6 @@ macro_rules! integer_encode { } integer_encode!(i8, i16, i32, i64, u8, u16, u32, u64, i128); - macro_rules! float_encode { ($($t:ty),*) => { $( @@ -60,14 +57,12 @@ macro_rules! float_encode { } float_encode!(f32, f64); - pub struct PrimitiveEncoder { values: ScalarBuffer, - buffer: N::Buffer, + buffer: N::Buffer } - -impl PrimitiveEncoder { +impl PrimitiveEncoder { pub fn new(values: ScalarBuffer) -> Self { Self { values, @@ -76,14 +71,12 @@ impl PrimitiveEncoder { } } - impl Encoder for PrimitiveEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { out.extend_from_slice(self.values[idx].encode(&mut self.buffer)); } } - pub struct TimestampEncoder { values: ScalarBuffer, buffer: [u8; i64::FORMATTED_SIZE], @@ -91,7 +84,6 @@ pub struct TimestampEncoder { scale_divisor: i64 } - impl TimestampEncoder { pub fn new(values: ScalarBuffer, scale_multiplier: i64, scale_divisor: i64) -> Self { Self { @@ -103,10 +95,9 @@ impl TimestampEncoder { } } - impl Encoder for TimestampEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let value = self.values[idx] * self.scale_multiplier / self.scale_divisor; out.extend_from_slice(value.encode(&mut self.buffer)); } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/string.rs b/crates/query/src/json/encoder/string.rs index 0ae11bad..a09d5f3a 100644 --- a/crates/query/src/json/encoder/string.rs +++ b/crates/query/src/json/encoder/string.rs @@ -1,23 +1,17 @@ use arrow::array::StringArray; -use crate::json::encoder::Encoder; -use crate::json::encoder::util::encode_string; - +use crate::json::encoder::{util::encode_string, Encoder}; pub struct StringEncoder { array: StringArray } - impl StringEncoder { pub fn new(array: StringArray) -> Self { - Self { - array - } + Self { array } } } - impl Encoder for StringEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let s = self.array.value(idx); @@ -25,25 +19,20 @@ impl Encoder for StringEncoder { } } - pub struct SafeStringEncoder { encode: E } - -impl SafeStringEncoder { +impl SafeStringEncoder { pub fn new(encode: E) -> Self { - Self { - encode - } + Self { encode } } } - -impl Encoder for SafeStringEncoder { +impl Encoder for SafeStringEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { out.push(b'"'); self.encode.encode(idx, out); out.push(b'"') } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/struct.rs b/crates/query/src/json/encoder/struct.rs index 334a96d2..d1d5f33e 100644 --- a/crates/query/src/json/encoder/struct.rs +++ b/crates/query/src/json/encoder/struct.rs @@ -1,20 +1,17 @@ -use crate::json::encoder::{Encoder, EncoderObject}; -use crate::json::encoder::util::{json_close, make_object_prop}; - +use crate::json::encoder::{ + util::{json_close, make_object_prop}, + Encoder, EncoderObject +}; pub struct StructField { prop: Vec, value: EncoderObject } - impl StructField { pub fn new(name: &str, value: EncoderObject) -> Self { let prop = make_object_prop(name); - Self { - prop, - value - } + Self { prop, value } } #[inline] @@ -25,21 +22,16 @@ impl StructField { } } - pub struct StructEncoder { fields: Vec } - impl StructEncoder { pub fn new(fields: Vec) -> Self { - Self { - fields - } + Self { fields } } } - impl Encoder for StructEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { out.push(b'{'); @@ -48,4 +40,4 @@ impl Encoder for StructEncoder { } json_close(b'}', out) } -} \ No newline at end of file +} diff --git a/crates/query/src/json/encoder/util.rs b/crates/query/src/json/encoder/util.rs index a1bd07b2..f9cfca14 100644 --- a/crates/query/src/json/encoder/util.rs +++ b/crates/query/src/json/encoder/util.rs @@ -1,13 +1,11 @@ use convert_case::{Case, Casing}; use serde::Serializer; - pub fn encode_string(s: &str, out: &mut Vec) { let mut serializer = serde_json::Serializer::new(out); serializer.serialize_str(s).unwrap() } - #[inline] pub fn json_close(end: u8, out: &mut Vec) { let last = out.len() - 1; @@ -18,16 +16,14 @@ pub fn json_close(end: u8, out: &mut Vec) { } } - pub fn to_camel_case(s: &str) -> String { s.to_case(Case::Camel) } - pub fn make_object_prop(name: &str) -> Vec { let name = to_camel_case(name); let mut prop = Vec::with_capacity(name.len() + 3); encode_string(&name, &mut prop); prop.push(b':'); prop -} \ No newline at end of file +} diff --git a/crates/query/src/json/exp.rs b/crates/query/src/json/exp.rs index 8b205bfe..38d32800 100644 --- a/crates/query/src/json/exp.rs +++ b/crates/query/src/json/exp.rs @@ -1,13 +1,21 @@ -use crate::json::encoder::factory::{extract_nulls, make_encoder, make_nullable_encoder}; -use crate::json::encoder::util::json_close; -use crate::json::encoder::{Encoder, EncoderObject, HexEncode, HexEncoder, JsonEncoder, ListEncoder, ListSpreadEncoder, NullableEncoder, PrimitiveEncoder, PrimitiveEncode, SafeStringEncoder, StructEncoder, StructField, TimestampEncoder}; -use crate::primitives::{schema_error, Name, SchemaError}; -use arrow::array::{Array, AsArray, PrimitiveArray, StringArray, StructArray}; -use arrow::buffer::{NullBuffer, ScalarBuffer}; -use arrow::datatypes::{DataType, TimeUnit, TimestampMillisecondType, TimestampSecondType}; -use lexical_core::FormattedSize; use std::ops::Deref; +use arrow::{ + array::{Array, AsArray, PrimitiveArray, StringArray, StructArray}, + buffer::{NullBuffer, ScalarBuffer}, + datatypes::{DataType, TimeUnit, TimestampMillisecondType, TimestampSecondType} +}; +use lexical_core::FormattedSize; + +use crate::{ + json::encoder::{ + factory::{extract_nulls, make_encoder, make_nullable_encoder}, + util::json_close, + Encoder, EncoderObject, HexEncode, HexEncoder, JsonEncoder, ListEncoder, ListSpreadEncoder, NullableEncoder, + PrimitiveEncode, PrimitiveEncoder, SafeStringEncoder, StructEncoder, StructField, TimestampEncoder + }, + primitives::{schema_error, Name, SchemaError} +}; #[derive(Debug, Clone)] pub enum Exp { @@ -31,35 +39,31 @@ pub enum Exp { } } - impl Exp { - pub fn for_each_column(&self, f: &mut F) where F: FnMut(Name) { + pub fn for_each_column(&self, f: &mut F) + where + F: FnMut(Name) + { match self { - Exp::Object(props) => { - props.iter().for_each(|(_name, exp)| { - exp.for_each_column(f); - }) - }, - Exp::Prop(name, _) => { - f(name) - }, - Exp::Roll { columns, .. } => { - columns.iter().for_each(|name| f(name)) - }, + Exp::Object(props) => props.iter().for_each(|(_name, exp)| { + exp.for_each_column(f); + }), + Exp::Prop(name, _) => f(name), + Exp::Roll { columns, .. } => columns.iter().for_each(|name| f(name)), Exp::Enum { tag_column, variants } => { f(tag_column); variants.iter().for_each(|(_name, exp)| { exp.for_each_column(f); }) - }, - Exp::List(_) | - Exp::Value | - Exp::Json | - Exp::BigNum | - Exp::HexNum | - Exp::SolanaTransactionVersion | - Exp::TimestampSecond | - Exp::TimestampMillisecond => {}, + } + Exp::List(_) + | Exp::Value + | Exp::Json + | Exp::BigNum + | Exp::HexNum + | Exp::SolanaTransactionVersion + | Exp::TimestampSecond + | Exp::TimestampMillisecond => {} } } @@ -81,7 +85,6 @@ impl Exp { } } - macro_rules! extract_nulls { ($array:expr, $result_array:ident, $nulls:ident) => { let array = $array; @@ -90,24 +93,16 @@ macro_rules! extract_nulls { }; } - fn eval_json(array: &dyn Array) -> Result { let (offsets, buffer, nulls) = match array.data_type() { - DataType::Binary => { - array.as_binary().clone().into_parts() - }, - DataType::Utf8 => { - array.as_string().clone().into_parts() - }, - ty => return Err(schema_error!( - "Expected a raw JSON column, but got - {}", ty - )) + DataType::Binary => array.as_binary().clone().into_parts(), + DataType::Utf8 => array.as_string().clone().into_parts(), + ty => return Err(schema_error!("Expected a raw JSON column, but got - {}", ty)) }; let encoder = JsonEncoder::new(buffer, offsets); Ok(make_nullable_encoder(encoder, nulls)) } - fn eval_bignum(array: &dyn Array) -> Result { use arrow::datatypes::*; @@ -116,10 +111,7 @@ fn eval_bignum(array: &dyn Array) -> Result { let array = array.as_primitive::<$ty>(); let (_, buffer, nulls) = array.clone().into_parts(); let encoder = PrimitiveEncoder::new(buffer); - Ok(make_nullable_encoder( - SafeStringEncoder::new(encoder), - nulls - )) + Ok(make_nullable_encoder(SafeStringEncoder::new(encoder), nulls)) }}; } @@ -135,22 +127,17 @@ fn eval_bignum(array: &dyn Array) -> Result { DataType::Float32 => make!(Float32Type), DataType::Float64 => make!(Float64Type), DataType::Decimal128(_, 0) => make!(Decimal128Type), - ty => Err(schema_error!( - "Expected numeric primitive value, but got - {}", ty - )) + ty => Err(schema_error!("Expected numeric primitive value, but got - {}", ty)) } } - fn eval_hex(array: &dyn Array) -> Result { use arrow::datatypes::*; - fn make_encoder( - array: &PrimitiveArray, - ) -> Result + fn make_encoder(array: &PrimitiveArray) -> Result where T: ArrowPrimitiveType, - T::Native: HexEncode, + T::Native: HexEncode { let (_, buffer, nulls) = array.clone().into_parts(); let encoder = HexEncoder::new(buffer); @@ -163,12 +150,12 @@ fn eval_hex(array: &dyn Array) -> Result { DataType::UInt32 => make_encoder(array.as_primitive::()), DataType::UInt64 => make_encoder(array.as_primitive::()), ty => Err(schema_error!( - "Expected unsigned numeric primitive value, but got - {}", ty + "Expected unsigned numeric primitive value, but got - {}", + ty )) } } - fn eval_solana_transaction_version(array: &dyn Array) -> Result { use arrow::datatypes::Int16Type; @@ -180,22 +167,24 @@ fn eval_solana_transaction_version(array: &dyn Array) -> Result Result { let (unit, buffer, nulls) = match array.data_type() { DataType::Timestamp(TimeUnit::Second, _) => { let array = array.as_primitive::(); let (_, buffer, nulls) = array.clone().into_parts(); (TimeUnit::Second, buffer, nulls) - }, + } DataType::Timestamp(TimeUnit::Millisecond, _) => { let array = array.as_primitive::(); let (_, buffer, nulls) = array.clone().into_parts(); (TimeUnit::Millisecond, buffer, nulls) - }, - ty => return Err( - schema_error!("expected Timestamp measured in seconds or milliseconds, but got {}", ty) - ) + } + ty => { + return Err(schema_error!( + "expected Timestamp measured in seconds or milliseconds, but got {}", + ty + )) + } }; let (mul, div) = match (unit, target_unit) { (TimeUnit::Second, TimeUnit::Millisecond) => (1000, 1), @@ -207,7 +196,6 @@ fn eval_timestamp(array: &dyn Array, target_unit: TimeUnit) -> Result) -> Result { extract_nulls!(array, array, nulls); let mut fields = Vec::with_capacity(props.len()); @@ -220,7 +208,6 @@ fn eval_object(array: &dyn Array, props: &Vec<(Name, Exp)>) -> Result Result { let array = array.as_list::(); let (_, offsets, values, nulls) = array.clone().into_parts(); @@ -228,32 +215,30 @@ fn eval_list(array: &dyn Array, exp: &Exp) -> Result Ok(make_nullable_encoder(ListEncoder::new(item_encoder, offsets), nulls)) } - struct AllNullEncoder; - impl Encoder for AllNullEncoder { fn encode(&mut self, _idx: usize, out: &mut Vec) { out.extend_from_slice(b"null") } } - fn eval_prop(array: &dyn Array, name: Name, exp: &Exp) -> Result { - let array: &StructArray = array.as_any().downcast_ref().ok_or_else(|| { - schema_error!("expected a StructArray, but got {}", array.data_type()) - })?; + let array: &StructArray = array + .as_any() + .downcast_ref() + .ok_or_else(|| schema_error!("expected a StructArray, but got {}", array.data_type()))?; - let column_array = array.column_by_name(name).ok_or_else(|| { - schema_error!("column `{}` not found", name) - })?; + let column_array = array + .column_by_name(name) + .ok_or_else(|| schema_error!("column `{}` not found", name))?; if column_array.data_type() == &DataType::Null { let encoder: EncoderObject = Box::new(AllNullEncoder); if let Some(nulls) = array.nulls() { - return Ok(Box::new(NullableEncoder::new(encoder, nulls.clone()))) + return Ok(Box::new(NullableEncoder::new(encoder, nulls.clone()))); } else { - return Ok(encoder) + return Ok(encoder); } } @@ -266,22 +251,22 @@ fn eval_prop(array: &dyn Array, name: Name, exp: &Exp) -> Result, exp: &Exp) -> Result { extract_nulls!(array, array, array_nulls); - let struct_array: &StructArray = array.as_any().downcast_ref().ok_or_else(|| { - schema_error!("expected a StructArray, but got {}", array.data_type()) - })?; + let struct_array: &StructArray = array + .as_any() + .downcast_ref() + .ok_or_else(|| schema_error!("expected a StructArray, but got {}", array.data_type()))?; let mut non_nullable = Vec::with_capacity(columns.len()); let mut nullable = Vec::with_capacity(columns.len()); let mut spread: Option> = None; for (idx, name) in columns.iter().copied().enumerate() { - let item_array = struct_array.column_by_name(name).ok_or_else(|| { - schema_error!("column `{}` is not found", name) - })?; + let item_array = struct_array + .column_by_name(name) + .ok_or_else(|| schema_error!("column `{}` is not found", name))?; extract_nulls!(item_array, item_array, item_nulls); @@ -290,32 +275,28 @@ fn eval_roll(array: &dyn Array, columns: &Vec, exp: &Exp) -> Result 0 { - return Err( - SchemaError::new("list item of a roll is not supposed to be nullable") - .at("item") - .at(name) - ) + return Err(SchemaError::new("list item of a roll is not supposed to be nullable") + .at("item") + .at(name)); } let encoder = exp.eval(list.values()).map_err(|err| err.at("item").at(name))?; spread = Some(ListSpreadEncoder::new(encoder, list.offsets().clone())); - break + break; } if item_nulls.is_none() && nullable.len() > 0 { - return Err( - schema_error!( - "Failed to construct list roll: column `{}` is nullable, while the next in the roll `{}` is not", - columns[idx-1], - name - ) - ) + return Err(schema_error!( + "Failed to construct list roll: column `{}` is nullable, while the next in the roll `{}` is not", + columns[idx - 1], + name + )); } let encoder = exp.eval(item_array).map_err(|err| err.at(name))?; - + if let Some(nulls) = item_nulls { nullable.push((nulls, encoder)) } else { @@ -332,14 +313,12 @@ fn eval_roll(array: &dyn Array, columns: &Vec, exp: &Exp) -> Result, nullable: Vec<(NullBuffer, EncoderObject)>, spread: Option> } - impl Encoder for ListRollEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { out.push(b'['); @@ -350,7 +329,7 @@ impl Encoder for ListRollEncoder { for (nulls, item) in self.nullable.iter_mut() { if nulls.is_null(idx) { json_close(b']', out); - return + return; } item.encode(idx, out); out.push(b',') @@ -362,34 +341,27 @@ impl Encoder for ListRollEncoder { } } - -fn eval_enum( - array: &dyn Array, - tag_column: Name, - variants: &Vec<(Name, Exp)> -) -> Result -{ +fn eval_enum(array: &dyn Array, tag_column: Name, variants: &Vec<(Name, Exp)>) -> Result { extract_nulls!(array, array, array_nulls); - let struct_array: &StructArray = array.as_struct_opt().ok_or_else(|| { - schema_error!("expected a StructArray, but got {}", array.data_type()) - })?; + let struct_array: &StructArray = array + .as_struct_opt() + .ok_or_else(|| schema_error!("expected a StructArray, but got {}", array.data_type()))?; - let tag_array = struct_array.column_by_name(tag_column).ok_or_else(|| { - schema_error!("column `{}` not found", tag_column) - })?; + let tag_array = struct_array + .column_by_name(tag_column) + .ok_or_else(|| schema_error!("column `{}` not found", tag_column))?; extract_nulls!(tag_array, tag_array, tag_nulls); - let tag_array = tag_array.as_string_opt().ok_or_else(|| { - schema_error!("expected a StringArray, but got {}", tag_array.data_type()).at(tag_column) - })?; + let tag_array = tag_array + .as_string_opt() + .ok_or_else(|| schema_error!("expected a StringArray, but got {}", tag_array.data_type()).at(tag_column))?; - let variants = variants.iter().map(|(name, exp)| { - exp.eval(array).map(|encoder| { - (*name, encoder) - }) - }).collect::, _>>()?; + let variants = variants + .iter() + .map(|(name, exp)| exp.eval(array).map(|encoder| (*name, encoder))) + .collect::, _>>()?; let encoder = EnumEncoder { tag: tag_array.clone(), @@ -403,43 +375,38 @@ fn eval_enum( }) } - struct EnumEncoder { tag: StringArray, variants: Vec<(Name, EncoderObject)> } - impl Encoder for EnumEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let tag = self.tag.value(idx); for (v, e) in self.variants.iter_mut() { if *v == tag { e.encode(idx, out); - return + return; } } out.extend_from_slice(b"null") } } - struct SolanaTransactionVersionEncoder { values: ScalarBuffer, buffer: [u8; i16::FORMATTED_SIZE] } - impl SolanaTransactionVersionEncoder { fn new(values: ScalarBuffer) -> Self { Self { values, - buffer: i16::init_buffer(), + buffer: i16::init_buffer() } } } - impl Encoder for SolanaTransactionVersionEncoder { fn encode(&mut self, idx: usize, out: &mut Vec) { let value = self.values[idx]; diff --git a/crates/query/src/json/lang.rs b/crates/query/src/json/lang.rs index 0edfddca..da541bc7 100644 --- a/crates/query/src/json/lang.rs +++ b/crates/query/src/json/lang.rs @@ -1,13 +1,10 @@ -use crate::json::exp::Exp; -use crate::primitives::Name; - +use crate::{json::exp::Exp, primitives::Name}; #[derive(Debug, Clone)] pub struct JsonObject { props: Vec<(Name, Exp)> } - impl Default for JsonObject { fn default() -> Self { Self::new() @@ -16,9 +13,7 @@ impl Default for JsonObject { impl JsonObject { pub fn new() -> Self { - Self { - props: Vec::new() - } + Self { props: Vec::new() } } pub fn add>(&mut self, name: Name, exp: E) -> &mut Self { @@ -36,14 +31,12 @@ impl JsonObject { } } - impl From for Exp { fn from(val: JsonObject) -> Self { Exp::Object(val.props) } } - macro_rules! json_object { ( $({ @@ -88,7 +81,6 @@ macro_rules! json_object { } pub(crate) use json_object; - pub fn roll(exp: Exp, columns: Vec) -> Exp { Exp::Roll { columns, @@ -96,7 +88,6 @@ pub fn roll(exp: Exp, columns: Vec) -> Exp { } } - pub fn prop>(name: Name, exp: E) -> Exp { Exp::Prop(name, Box::new(exp.into())) -} \ No newline at end of file +} diff --git a/crates/query/src/json/mod.rs b/crates/query/src/json/mod.rs index cbcc33eb..69c3c6c8 100644 --- a/crates/query/src/json/mod.rs +++ b/crates/query/src/json/mod.rs @@ -1,3 +1,3 @@ pub mod encoder; pub mod exp; -pub mod lang; \ No newline at end of file +pub mod lang; diff --git a/crates/query/src/json_writer.rs b/crates/query/src/json_writer.rs index 38977d2f..a6dd79df 100644 --- a/crates/query/src/json_writer.rs +++ b/crates/query/src/json_writer.rs @@ -2,10 +2,10 @@ use std::io::Write; use arrow::array::{Array, RecordBatch, StructArray}; -use crate::json::encoder::Encoder; -use crate::json::encoder::factory::make_struct_encoder; -use crate::plan::BlockWriter; - +use crate::{ + json::encoder::{factory::make_struct_encoder, Encoder}, + plan::BlockWriter +}; pub struct JsonArrayWriter { write: W, @@ -14,8 +14,7 @@ pub struct JsonArrayWriter { rows_written: bool } - -impl JsonArrayWriter { +impl JsonArrayWriter { pub fn new(write: W) -> Self { Self { write, @@ -26,8 +25,7 @@ impl JsonArrayWriter { } } - -impl JsonArrayWriter { +impl JsonArrayWriter { pub fn write_batch(&mut self, batch: RecordBatch) -> anyhow::Result<()> { if batch.num_rows() == 0 { return Ok(()); @@ -83,23 +81,21 @@ impl JsonArrayWriter { } } - pub struct JsonLinesWriter { write: W, buf: Vec, - flush_threshold: usize, -} - + flush_threshold: usize +} -impl JsonLinesWriter { +impl JsonLinesWriter { pub fn new(write: W) -> Self { Self { write, buf: Vec::with_capacity(256 * 1024), - flush_threshold: 16 * 1024, + flush_threshold: 16 * 1024 } } - + pub fn write_blocks(&mut self, blocks: &mut BlockWriter) -> std::io::Result<()> { while blocks.has_next_block() { blocks.write_next_block(&mut self.buf); @@ -121,4 +117,4 @@ impl JsonLinesWriter { self.flush()?; Ok(self.write) } -} \ No newline at end of file +} diff --git a/crates/query/src/lib.rs b/crates/query/src/lib.rs index 318e4c23..7480ab9c 100644 --- a/crates/query/src/lib.rs +++ b/crates/query/src/lib.rs @@ -1,11 +1,10 @@ #![allow(dead_code)] -mod plan; -mod primitives; -mod scan; mod json; mod json_writer; +mod plan; +mod primitives; mod query; - +mod scan; pub use json_writer::*; pub use plan::{BlockWriter, Plan, UnexpectedBaseBlock}; @@ -13,5 +12,5 @@ pub use primitives::BlockNumber; pub use query::*; #[cfg(feature = "parquet")] pub use scan::parquet::ParquetChunk; -pub use scan::{Chunk, TableDoesNotExist, ColumnDoesNotExist}; +pub use scan::{Chunk, ColumnDoesNotExist, TableDoesNotExist}; pub use sqd_polars::set_polars_thread_pool_size; diff --git a/crates/query/src/plan/key.rs b/crates/query/src/plan/key.rs index a0cf6988..7dee234e 100644 --- a/crates/query/src/plan/key.rs +++ b/crates/query/src/plan/key.rs @@ -1,8 +1,9 @@ use std::ops::Deref; -use arrow::array::{Array, ArrowPrimitiveType, AsArray, OffsetSizeTrait, StringArray}; -use arrow::buffer::{OffsetBuffer, ScalarBuffer}; - +use arrow::{ + array::{Array, ArrowPrimitiveType, AsArray, OffsetSizeTrait, StringArray}, + buffer::{OffsetBuffer, ScalarBuffer} +}; pub trait Key { type Item: ?Sized; @@ -10,13 +11,11 @@ pub trait Key { fn get(&self, idx: usize) -> &Self::Item; } - pub struct PrimitiveKey { values: ScalarBuffer } - -impl Key for PrimitiveKey { +impl Key for PrimitiveKey { type Item = A::Native; #[inline] @@ -25,19 +24,15 @@ impl Key for PrimitiveKey { } } - -impl From<&dyn Array> for PrimitiveKey { +impl From<&dyn Array> for PrimitiveKey { fn from(value: &dyn Array) -> Self { let array = value.as_primitive::(); let values = array.values().clone(); - Self { - values - } + Self { values } } } - -impl <'a> Key for StringArray { +impl<'a> Key for StringArray { type Item = str; #[inline] @@ -46,18 +41,15 @@ impl <'a> Key for StringArray { } } - pub type PrimitiveListKey = PrimitiveGenericListKey; pub type PrimitiveLargeListKey = PrimitiveGenericListKey; - pub struct PrimitiveGenericListKey { offsets: OffsetBuffer, values: ScalarBuffer } - -impl Key for PrimitiveGenericListKey { +impl Key for PrimitiveGenericListKey { type Item = [A::Native]; fn get(&self, idx: usize) -> &Self::Item { @@ -67,34 +59,24 @@ impl Key for PrimitiveGenericListKey } } - -impl From<&dyn Array> for PrimitiveGenericListKey { +impl From<&dyn Array> for PrimitiveGenericListKey { fn from(value: &dyn Array) -> Self { let array = value.as_list(); let offsets = array.offsets().clone(); - let values = array.values() - .as_primitive::() - .values() - .clone(); - Self { - offsets, - values - } + let values = array.values().as_primitive::().values().clone(); + Self { offsets, values } } } - pub type ListKey = GenericListKey; pub type LargeListKey = GenericListKey; - pub struct GenericListKey { offsets: OffsetBuffer, item: K } - -impl GenericListKey { +impl GenericListKey { pub fn get(&self, idx: usize) -> OffsetKey<'_, K> { let offset = self.offsets[idx].as_usize(); let len = self.offsets[idx + 1].as_usize() - offset; @@ -106,22 +88,19 @@ impl GenericListKey { } } - pub struct OffsetKey<'a, K> { item: &'a K, offset: usize, len: usize } - -impl <'a, K> OffsetKey<'a, K> { +impl<'a, K> OffsetKey<'a, K> { pub fn len(&self) -> usize { self.len } } - -impl Key for OffsetKey<'_, K> { +impl Key for OffsetKey<'_, K> { type Item = K::Item; fn get(&self, idx: usize) -> &Self::Item { @@ -129,17 +108,14 @@ impl Key for OffsetKey<'_, K> { } } - -impl From<&dyn Array> for GenericListKey - where K: for<'a> From<&'a dyn Array> +impl From<&dyn Array> for GenericListKey +where + K: for<'a> From<&'a dyn Array> { fn from(value: &dyn Array) -> Self { let array = value.as_list(); let offsets = array.offsets().clone(); let item = K::from(array.values()); - Self { - offsets, - item - } + Self { offsets, item } } -} \ No newline at end of file +} diff --git a/crates/query/src/plan/mod.rs b/crates/query/src/plan/mod.rs index 9a51b5d0..82ee8e2c 100644 --- a/crates/query/src/plan/mod.rs +++ b/crates/query/src/plan/mod.rs @@ -1,12 +1,11 @@ -mod rel; +mod key; mod plan; -mod row_list; -mod table; +mod rel; mod result; +mod row_list; mod sort; -mod key; - +mod table; pub use plan::*; -pub use table::*; pub use result::*; +pub use table::*; diff --git a/crates/query/src/plan/plan.rs b/crates/query/src/plan/plan.rs index cc697c78..e9557053 100644 --- a/crates/query/src/plan/plan.rs +++ b/crates/query/src/plan/plan.rs @@ -1,16 +1,22 @@ -use crate::json::exp::Exp; -use crate::plan::rel::Rel; -use crate::plan::result::{BlockWriter, DataItem}; -use crate::plan::row_list::RowList; -use crate::plan::table::{ColumnWeight, TableSet}; -use crate::primitives::{BlockNumber, Name, RowRangeList, RowWeight, RowWeightPolarsType}; -use crate::scan::{col_between, col_gt_eq, col_lt_eq, Chunk, RowPredicateRef}; -use crate::UnexpectedBaseBlock; +use std::collections::{HashMap, HashSet}; + use anyhow::{anyhow, bail}; use rayon::prelude::*; use sqd_polars::arrow::record_batch_vec_to_lazy_polars_df; use sqd_primitives::BlockRef; -use std::collections::{HashMap, HashSet}; + +use crate::{ + json::exp::Exp, + plan::{ + rel::Rel, + result::{BlockWriter, DataItem}, + row_list::RowList, + table::{ColumnWeight, TableSet} + }, + primitives::{BlockNumber, Name, RowRangeList, RowWeight, RowWeightPolarsType}, + scan::{col_between, col_gt_eq, col_lt_eq, Chunk, RowPredicateRef}, + UnexpectedBaseBlock +}; type Idx = usize; @@ -18,7 +24,7 @@ struct Scan { table: Name, predicate: Option, relations: Vec, - output: Option, + output: Option } struct Output { @@ -28,7 +34,7 @@ struct Output { weight_per_row: RowWeight, weight_columns: Vec, exp: Exp, - item_name: Name, + item_name: Name } pub struct Plan { @@ -39,7 +45,7 @@ pub struct Plan { include_all_blocks: bool, parent_block_hash: Option, first_block: Option, - last_block: Option, + last_block: Option } impl Plan { @@ -47,9 +53,9 @@ impl Plan { PlanExecution { chunk: ChunkWithDefaults { chunk: data_chunk, - tables: self.tables, + tables: self.tables }, - plan: self, + plan: self } .execute() } @@ -73,7 +79,7 @@ impl Plan { /// treated as all-null rather than causing errors. struct ChunkWithDefaults<'a> { chunk: &'a dyn Chunk, - tables: &'static TableSet, + tables: &'static TableSet } impl Chunk for ChunkWithDefaults<'_> { @@ -89,7 +95,7 @@ impl Chunk for ChunkWithDefaults<'_> { struct PlanExecution<'a> { chunk: ChunkWithDefaults<'a>, - plan: &'a Plan, + plan: &'a Plan } impl<'a> PlanExecution<'a> { @@ -115,22 +121,17 @@ impl<'a> PlanExecution<'a> { fn check_parent_block(&self) -> anyhow::Result<()> { let parent_hash = match self.plan.parent_block_hash.as_ref() { Some(s) => s.as_str(), - None => return Ok(()), + None => return Ok(()) }; let block_number = match self.plan.first_block { Some(bn) => bn, - None => bail!( - "invalid plan: parent block hash is specified, but block number is not available" - ), + None => bail!("invalid plan: parent block hash is specified, but block number is not available") }; let block_scan = self.chunk.scan_table(self.plan.outputs[0].table)?; - let has_parent_number = block_scan - .schema() - .column_with_name("parent_number") - .is_some(); + let has_parent_number = block_scan.schema().column_with_name("parent_number").is_some(); let (number_col, predicate_upper) = if has_parent_number { ("parent_number", block_number.saturating_sub(1)) @@ -144,14 +145,12 @@ impl<'a> PlanExecution<'a> { .with_predicate(col_between( number_col, block_number.saturating_sub(100), - predicate_upper, + predicate_upper )) .to_lazy_df()? .collect()?; - let numbers = df - .column(number_col)? - .cast(&sqd_polars::prelude::DataType::UInt64)?; + let numbers = df.column(number_col)?.cast(&sqd_polars::prelude::DataType::UInt64)?; let numbers = numbers.u64()?; let hashes = df.column("parent_hash")?; @@ -164,8 +163,12 @@ impl<'a> PlanExecution<'a> { .expect("block number can't be null according to the predicate applied"); let hash = hashes.get(i).unwrap_or(""); BlockRef { - number: if has_parent_number { number } else { number.saturating_sub(1) }, - hash: hash.to_string(), + number: if has_parent_number { + number + } else { + number.saturating_sub(1) + }, + hash: hash.to_string() } }) .collect(); @@ -192,33 +195,26 @@ impl<'a> PlanExecution<'a> { /// its predicate. These row indexes are distributed to relation inputs /// (for cross-table joins in `execute_relations`) and/or output inputs (for direct /// data reading in `execute_output`). Scans run in parallel. - fn execute_scans( - &self, - relation_inputs: &Vec, - output_inputs: &Vec, - ) -> anyhow::Result<()> { - self.plan - .scans - .par_iter() - .try_for_each(|scan| -> anyhow::Result<()> { - let rows = self - .chunk - .scan_table(scan.table)? - .with_row_index(true) - .with_columns([]) - .with_predicate(scan.predicate.clone()) - .execute()?; - - for rel_idx in scan.relations.iter() { - relation_inputs[*rel_idx].extend_from_record_batch_vec(&rows); - } + fn execute_scans(&self, relation_inputs: &Vec, output_inputs: &Vec) -> anyhow::Result<()> { + self.plan.scans.par_iter().try_for_each(|scan| -> anyhow::Result<()> { + let rows = self + .chunk + .scan_table(scan.table)? + .with_row_index(true) + .with_columns([]) + .with_predicate(scan.predicate.clone()) + .execute()?; + + for rel_idx in scan.relations.iter() { + relation_inputs[*rel_idx].extend_from_record_batch_vec(&rows); + } - if let Some(idx) = &scan.output { - output_inputs[*idx].extend_from_record_batch_vec(&rows) - } + if let Some(idx) = &scan.output { + output_inputs[*idx].extend_from_record_batch_vec(&rows) + } - Ok(()) - }) + Ok(()) + }) } /// Propagate row selections through relations. @@ -227,13 +223,11 @@ impl<'a> PlanExecution<'a> { /// For each relation, the matched rows from `execute_scans` are used to find /// corresponding rows in the relation's output table, adding them /// to that table's output inputs. Relations run in parallel. - fn execute_relations( - &self, - relation_inputs: Vec, - output_inputs: &Vec, - ) -> anyhow::Result<()> { - relation_inputs.into_par_iter().enumerate().try_for_each( - |(idx, row_list)| -> anyhow::Result<()> { + fn execute_relations(&self, relation_inputs: Vec, output_inputs: &Vec) -> anyhow::Result<()> { + relation_inputs + .into_par_iter() + .enumerate() + .try_for_each(|(idx, row_list)| -> anyhow::Result<()> { let input = row_list.into_inner(); if input.is_empty() { return Ok(()); @@ -241,8 +235,7 @@ impl<'a> PlanExecution<'a> { let rel = &self.plan.relations[idx]; let output = &output_inputs[self.get_output_index(rel.output_table())]; rel.eval(&self.chunk, &input, output) - }, - ) + }) } /// Read actual column data and build the output. @@ -300,11 +293,7 @@ impl<'a> PlanExecution<'a> { weight_exp = weight_exp.alias("weight"); let rows = df - .select([ - col("row_index"), - col(output.key[0]).alias("block_number"), - weight_exp, - ]) + .select([col("row_index"), col(output.key[0]).alias("block_number"), weight_exp]) .collect()?; Ok(rows) @@ -322,14 +311,14 @@ impl<'a> PlanExecution<'a> { for df in rows.iter().skip(1).filter(|df| !df.is_empty()) { item_union.push(df.clone().lazy().select([ col("block_number"), - col("weight").strict_cast(RowWeightPolarsType::get_dtype()), + col("weight").strict_cast(RowWeightPolarsType::get_dtype()) ])) } let block_weights = if self.plan.include_all_blocks { item_union.push(header_rows.clone().lazy().select([ col("block_number"), - col("weight").strict_cast(RowWeightPolarsType::get_dtype()), + col("weight").strict_cast(RowWeightPolarsType::get_dtype()) ])); concat(item_union, UnionArgs::default())? .group_by([col("block_number")]) @@ -340,7 +329,7 @@ impl<'a> PlanExecution<'a> { .lazy() .select([ min("block_number").alias("first_block"), - max("block_number").alias("last_block"), + max("block_number").alias("last_block") ]) .collect()?; @@ -353,7 +342,7 @@ impl<'a> PlanExecution<'a> { block_numbers, Series::new("weight".into(), &[0 as RowWeight, 0 as RowWeight]), ])? - .lazy(), + .lazy() ); let item_stats = concat(item_union, UnionArgs::default())? @@ -366,7 +355,7 @@ impl<'a> PlanExecution<'a> { .left_join(item_stats, col("block_number"), col("block_number")) .select([ col("block_number"), - (col("weight") + col("weight_right")).alias("weight"), + (col("weight") + col("weight_right")).alias("weight") ]) }; @@ -398,7 +387,7 @@ impl<'a> PlanExecution<'a> { let data_items_mutex = parking_lot::Mutex::new( std::iter::repeat_with(|| None) .take(self.plan.outputs.len()) - .collect::>(), + .collect::>() ); rows.into_par_iter() @@ -411,24 +400,16 @@ impl<'a> PlanExecution<'a> { let output = &self.plan.outputs[idx]; let row_index = if idx == 0 && !self.plan.include_all_blocks { - rows.lazy().semi_join( - selected_blocks.clone().lazy(), - col("block_number"), - col("block_number"), - ) - } else { rows.lazy() - .filter(col("block_number").lt_eq(lit(last_block))) + .semi_join(selected_blocks.clone().lazy(), col("block_number"), col("block_number")) + } else { + rows.lazy().filter(col("block_number").lt_eq(lit(last_block))) } .select([col("row_index")]) .collect()?; let row_selection = RowRangeList::from_sorted_indexes( - row_index - .column("row_index") - .unwrap() - .u32()? - .into_no_null_iter(), + row_index.column("row_index").unwrap().u32()?.into_no_null_iter() ); let records = self @@ -446,20 +427,12 @@ impl<'a> PlanExecution<'a> { })?; Ok(Some(BlockWriter::new( - data_items_mutex - .into_inner() - .into_iter() - .flatten() - .collect(), + data_items_mutex.into_inner().into_iter().flatten().collect() ))) } fn get_output_index(&self, table: Name) -> usize { - self.plan - .outputs - .iter() - .position(|o| o.table == table) - .unwrap() + self.plan.outputs.iter().position(|o| o.table == table).unwrap() } fn get_block_number_predicate(&self, output_idx: usize) -> Option { @@ -468,7 +441,7 @@ impl<'a> PlanExecution<'a> { (Some(fst), Some(lst)) => Some(col_between(column, fst, lst)), (None, Some(lst)) => Some(col_lt_eq(column, lst)), (Some(fst), None) => Some(col_gt_eq(column, fst)), - (None, None) => None, + (None, None) => None } } } @@ -481,7 +454,7 @@ pub struct PlanBuilder { include_all_blocks: bool, parent_block_hash: Option, first_block: Option, - last_block: Option, + last_block: Option } impl PlanBuilder { @@ -499,13 +472,13 @@ impl PlanBuilder { weight_per_row: 0, weight_columns: Vec::new(), exp: Exp::Object(vec![]), - item_name: table.result_item_name, + item_name: table.result_item_name }) .collect(), include_all_blocks: false, parent_block_hash: None, first_block: None, - last_block: None, + last_block: None } } @@ -514,14 +487,11 @@ impl PlanBuilder { table, predicate: None, output: Some(self.tables.get_index(table)), - relations: Vec::new(), + relations: Vec::new() }; let scan_idx = self.scans.len(); self.scans.push(scan); - ScanBuilder { - plan: self, - scan_idx, - } + ScanBuilder { plan: self, scan_idx } } pub fn set_projection>(&mut self, table: Name, exp: E) -> &mut Self { @@ -561,7 +531,7 @@ impl PlanBuilder { include_all_blocks: self.include_all_blocks, parent_block_hash: self.parent_block_hash, first_block: self.first_block, - last_block: self.last_block, + last_block: self.last_block } } @@ -579,11 +549,7 @@ impl PlanBuilder { let mut weight_columns = Vec::new(); for col in output.projection.iter() { - match table - .column_weights - .get(col) - .unwrap_or(&ColumnWeight::Fixed(32)) - { + match table.column_weights.get(col).unwrap_or(&ColumnWeight::Fixed(32)) { ColumnWeight::Fixed(weight) => { per_row += weight; } @@ -598,13 +564,10 @@ impl PlanBuilder { } fn add_rel(&mut self, rel: Rel) -> usize { - self.relations - .iter() - .position(|r| r == &rel) - .unwrap_or_else(|| { - self.relations.push(rel); - self.relations.len() - 1 - }) + self.relations.iter().position(|r| r == &rel).unwrap_or_else(|| { + self.relations.push(rel); + self.relations.len() - 1 + }) } /// Lame plan optimization procedure @@ -703,11 +666,7 @@ impl PlanBuilder { // Introduce new scans to populate that. let mut new_scans: HashMap = HashMap::new(); - for out_idx in is_full - .iter() - .enumerate() - .filter_map(|(idx, full)| full.then_some(idx)) - { + for out_idx in is_full.iter().enumerate().filter_map(|(idx, full)| full.then_some(idx)) { let table = self.outputs[out_idx].table; new_scans.insert( table, @@ -715,17 +674,12 @@ impl PlanBuilder { table, predicate: None, relations: vec![], - output: Some(out_idx), - }, + output: Some(out_idx) + } ); } - for (idx, rel) in self - .relations - .iter() - .enumerate() - .filter(|(idx, _)| is_full_rel[*idx]) - { + for (idx, rel) in self.relations.iter().enumerate().filter(|(idx, _)| is_full_rel[*idx]) { // The new scan must read the relation's *input* table, since // execute_scans feeds scan rows into relation_inputs[rel_idx], // which eval_join etc. interpret as row indexes of input_table. @@ -734,7 +688,7 @@ impl PlanBuilder { table, predicate: None, relations: vec![], - output: None, + output: None }); scan.relations.push(idx); } @@ -803,15 +757,14 @@ impl PlanBuilder { input_table, input_key, output_table, - output_key, + output_key } => { let input_desc = self.tables.get(input_table); - input_key == &input_desc.primary_key - && input_desc.children.get(output_table) == Some(output_key) + input_key == &input_desc.primary_key && input_desc.children.get(output_table) == Some(output_key) } Rel::Children { .. } => true, Rel::Parents { .. } => true, - _ => false, + _ => false } } } @@ -827,7 +780,7 @@ fn remove_elements(vec: &mut Vec, remove_mask: &[bool]) { pub struct ScanBuilder<'a> { plan: &'a mut PlanBuilder, - scan_idx: usize, + scan_idx: usize } impl<'a> ScanBuilder<'a> { @@ -836,17 +789,12 @@ impl<'a> ScanBuilder<'a> { self } - pub fn join( - &mut self, - output_table: Name, - output_key: Vec, - scan_key: Vec, - ) -> &mut Self { + pub fn join(&mut self, output_table: Name, output_key: Vec, scan_key: Vec) -> &mut Self { let rel_idx = self.plan.add_rel(Rel::Join { input_table: self.plan.scans[self.scan_idx].table, input_key: scan_key, output_table, - output_key, + output_key }); self.scan_mut().relations.push(rel_idx); self @@ -872,13 +820,13 @@ impl<'a> ScanBuilder<'a> { &mut self, output_table: Name, output_key: Vec, - scan_key: Vec, + scan_key: Vec ) -> &mut Self { let rel_idx = self.plan.add_rel(Rel::ForeignChildren { input_table: self.plan.scans[self.scan_idx].table, input_key: scan_key, output_table, - output_key, + output_key }); self.scan_mut().relations.push(rel_idx); self @@ -888,13 +836,13 @@ impl<'a> ScanBuilder<'a> { &mut self, output_table: Name, output_key: Vec, - scan_key: Vec, + scan_key: Vec ) -> &mut Self { let rel_idx = self.plan.add_rel(Rel::ForeignParents { input_table: self.plan.scans[self.scan_idx].table, input_key: scan_key, output_table, - output_key, + output_key }); self.scan_mut().relations.push(rel_idx); self @@ -910,12 +858,12 @@ impl<'a> ScanBuilder<'a> { } } - #[cfg(test)] mod tests { + use std::sync::OnceLock; + use super::*; use crate::scan::col_gt_eq; - use std::sync::OnceLock; /// Tables with a `transactions` parent and `logs` child, plus an explicit /// `add_child("logs", ...)` registration on transactions. The logs side has @@ -927,11 +875,8 @@ mod tests { TABLES.get_or_init(|| { let mut t = TableSet::new(); t.add_table("blocks", vec!["number"]); - t.add_table( - "transactions", - vec!["block_number", "transaction_index"], - ) - .add_child("logs", vec!["block_number", "transaction_index"]); + t.add_table("transactions", vec!["block_number", "transaction_index"]) + .add_child("logs", vec!["block_number", "transaction_index"]); t.add_table("logs", vec!["block_number", "log_index"]); t }) @@ -963,13 +908,11 @@ mod tests { #[test] fn simplify_attaches_surviving_relations_to_input_table_scan() { let mut builder = PlanBuilder::new(evm_like_tables()); - builder - .add_scan("logs") - .join( - "transactions", - vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], - ); + builder.add_scan("logs").join( + "transactions", + vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] + ); let plan = builder.build(); @@ -996,13 +939,11 @@ mod tests { #[test] fn simplify_drops_full_rel_and_replaces_with_direct_scan() { let mut builder = PlanBuilder::new(full_rel_tables()); - builder - .add_scan("logs") - .join( - "transactions", - vec!["block_number", "log_index"], - vec!["block_number", "log_index"], - ); + builder.add_scan("logs").join( + "transactions", + vec!["block_number", "log_index"], + vec!["block_number", "log_index"] + ); let plan = builder.build(); @@ -1038,7 +979,7 @@ mod tests { .join( "transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); let plan = builder.build(); @@ -1073,13 +1014,13 @@ mod tests { input_table: "logs", input_key: vec!["block_number", "transaction_index"], output_table: "transactions", - output_key: vec!["block_number", "transaction_index"], + output_key: vec!["block_number", "transaction_index"] }); builder.scans.push(Scan { table: "transactions", predicate: None, relations: vec![0], - output: None, + output: None }); builder.assert_scan_relation_invariant(); } diff --git a/crates/query/src/plan/rel.rs b/crates/query/src/plan/rel.rs index d2a53d35..11ca41fb 100644 --- a/crates/query/src/plan/rel.rs +++ b/crates/query/src/plan/rel.rs @@ -1,13 +1,20 @@ -use crate::plan::key::{GenericListKey, Key, PrimitiveGenericListKey}; -use crate::plan::row_list::RowList; -use crate::primitives::{schema_error, Name, RowIndex, RowIndexArrowType, RowRangeList, SchemaError}; -use crate::scan::Chunk; +use std::collections::{BTreeSet, HashSet}; + use anyhow::bail; -use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait}; -use arrow::datatypes::{DataType, Int32Type, UInt16Type, UInt32Type}; +use arrow::{ + array::{Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait}, + datatypes::{DataType, Int32Type, UInt16Type, UInt32Type} +}; use sqd_polars::arrow::{polars_series_to_arrow_array, polars_series_to_row_index_iter}; -use std::collections::{BTreeSet, HashSet}; +use crate::{ + plan::{ + key::{GenericListKey, Key, PrimitiveGenericListKey}, + row_list::RowList + }, + primitives::{schema_error, Name, RowIndex, RowIndexArrowType, RowRangeList, SchemaError}, + scan::Chunk +}; #[derive(PartialEq, Eq, Hash, Clone)] pub enum Rel { @@ -39,7 +46,6 @@ pub enum Rel { } } - impl Rel { pub fn output_table(&self) -> Name { match self { @@ -61,55 +67,32 @@ impl Rel { } } - pub fn eval( - &self, - chunk: &dyn Chunk, - input: &BTreeSet, - output: &RowList - ) -> anyhow::Result<()> - { + pub fn eval(&self, chunk: &dyn Chunk, input: &BTreeSet, output: &RowList) -> anyhow::Result<()> { match self { Rel::Join { input_table, input_key, output_table, output_key - } => { - eval_join(chunk, input, input_table, input_key, output_table, output_key, output) - }, + } => eval_join(chunk, input, input_table, input_key, output_table, output_key, output), Rel::ForeignChildren { input_table, input_key, output_table, output_key - } => { - eval_foreign_children(chunk, input, input_table, input_key, output_table, output_key, output) - }, + } => eval_foreign_children(chunk, input, input_table, input_key, output_table, output_key, output), Rel::ForeignParents { input_table, input_key, output_table, output_key - } => { - eval_foreign_parents(chunk, input, input_table, input_key, output_table, output_key, output) - }, - Rel::Children { - table, - key - } => { - eval_children(chunk, input, table, key, output) - }, - Rel::Parents { - table, - key - } => { - eval_parents(chunk, input, table, key, output) - } + } => eval_foreign_parents(chunk, input, input_table, input_key, output_table, output_key, output), + Rel::Children { table, key } => eval_children(chunk, input, table, key, output), + Rel::Parents { table, key } => eval_parents(chunk, input, table, key, output) } } } - fn eval_join( chunk: &dyn Chunk, input: &BTreeSet, @@ -118,72 +101,65 @@ fn eval_join( output_table: Name, output_key: &[Name], output: &RowList -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { use sqd_polars::prelude::*; - - let input_rows = chunk.scan_table(input_table)? + + let input_rows = chunk + .scan_table(input_table)? .with_row_selection(RowRangeList::from_sorted_indexes(input.iter().copied())) .with_columns(input_key.iter().copied()) .to_lazy_df()?; - let output_rows = chunk.scan_table(output_table)? + let output_rows = chunk + .scan_table(output_table)? .with_row_index(true) .with_columns(output_key.iter().copied()) .to_lazy_df()?; - let result = output_rows.join( - input_rows, - output_key.iter().copied().map(col).collect::>(), - input_key.iter().copied().map(col).collect::>(), - JoinArgs::new(JoinType::Semi) - ).select([ - col("row_index") - ]).collect()?; + let result = output_rows + .join( + input_rows, + output_key.iter().copied().map(col).collect::>(), + input_key.iter().copied().map(col).collect::>(), + JoinArgs::new(JoinType::Semi) + ) + .select([col("row_index")]) + .collect()?; output.extend_from_polars_df(&result); Ok(()) } - fn eval_children( chunk: &dyn Chunk, input: &BTreeSet, table: Name, key: &[Name], output: &RowList -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stack = select_stack(chunk, table, key, input)?; - let children = find_children( - &stack, - input, - false - )?; + let children = find_children(&stack, input, false)?; output.extend(children); Ok(()) } - fn eval_parents( chunk: &dyn Chunk, input: &BTreeSet, table: Name, key: &[Name], output: &RowList -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stack = select_stack(chunk, table, key, input)?; let parents = find_parents(&stack, input)?; output.extend(parents); Ok(()) } - fn eval_foreign_children( chunk: &dyn Chunk, input: &BTreeSet, @@ -192,29 +168,16 @@ fn eval_foreign_children( output_table: Name, output_key: &[Name], output: &RowList -) -> anyhow::Result<()> -{ - let (stack, parents) = select_foreign_stack( - chunk, - input, - input_table, - input_key, - output_table, - output_key - )?; - - let children = find_children( - &stack, - &parents, - true - )?; +) -> anyhow::Result<()> { + let (stack, parents) = select_foreign_stack(chunk, input, input_table, input_key, output_table, output_key)?; + + let children = find_children(&stack, &parents, true)?; output.extend(children); Ok(()) } - fn eval_foreign_parents( chunk: &dyn Chunk, input: &BTreeSet, @@ -223,33 +186,19 @@ fn eval_foreign_parents( output_table: Name, output_key: &[Name], output: &RowList -) -> anyhow::Result<()> -{ - let (stack, children) = select_foreign_stack( - chunk, - input, - input_table, - input_key, - output_table, - output_key - )?; +) -> anyhow::Result<()> { + let (stack, children) = select_foreign_stack(chunk, input, input_table, input_key, output_table, output_key)?; let parents = find_parents(&stack, &children)?; output.extend(parents); Ok(()) } - -fn select_stack( - chunk: &dyn Chunk, - table: Name, - key: &[Name], - input: &BTreeSet -) -> anyhow::Result -{ +fn select_stack(chunk: &dyn Chunk, table: Name, key: &[Name], input: &BTreeSet) -> anyhow::Result { use sqd_polars::prelude::*; - let items = chunk.scan_table(table)? + let items = chunk + .scan_table(table)? .with_row_index(true) .with_columns(key.iter().copied()) .to_lazy_df()? @@ -261,77 +210,67 @@ fn select_stack( series }])?; - let group_key_columns: Vec<_> = key.iter().copied() - .map(col) - .take(key.len() - 1) - .collect(); + let group_key_columns: Vec<_> = key.iter().copied().map(col).take(key.len() - 1).collect(); let address_column = *key.last().unwrap(); - let groups = row_index.lazy().join( - items.clone().lazy(), - [col("row_index")], - [col("row_index")], - JoinArgs::new(JoinType::Inner) - ).select( - &group_key_columns - ).unique( - None, - UniqueKeepStrategy::Any - ).with_row_index( - "_group", - None - ); - - let items = groups.join( - items.lazy(), - &group_key_columns, - &group_key_columns, - JoinArgs::new(JoinType::Inner) - ).group_by( - ["_group"] - ).agg([ - col(address_column), - col("row_index") - ]).select([ - col(address_column).alias("address"), - col("row_index") - ]).collect()?; + let groups = row_index + .lazy() + .join( + items.clone().lazy(), + [col("row_index")], + [col("row_index")], + JoinArgs::new(JoinType::Inner) + ) + .select(&group_key_columns) + .unique(None, UniqueKeepStrategy::Any) + .with_row_index("_group", None); + + let items = groups + .join( + items.lazy(), + &group_key_columns, + &group_key_columns, + JoinArgs::new(JoinType::Inner) + ) + .group_by(["_group"]) + .agg([col(address_column), col("row_index")]) + .select([col(address_column).alias("address"), col("row_index")]) + .collect()?; Ok(Stack::from_df(&items)) } - fn select_foreign_stack( chunk: &dyn Chunk, input: &BTreeSet, input_table: Name, input_key: &[Name], output_table: Name, - output_key: &[Name], -) -> anyhow::Result<(Stack, BTreeSet)> -{ + output_key: &[Name] +) -> anyhow::Result<(Stack, BTreeSet)> { use sqd_polars::prelude::*; - let group_key_exp: Vec<_> = output_key.iter().copied() - .map(col) - .take(output_key.len() - 1) - .collect(); + let group_key_exp: Vec<_> = output_key.iter().copied().map(col).take(output_key.len() - 1).collect(); let address_column = *output_key.last().unwrap(); - let input = chunk.scan_table(input_table)? + let input = chunk + .scan_table(input_table)? .with_row_selection(RowRangeList::from_sorted_indexes(input.iter().copied())) .with_columns(input_key.iter().copied()) .to_lazy_df()? .select( - input_key.iter().zip(output_key.iter()) + input_key + .iter() + .zip(output_key.iter()) .map(|(i, o)| col(*i).alias(*o)) .collect::>() ) .collect()?; - let items = chunk.scan_table(output_table)? + let items = chunk + .scan_table(output_table)? .with_columns(output_key.iter().copied()) .with_row_index(true) .to_lazy_df()? @@ -345,7 +284,8 @@ fn select_foreign_stack( let output_key_exp = output_key.iter().copied().map(col).collect::>(); - let input_rows = input.lazy() + let input_rows = input + .lazy() .join( items.clone().lazy(), &output_key_exp, @@ -354,22 +294,13 @@ fn select_foreign_stack( ) .select([col("row_index")]) .collect() - .map(|df| { - BTreeSet::from_iter( - polars_series_to_row_index_iter(df.column("row_index").unwrap()) - ) - })?; + .map(|df| BTreeSet::from_iter(polars_series_to_row_index_iter(df.column("row_index").unwrap())))?; - let groups = items.lazy() + let groups = items + .lazy() .group_by(group_key_exp) - .agg([ - col(address_column), - col("row_index") - ]) - .select([ - col(address_column).alias("address"), - col("row_index") - ]) + .agg([col(address_column), col("row_index")]) + .select([col(address_column).alias("address"), col("row_index")]) .collect()?; let stack = Stack::from_df(&groups); @@ -377,25 +308,16 @@ fn select_foreign_stack( Ok((stack, input_rows)) } - struct Stack { address: ArrayRef, row_index: ArrayRef } - impl Stack { fn from_df(df: &sqd_polars::prelude::DataFrame) -> Self { - let address = polars_series_to_arrow_array( - df.column("address").unwrap() - ); - let row_index = polars_series_to_arrow_array( - df.column("row_index").unwrap() - ); - Self { - address, - row_index - } + let address = polars_series_to_arrow_array(df.column("address").unwrap()); + let row_index = polars_series_to_arrow_array(df.column("row_index").unwrap()); + Self { address, row_index } } fn get_address_type(&self) -> &DataType { @@ -407,55 +329,38 @@ impl Stack { } } - macro_rules! downcast_address_type { ($address_type:expr, $co:path) => { match $address_type { DataType::LargeList(it) if it.data_type() == &DataType::Int32 => { $co!(Int32Type) - }, + } DataType::LargeList(it) if it.data_type() == &DataType::UInt32 => { $co!(UInt32Type) - }, + } DataType::LargeList(it) if it.data_type() == &DataType::UInt16 => { $co!(UInt16Type) - }, - it => bail!( - schema_error!("invalid address type - {}", it) - ), + } + it => bail!(schema_error!("invalid address type - {}", it)) } }; } - -fn find_children( - stack: &Stack, - parents: &BTreeSet, - include_parent: bool -) -> anyhow::Result> -{ +fn find_children(stack: &Stack, parents: &BTreeSet, include_parent: bool) -> anyhow::Result> { macro_rules! find { ($address_type:ty) => { - find_children_impl::<$address_type, i64>( - &stack, - parents, - include_parent - ) + find_children_impl::<$address_type, i64>(&stack, parents, include_parent) }; } Ok(downcast_address_type!(stack.get_address_type(), find)) } - -fn find_children_impl( - stack: &Stack, - parents: &BTreeSet, - include_parent: bool -) -> Vec - where A: ArrowPrimitiveType, - A::Native: Eq + Ord, - O: OffsetSizeTrait +fn find_children_impl(stack: &Stack, parents: &BTreeSet, include_parent: bool) -> Vec +where + A: ArrowPrimitiveType, + A::Native: Eq + Ord, + O: OffsetSizeTrait { let n_groups = stack.address.len(); let mut children = Vec::with_capacity(stack.row_index.as_list::().values().len()); @@ -463,9 +368,7 @@ fn find_children_impl( let address = GenericListKey::, O>::from(stack.address.as_ref()); let row_index = PrimitiveGenericListKey::::from(stack.row_index.as_ref()); - let mut order = Vec::with_capacity( - (0..n_groups).map(|g| row_index.get(g).len()).max().unwrap_or(0) - ); + let mut order = Vec::with_capacity((0..n_groups).map(|g| row_index.get(g).len()).max().unwrap_or(0)); for g in 0..n_groups { let rows = row_index.get(g); @@ -500,12 +403,10 @@ fn find_children_impl( children } - fn is_parent_address(parent: &[I], child: &[I]) -> bool { parent.len() < child.len() && parent.eq(&child[0..parent.len()]) } - fn find_parents(stack: &Stack, children: &BTreeSet) -> anyhow::Result> { macro_rules! find { ($address_type:ty) => { @@ -516,14 +417,11 @@ fn find_parents(stack: &Stack, children: &BTreeSet) -> anyhow::Result< Ok(downcast_address_type!(stack.get_address_type(), find)) } - -fn find_parents_impl( - stack: &Stack, - children: &BTreeSet -) -> HashSet - where A: ArrowPrimitiveType, - A::Native: Eq + Ord, - O: OffsetSizeTrait +fn find_parents_impl(stack: &Stack, children: &BTreeSet) -> HashSet +where + A: ArrowPrimitiveType, + A::Native: Eq + Ord, + O: OffsetSizeTrait { let n_groups = stack.address.len(); let mut parents = HashSet::with_capacity(stack.row_index.as_list::().values().len()); @@ -531,9 +429,7 @@ fn find_parents_impl( let address = GenericListKey::, O>::from(stack.address.as_ref()); let row_index = PrimitiveGenericListKey::::from(stack.row_index.as_ref()); - let mut order = Vec::with_capacity( - (0..n_groups).map(|g| row_index.get(g).len()).max().unwrap_or(0) - ); + let mut order = Vec::with_capacity((0..n_groups).map(|g| row_index.get(g).len()).max().unwrap_or(0)); let mut s = Vec::with_capacity(50); @@ -554,19 +450,17 @@ fn find_parents_impl( let addr = addrs.get(order[i]); while let Some(top) = s.last().copied() { if is_parent_address(addrs.get(top), addr) { - break + break; } else { s.pop(); } } s.push(order[i]); if children.contains(&rows[order[i]]) { - parents.extend( - s.iter().map(|i| rows[*i]) - ); + parents.extend(s.iter().map(|i| rows[*i])); } } } parents -} \ No newline at end of file +} diff --git a/crates/query/src/plan/result.rs b/crates/query/src/plan/result.rs index e1199c9e..d93ac860 100644 --- a/crates/query/src/plan/result.rs +++ b/crates/query/src/plan/result.rs @@ -1,14 +1,23 @@ -use crate::json::encoder::util::{json_close, make_object_prop}; -use crate::json::encoder::{Encoder, EncoderObject}; -use crate::json::exp::Exp; -use crate::plan::sort::{compute_order, Position}; -use crate::primitives::{BlockNumber, Name}; +use std::fmt::{Display, Formatter}; + use anyhow::{anyhow, Context}; -use arrow::array::{AsArray, PrimitiveArray, RecordBatch, StructArray}; -use arrow::datatypes::{DataType, UInt64Type}; +use arrow::{ + array::{AsArray, PrimitiveArray, RecordBatch, StructArray}, + datatypes::{DataType, UInt64Type} +}; use sqd_primitives::BlockRef; -use std::fmt::{Display, Formatter}; +use crate::{ + json::{ + encoder::{ + util::{json_close, make_object_prop}, + Encoder, EncoderObject + }, + exp::Exp + }, + plan::sort::{compute_order, Position}, + primitives::{BlockNumber, Name} +}; pub(super) struct DataItem { prop: Vec, @@ -20,40 +29,39 @@ pub(super) struct DataItem { is_block_header: bool } - impl DataItem { - pub(super) fn new( - name: &str, - key: &[Name], - records: Vec, - exp: &Exp - ) -> anyhow::Result { + pub(super) fn new(name: &str, key: &[Name], records: Vec, exp: &Exp) -> anyhow::Result { let block_number_column = key[0]; - let block_numbers = records.iter().map(|b| -> anyhow::Result<_> { - let column = b.column_by_name(block_number_column).ok_or_else(|| { - anyhow!( - "key column '{}' is not present in '{}' output", - block_number_column, - name - ) - })?; + let block_numbers = records + .iter() + .map(|b| -> anyhow::Result<_> { + let column = b.column_by_name(block_number_column).ok_or_else(|| { + anyhow!( + "key column '{}' is not present in '{}' output", + block_number_column, + name + ) + })?; - let numbers = arrow::compute::cast(column, &DataType::UInt64).with_context(|| { - format!("failed to cast '{}' to block number", block_number_column) - })?; + let numbers = arrow::compute::cast(column, &DataType::UInt64) + .with_context(|| format!("failed to cast '{}' to block number", block_number_column))?; - Ok(numbers.as_primitive::().clone()) - }).collect::>>()?; + Ok(numbers.as_primitive::().clone()) + }) + .collect::>>()?; let order = compute_order(&records, key)?; let size = records.iter().map(|b| b.get_array_memory_size()).sum(); - let encoders = records.into_iter().map(|b| { - let struct_array = StructArray::from(b); - exp.eval(&struct_array) - }).collect::, _>>()?; + let encoders = records + .into_iter() + .map(|b| { + let struct_array = StructArray::from(b); + exp.eval(&struct_array) + }) + .collect::, _>>()?; Ok(Self { prop: make_object_prop(name), @@ -71,9 +79,7 @@ impl DataItem { } fn get_block_at(&self, idx: usize) -> Option { - self.order.get(idx).map(|pos| { - self.block_numbers[pos.0].value(pos.1) - }) + self.order.get(idx).map(|pos| self.block_numbers[pos.0].value(pos.1)) } fn write_header(&mut self, out: &mut Vec) { @@ -85,9 +91,10 @@ impl DataItem { } fn write_items(&mut self, block_number: BlockNumber, out: &mut Vec) { - let has_items = self.order.get(self.pos).map_or(false, |pos| { - self.block_numbers[pos.0].value(pos.1) == block_number - }); + let has_items = self + .order + .get(self.pos) + .map_or(false, |pos| self.block_numbers[pos.0].value(pos.1) == block_number); if !has_items { return; @@ -98,7 +105,7 @@ impl DataItem { while self.pos < self.order.len() { let pos = self.order[self.pos]; if self.block_numbers[pos.0].value(pos.1) != block_number { - break + break; } self.pos += 1; self.encoders[pos.0].encode(pos.1, out); @@ -110,18 +117,14 @@ impl DataItem { } } - pub struct BlockWriter { items: Vec } - impl BlockWriter { pub(super) fn new(items: Vec) -> Self { assert!(items.len() > 0 && items[0].is_block_header); - Self { - items - } + Self { items } } pub fn data_size(&self) -> usize { @@ -160,23 +163,19 @@ impl BlockWriter { } } - #[derive(Clone, Debug)] pub struct UnexpectedBaseBlock { pub prev_blocks: Vec, pub expected_hash: String } - impl Display for UnexpectedBaseBlock { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { if let Some(last_block) = self.prev_blocks.last() { write!( f, "unexpected base block: expected {}, but got {}#{}", - self.expected_hash, - last_block.number, - last_block.hash + self.expected_hash, last_block.number, last_block.hash ) } else { write!( @@ -188,5 +187,4 @@ impl Display for UnexpectedBaseBlock { } } - -impl std::error::Error for UnexpectedBaseBlock {} \ No newline at end of file +impl std::error::Error for UnexpectedBaseBlock {} diff --git a/crates/query/src/plan/row_list.rs b/crates/query/src/plan/row_list.rs index 02456b79..a252315c 100644 --- a/crates/query/src/plan/row_list.rs +++ b/crates/query/src/plan/row_list.rs @@ -1,16 +1,14 @@ use std::collections::BTreeSet; use arrow::array::{Array, AsArray, PrimitiveArray, RecordBatch}; - -use crate::primitives::{RowIndex, RowIndexArrowType}; use sqd_polars::arrow::polars_series_to_row_index_iter; +use crate::primitives::{RowIndex, RowIndexArrowType}; pub struct RowList { row_indexes: parking_lot::Mutex> } - impl RowList { pub fn new() -> Self { Self { @@ -19,14 +17,16 @@ impl RowList { } pub fn extend(&self, rows: I) - where I: IntoIterator + where + I: IntoIterator { self.row_indexes.lock().extend(rows) } pub fn extend_from_record_batch_vec(&self, batches: &Vec) { let row_index_iter = batches.iter().flat_map(|b| { - let array: &PrimitiveArray = b.column_by_name("row_index") + let array: &PrimitiveArray = b + .column_by_name("row_index") .expect("No row_index column in the batch") .as_primitive(); assert_eq!(array.null_count(), 0); @@ -44,4 +44,4 @@ impl RowList { pub fn into_inner(self) -> BTreeSet { self.row_indexes.into_inner() } -} \ No newline at end of file +} diff --git a/crates/query/src/plan/sort.rs b/crates/query/src/plan/sort.rs index 0e2d3e54..c2fbfa49 100644 --- a/crates/query/src/plan/sort.rs +++ b/crates/query/src/plan/sort.rs @@ -1,27 +1,23 @@ use std::cmp::Ordering; -use arrow::array::{Array, AsArray, RecordBatch}; -use arrow::datatypes::{DataType, Int32Type, Int64Type, Schema, UInt16Type, UInt32Type, UInt64Type}; - -use crate::plan::key::{Key, PrimitiveKey, PrimitiveListKey}; -use crate::primitives::{Name, schema_error, SchemaError}; +use arrow::{ + array::{Array, AsArray, RecordBatch}, + datatypes::{DataType, Int32Type, Int64Type, Schema, UInt16Type, UInt32Type, UInt64Type} +}; +use crate::{ + plan::key::{Key, PrimitiveKey, PrimitiveListKey}, + primitives::{schema_error, Name, SchemaError} +}; pub type Position = (usize, usize); - -pub fn compute_order( - record_batches: &Vec, - key: &[Name] -) -> Result, SchemaError> -{ +pub fn compute_order(record_batches: &Vec, key: &[Name]) -> Result, SchemaError> { if record_batches.len() == 0 { - return Ok(vec![]) + return Ok(vec![]); } - let mut positions = Vec::with_capacity( - record_batches.iter().map(|b| b.num_rows()).sum() - ); + let mut positions = Vec::with_capacity(record_batches.iter().map(|b| b.num_rows()).sum()); for (idx, batch) in record_batches.iter().enumerate() { positions.extend((0..batch.num_rows()).map(|pos| (idx, pos))) @@ -34,69 +30,55 @@ pub fn compute_order( [data_type] => { let col = get_column(record_batches, key.indexes[0], data_type)?; positions.sort_unstable_by(|a, b| col.compare_positions(*a, *b)) - }, - [DataType::Int32, DataType::Int32] => { - sort_by_2::, PrimitiveKey>( - &mut positions, - record_batches, - key.indexes[0], - key.indexes[1] - ) - }, + } + [DataType::Int32, DataType::Int32] => sort_by_2::, PrimitiveKey>( + &mut positions, + record_batches, + key.indexes[0], + key.indexes[1] + ), [DataType::Int32, DataType::Int32, DataType::List(field)] if field.data_type() == &DataType::Int32 => { - sort_by_3::< - PrimitiveKey, - PrimitiveKey, - PrimitiveListKey - >( + sort_by_3::, PrimitiveKey, PrimitiveListKey>( &mut positions, record_batches, key.indexes[0], key.indexes[1], key.indexes[2] ) - }, + } [DataType::Int32, DataType::Int32, DataType::List(field)] if field.data_type() == &DataType::UInt32 => { - sort_by_3::< - PrimitiveKey, - PrimitiveKey, - PrimitiveListKey - >( + sort_by_3::, PrimitiveKey, PrimitiveListKey>( &mut positions, record_batches, key.indexes[0], key.indexes[1], key.indexes[2] ) - }, + } _ => { let column_list = ColumnList::new(record_batches, &key)?; - positions.sort_unstable_by(|a, b| { - column_list.compare_positions(*a, *b) - }); + positions.sort_unstable_by(|a, b| column_list.compare_positions(*a, *b)); } } Ok(positions) } - struct TableKey { names: Vec, indexes: Vec, types: Vec } - impl TableKey { fn new(schema: &Schema, key: &[Name]) -> Result { let mut indexes = Vec::with_capacity(key.len()); let mut types = Vec::with_capacity(key.len()); for &name in key.iter() { - let (idx, field) = schema.column_with_name(name).ok_or_else(|| { - schema_error!("key column `{}` is not found in result", name) - })?; + let (idx, field) = schema + .column_with_name(name) + .ok_or_else(|| schema_error!("key column `{}` is not found in result", name))?; indexes.push(idx); types.push(field.data_type().clone()); } @@ -113,43 +95,47 @@ impl TableKey { } } - #[inline(never)] -fn sort_by_1<'a, K>( - positions: &mut Vec, - record_batches: &'a Vec, - key_idx: usize -) where K: Cmp, - K: From<&'a dyn Array> +fn sort_by_1<'a, K>(positions: &mut Vec, record_batches: &'a Vec, key_idx: usize) +where + K: Cmp, + K: From<&'a dyn Array> { - let comparators: Vec = record_batches.iter().map(|b| { - let array = b.column(key_idx).as_ref(); - K::from(array) - }).collect(); + let comparators: Vec = record_batches + .iter() + .map(|b| { + let array = b.column(key_idx).as_ref(); + K::from(array) + }) + .collect(); sort_positions(positions, &comparators) } - #[inline(never)] fn sort_by_2<'a, K1, K2>( positions: &mut Vec<(usize, usize)>, record_batches: &'a Vec, key1_idx: usize, key2_idx: usize -) where K1: Cmp, K1: From<&'a dyn Array>, - K2: Cmp, K2: From<&'a dyn Array> +) where + K1: Cmp, + K1: From<&'a dyn Array>, + K2: Cmp, + K2: From<&'a dyn Array> { - let comparators: Vec<_> = record_batches.iter().map(|b| { - let array1 = b.column(key1_idx).as_ref(); - let array2 = b.column(key2_idx).as_ref(); - let c1 = K1::from(array1); - let c2 = K2::from(array2); - Cons(c1, c2) - }).collect(); + let comparators: Vec<_> = record_batches + .iter() + .map(|b| { + let array1 = b.column(key1_idx).as_ref(); + let array2 = b.column(key2_idx).as_ref(); + let c1 = K1::from(array1); + let c2 = K2::from(array2); + Cons(c1, c2) + }) + .collect(); sort_positions(positions, &comparators) } - #[inline(never)] fn sort_by_3<'a, K1, K2, K3>( positions: &mut Vec<(usize, usize)>, @@ -157,27 +143,30 @@ fn sort_by_3<'a, K1, K2, K3>( key1_idx: usize, key2_idx: usize, key3_idx: usize -) where K1: Cmp, K1: From<&'a dyn Array>, - K2: Cmp, K2: From<&'a dyn Array>, - K3: Cmp, K3: From<&'a dyn Array>, +) where + K1: Cmp, + K1: From<&'a dyn Array>, + K2: Cmp, + K2: From<&'a dyn Array>, + K3: Cmp, + K3: From<&'a dyn Array> { - let comparators: Vec<_> = record_batches.iter().map(|b| { - let array1 = b.column(key1_idx).as_ref(); - let array2 = b.column(key2_idx).as_ref(); - let array3 = b.column(key3_idx).as_ref(); - let c1 = K1::from(array1); - let c2 = K2::from(array2); - let c3 = K3::from(array3); - Cons(c1, Cons(c2, c3)) - }).collect(); + let comparators: Vec<_> = record_batches + .iter() + .map(|b| { + let array1 = b.column(key1_idx).as_ref(); + let array2 = b.column(key2_idx).as_ref(); + let array3 = b.column(key3_idx).as_ref(); + let c1 = K1::from(array1); + let c2 = K2::from(array2); + let c3 = K3::from(array3); + Cons(c1, Cons(c2, c3)) + }) + .collect(); sort_positions(positions, &comparators) } - -fn sort_positions( - positions: &mut Vec<(usize, usize)>, - comparators: &Vec -) { +fn sort_positions(positions: &mut Vec<(usize, usize)>, comparators: &Vec) { positions.sort_unstable_by(|a, b| { let ca = &comparators[a.0]; let cb = &comparators[b.0]; @@ -185,14 +174,14 @@ fn sort_positions( }) } - trait Cmp { fn compare(&self, idx: usize, other: &Self, other_idx: usize) -> Ordering; } - -impl Cmp for T where T: Key, - T::Item: Ord +impl Cmp for T +where + T: Key, + T::Item: Ord { #[inline] fn compare(&self, idx: usize, other: &Self, other_idx: usize) -> Ordering { @@ -200,31 +189,34 @@ impl Cmp for T where T: Key, } } - struct Cons(H, T); - -impl Cmp for Cons where H: Cmp, T: Cmp { +impl Cmp for Cons +where + H: Cmp, + T: Cmp +{ fn compare(&self, idx: usize, other: &Self, other_idx: usize) -> Ordering { - match self.0.compare(idx, &other.0, other_idx) { + match self.0.compare(idx, &other.0, other_idx) { Ordering::Equal => self.1.compare(idx, &other.1, other_idx), ord => ord } } } - pub trait PosCmp { fn compare_positions(&self, a: Position, b: Position) -> Ordering; } - struct Column { batches: Vec } - -impl PosCmp for Column where K: Key, K::Item: Ord { +impl PosCmp for Column +where + K: Key, + K::Item: Ord +{ fn compare_positions(&self, a: Position, b: Position) -> Ordering { let a_batch = &self.batches[a.0]; let b_batch = &self.batches[b.0]; @@ -232,17 +224,15 @@ impl PosCmp for Column where K: Key, K::Item: Ord { } } - struct ColumnList { columns: Box<[Box]> } - impl PosCmp for ColumnList { fn compare_positions(&self, a: Position, b: Position) -> Ordering { for col in self.columns.iter() { match col.compare_positions(a, b) { - Ordering::Equal => {}, + Ordering::Equal => {} ord => return ord } } @@ -250,12 +240,15 @@ impl PosCmp for ColumnList { } } - impl ColumnList { fn new(record_batches: &Vec, key: &TableKey) -> Result { - let columns = key.indexes.iter().cloned().enumerate().map(|(i, kix)| { - get_column(record_batches, kix, &key.types[i]).map_err(|err| err.at(key.names[i])) - }).collect::, _>>()?; + let columns = key + .indexes + .iter() + .cloned() + .enumerate() + .map(|(i, kix)| get_column(record_batches, kix, &key.types[i]).map_err(|err| err.at(key.names[i]))) + .collect::, _>>()?; Ok(Self { columns: columns.into_boxed_slice() @@ -263,7 +256,6 @@ impl ColumnList { } } - fn get_column( record_batches: &Vec, idx: usize, @@ -271,15 +263,16 @@ fn get_column( ) -> Result, SchemaError> { macro_rules! make { ($array: ident, $exp:expr) => {{ - let batches = record_batches.iter().map(|batch| { - let array = batch.column(idx); - let $array = array.as_ref(); - $exp - }).collect::>(); - - Ok(Box::new(Column { - batches - })) + let batches = record_batches + .iter() + .map(|batch| { + let array = batch.column(idx); + let $array = array.as_ref(); + $exp + }) + .collect::>(); + + Ok(Box::new(Column { batches })) }}; } match data_type { @@ -288,32 +281,25 @@ fn get_column( DataType::UInt64 => make!(array, PrimitiveKey::::from(array)), DataType::Int32 => make!(array, PrimitiveKey::::from(array)), DataType::Int64 => make!(array, PrimitiveKey::::from(array)), - DataType::Utf8 => make!(array, array.as_string::().clone()), + DataType::Utf8 => make!(array, array.as_string::().clone()), DataType::List(field) if field.data_type() == &DataType::Int32 => { make!(array, PrimitiveListKey::::from(array)) - }, + } DataType::List(field) if field.data_type() == &DataType::UInt32 => { make!(array, PrimitiveListKey::::from(array)) - }, + } DataType::List(field) if field.data_type() == &DataType::UInt16 => { make!(array, PrimitiveListKey::::from(array)) - }, - _ => Err( - schema_error!("unsupported key column type - {}", data_type) - ) + } + _ => Err(schema_error!("unsupported key column type - {}", data_type)) } } - -pub fn make_pos_comparator( - record_batches: &Vec, - key: &[Name] -) -> Result -{ +pub fn make_pos_comparator(record_batches: &Vec, key: &[Name]) -> Result { if record_batches.len() == 0 { - return Err(schema_error!("got empty vector of record batches")) + return Err(schema_error!("got empty vector of record batches")); } let schema = record_batches[0].schema(); let key = TableKey::new(&schema, key)?; ColumnList::new(record_batches, &key) -} \ No newline at end of file +} diff --git a/crates/query/src/plan/table.rs b/crates/query/src/plan/table.rs index 9dd6238d..a7f7a464 100644 --- a/crates/query/src/plan/table.rs +++ b/crates/query/src/plan/table.rs @@ -1,18 +1,16 @@ use std::collections::{HashMap, HashSet}; -use crate::primitives::{Name, RowWeight}; +use crate::primitives::{Name, RowWeight}; pub enum ColumnWeight { Fixed(RowWeight), Stored(Name) } - pub enum ColumnDefault { Null } - pub struct Table { pub name: Name, pub primary_key: Vec, @@ -22,7 +20,6 @@ pub struct Table { pub children: HashMap> } - impl Table { pub fn set_weight(&mut self, column: Name, weight: RowWeight) -> &mut Self { self.column_weights.insert(column, ColumnWeight::Fixed(weight)); @@ -52,7 +49,8 @@ impl Table { } pub fn default_null_columns(&self) -> HashSet { - self.column_defaults.iter() + self.column_defaults + .iter() .filter_map(|(name, default)| match default { ColumnDefault::Null => Some(*name) }) @@ -66,17 +64,13 @@ impl Table { } } - pub struct TableSet { tables: Vec } - impl TableSet { pub fn new() -> TableSet { - TableSet { - tables: Vec::new() - } + TableSet { tables: Vec::new() } } pub fn get(&self, name: Name) -> &Table { @@ -86,7 +80,7 @@ impl TableSet { pub fn get_index(&self, name: Name) -> usize { for (idx, table) in self.tables.iter().enumerate() { if table.name == name { - return idx + return idx; } } panic!("table {} is not defined", name) @@ -99,12 +93,10 @@ impl TableSet { pub fn add_table(&mut self, name: Name, pk: Vec) -> &mut Table { assert!( !self.tables.iter().any(|t| t.name == name), - "table '{}' was already defined", name - ); - assert!( - pk.len() > 0, - "primary key of all tables must start with a block number" + "table '{}' was already defined", + name ); + assert!(pk.len() > 0, "primary key of all tables must start with a block number"); assert!( self.tables.len() > 0 || pk.len() == 1, "block header table must be defined first" @@ -121,4 +113,4 @@ impl TableSet { self.tables.last_mut().unwrap() } -} \ No newline at end of file +} diff --git a/crates/query/src/primitives.rs b/crates/query/src/primitives.rs index 4c385603..28c0f9c6 100644 --- a/crates/query/src/primitives.rs +++ b/crates/query/src/primitives.rs @@ -1,6 +1,7 @@ -use std::error::Error; -use std::fmt::{Display, Formatter}; - +use std::{ + error::Error, + fmt::{Display, Formatter} +}; pub type Name = sqd_primitives::Name; @@ -16,14 +17,12 @@ pub type RowWeight = u64; pub type RowWeightPolarsType = sqd_polars::prelude::UInt64Type; - #[derive(Debug)] pub struct SchemaError { pub path: Vec, pub message: String } - impl SchemaError { pub fn new(message: S) -> Self { Self { @@ -38,7 +37,6 @@ impl SchemaError { } } - impl Display for SchemaError { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { if self.path.len() > 0 { @@ -51,13 +49,11 @@ impl Display for SchemaError { } } - impl Error for SchemaError {} - macro_rules! schema_error { ($($arg:tt)*) => { SchemaError::new(format!($($arg)*)) }; } -pub(crate) use schema_error; \ No newline at end of file +pub(crate) use schema_error; diff --git a/crates/query/src/query/bitcoin.rs b/crates/query/src/query/bitcoin.rs index 80a95e36..33954ac2 100644 --- a/crates/query/src/query/bitcoin.rs +++ b/crates/query/src/query/bitcoin.rs @@ -1,42 +1,34 @@ -use super::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, PredicateBuilder}; -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{Plan, ScanBuilder, TableSet}; -use crate::primitives::BlockNumber; -use serde::{Deserialize, Serialize}; use std::sync::LazyLock; +use serde::{Deserialize, Serialize}; + +use super::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + PredicateBuilder +}; +use crate::{ + json::{exp::Exp, lang::*}, + plan::{Plan, ScanBuilder, TableSet}, + primitives::BlockNumber +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]); + tables.add_table("blocks", vec!["number"]); - tables.add_table("transactions", vec![ - "block_number", - "transaction_index" - ]) - .add_child("inputs", vec!["block_number", "transaction_index"]) - .add_child("outputs", vec!["block_number", "transaction_index"]); + tables + .add_table("transactions", vec!["block_number", "transaction_index"]) + .add_child("inputs", vec!["block_number", "transaction_index"]) + .add_child("outputs", vec!["block_number", "transaction_index"]); - tables.add_table("inputs", vec![ - "block_number", - "transaction_index", - "input_index" - ]); + tables.add_table("inputs", vec!["block_number", "transaction_index", "input_index"]); - tables.add_table("outputs", vec![ - "block_number", - "transaction_index", - "output_index" - ]); + tables.add_table("outputs", vec!["block_number", "transaction_index", "output_index"]); tables }); - field_selection! { block: BlockFieldSelection, transaction: TransactionFieldSelection, @@ -44,7 +36,6 @@ field_selection! { output: OutputFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -83,7 +74,6 @@ item_field_selection! { }} } - item_field_selection! { TransactionFieldSelection { transaction_index, @@ -110,7 +100,6 @@ item_field_selection! { }} } - item_field_selection! { InputFieldSelection { transaction_index, @@ -157,7 +146,6 @@ item_field_selection! { } } - item_field_selection! { OutputFieldSelection { transaction_index, @@ -182,10 +170,8 @@ item_field_selection! { }} } - type Bytes = String; - request! { pub struct TransactionRequest { pub inputs: bool, @@ -193,30 +179,27 @@ request! { } } - impl TransactionRequest { - fn predicate(&self, _p: &mut PredicateBuilder) { - } + fn predicate(&self, _p: &mut PredicateBuilder) {} fn relations(&self, scan: &mut ScanBuilder) { if self.inputs { scan.join( "inputs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.outputs { scan.join( "outputs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct InputRequest { pub r#type: Option>, @@ -229,12 +212,17 @@ request! { } } - impl InputRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("type", self.r#type.as_deref()); - p.col_in_list("prevout_script_pub_key_address", self.prevout_script_pub_key_address.as_deref()); - p.col_in_list("prevout_script_pub_key_type", self.prevout_script_pub_key_type.as_deref()); + p.col_in_list( + "prevout_script_pub_key_address", + self.prevout_script_pub_key_address.as_deref() + ); + p.col_in_list( + "prevout_script_pub_key_type", + self.prevout_script_pub_key_type.as_deref() + ); p.col_eq("prevout_generated", self.prevout_generated); } @@ -243,27 +231,26 @@ impl InputRequest { scan.join( "transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_inputs { scan.join( "inputs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_outputs { scan.join( "outputs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct OutputRequest { pub script_pub_key_address: Option>, @@ -274,7 +261,6 @@ request! { } } - impl OutputRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("script_pub_key_address", self.script_pub_key_address.as_deref()); @@ -286,27 +272,26 @@ impl OutputRequest { scan.join( "transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_inputs { scan.join( "inputs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_outputs { scan.join( "outputs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct BitcoinQuery { pub from_block: BlockNumber, @@ -320,7 +305,6 @@ request! { } } - impl BitcoinQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/eth.rs b/crates/query/src/query/eth.rs index 62c75f2b..b67d9e88 100644 --- a/crates/query/src/query/eth.rs +++ b/crates/query/src/query/eth.rs @@ -1,64 +1,58 @@ -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{ScanBuilder, TableSet}; -use crate::query::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, PredicateBuilder}; -use crate::{BlockNumber, Plan}; -use serde::{Deserialize, Serialize}; use std::sync::LazyLock; -use super::util::to_lowercase_list; +use serde::{Deserialize, Serialize}; +use super::util::to_lowercase_list; +use crate::{ + json::{exp::Exp, lang::*}, + plan::{ScanBuilder, TableSet}, + query::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + PredicateBuilder + }, + BlockNumber, Plan +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]) - .set_weight("logs_bloom", 512) - .set_weight_column("extra_data", "extra_data_size") - .set_weight_column("withdrawals", "withdrawals_size"); - - tables.add_table("transactions", vec![ - "block_number", - "transaction_index" - ]) - .add_child("logs", vec!["block_number", "transaction_index"]) - .add_child("traces", vec!["block_number", "transaction_index"]) - .add_child("statediffs", vec!["block_number", "transaction_index"]) - .set_weight("logs_bloom", 512) - .set_weight_column("input", "input_size") - .set_weight_column("access_list", "access_list_size"); - - tables.add_table("logs", vec![ - "block_number", - "log_index" - ]) - .set_weight_column("data", "data_size"); - - tables.add_table("traces", vec![ - "block_number", - "transaction_index", - "trace_address" - ]) - .set_weight_column("create_init", "create_init_size") - .set_weight_column("create_result_code", "create_result_code_size") - .set_weight_column("call_input", "call_input_size") - .set_weight_column("call_result_output", "call_result_output_size"); - - tables.add_table("statediffs", vec![ - "block_number", - "transaction_index", - "address", - "key" - ]) - .set_weight_column("prev", "prev_size") - .set_weight_column("next", "next_size") - .set_result_item_name("stateDiffs"); + tables + .add_table("blocks", vec!["number"]) + .set_weight("logs_bloom", 512) + .set_weight_column("extra_data", "extra_data_size") + .set_weight_column("withdrawals", "withdrawals_size"); tables -}); + .add_table("transactions", vec!["block_number", "transaction_index"]) + .add_child("logs", vec!["block_number", "transaction_index"]) + .add_child("traces", vec!["block_number", "transaction_index"]) + .add_child("statediffs", vec!["block_number", "transaction_index"]) + .set_weight("logs_bloom", 512) + .set_weight_column("input", "input_size") + .set_weight_column("access_list", "access_list_size"); + + tables + .add_table("logs", vec!["block_number", "log_index"]) + .set_weight_column("data", "data_size"); + tables + .add_table("traces", vec!["block_number", "transaction_index", "trace_address"]) + .set_weight_column("create_init", "create_init_size") + .set_weight_column("create_result_code", "create_result_code_size") + .set_weight_column("call_input", "call_input_size") + .set_weight_column("call_result_output", "call_result_output_size"); + + tables + .add_table( + "statediffs", + vec!["block_number", "transaction_index", "address", "key"] + ) + .set_weight_column("prev", "prev_size") + .set_weight_column("next", "next_size") + .set_result_item_name("stateDiffs"); + + tables +}); field_selection! { block: BlockFieldSelection, @@ -68,7 +62,6 @@ field_selection! { state_diff: StateDiffFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -137,7 +130,6 @@ item_field_selection! { }} } - item_field_selection! { TransactionFieldSelection { transaction_index, @@ -261,7 +253,6 @@ item_field_selection! { } } - item_field_selection! { LogFieldSelection { log_index, @@ -291,7 +282,6 @@ item_field_selection! { }} } - item_field_selection! { TraceFieldSelection { transaction_index, @@ -393,7 +383,7 @@ item_field_selection! { call_action.add("type", prop("call_type", Exp::Value)); } if this.call_call_type { - call_action.add("callType", prop("call_type", Exp::Value)); + call_action.add("callType", prop("call_type", Exp::Value)); } if !call_action.is_empty() { call.add("action", call_action); @@ -452,7 +442,6 @@ item_field_selection! { } } - item_field_selection! { StateDiffFieldSelection { transaction_index, @@ -473,10 +462,8 @@ item_field_selection! { }} } - type Bytes = String; - request! { pub struct TransactionRequest { pub from: Option>, @@ -490,7 +477,6 @@ request! { } } - impl TransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("from", to_lowercase_list(&self.from)); @@ -525,7 +511,6 @@ impl TransactionRequest { } } - request! { pub struct LogRequest { pub address: Option>, @@ -540,7 +525,6 @@ request! { } } - impl LogRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("address", to_lowercase_list(&self.address)); @@ -582,7 +566,6 @@ impl LogRequest { } } - request! { pub struct TraceRequest { pub r#type: Option>, @@ -601,7 +584,6 @@ request! { } } - impl TraceRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("type", self.r#type.as_deref()); @@ -611,7 +593,10 @@ impl TraceRequest { p.col_in_list("call_to", to_lowercase_list(&self.call_to)); p.col_in_list("call_sighash", to_lowercase_list(&self.call_sighash)); p.col_in_list("suicide_address", to_lowercase_list(&self.suicide_address)); - p.col_in_list("suicide_refund_address", to_lowercase_list(&self.suicide_refund_address)); + p.col_in_list( + "suicide_refund_address", + to_lowercase_list(&self.suicide_refund_address) + ); p.col_in_list("reward_author", to_lowercase_list(&self.reward_author)); } @@ -639,7 +624,6 @@ impl TraceRequest { } } - request! { pub struct StateDiffRequest { pub address: Option>, @@ -649,7 +633,6 @@ request! { } } - impl StateDiffRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("address", to_lowercase_list(&self.address)); @@ -668,7 +651,6 @@ impl StateDiffRequest { } } - request! { pub struct EthQuery { pub from_block: BlockNumber, @@ -684,7 +666,6 @@ request! { } } - impl EthQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/fuel.rs b/crates/query/src/query/fuel.rs index 4c9cb950..857f1e92 100644 --- a/crates/query/src/query/fuel.rs +++ b/crates/query/src/query/fuel.rs @@ -1,55 +1,45 @@ -use super::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, PredicateBuilder}; -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{Plan, ScanBuilder, TableSet}; -use crate::primitives::BlockNumber; -use serde::{Deserialize, Serialize}; use std::sync::LazyLock; +use serde::{Deserialize, Serialize}; + +use super::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + PredicateBuilder +}; +use crate::{ + json::{exp::Exp, lang::*}, + plan::{Plan, ScanBuilder, TableSet}, + primitives::BlockNumber +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]); - - tables.add_table("transactions", vec![ - "block_number", - "index" - ]) - .set_weight_column("input_asset_ids", "input_asset_ids_size") - .set_weight_column("input_contracts", "input_contracts_size") - .set_weight_column("witnesses", "witnesses_size") - .set_weight_column("storage_slots", "storage_slots_size") - .set_weight_column("proof_set", "proof_set_size") - .set_weight_column("script_data", "script_data_size") - .set_weight_column("raw_payload", "raw_payload_size"); - - tables.add_table("receipts", vec![ - "block_number", - "transaction_index", - "index" - ]) - .set_weight_column("data", "data_size"); - - tables.add_table("inputs", vec![ - "block_number", - "transaction_index", - "index" - ]) - .set_weight_column("coin_predicate", "coin_predicate_size") - .set_weight_column("message_predicate", "message_predicate_size"); - - tables.add_table("outputs", vec![ - "block_number", - "transaction_index", - "index" - ]); + tables.add_table("blocks", vec!["number"]); tables -}); + .add_table("transactions", vec!["block_number", "index"]) + .set_weight_column("input_asset_ids", "input_asset_ids_size") + .set_weight_column("input_contracts", "input_contracts_size") + .set_weight_column("witnesses", "witnesses_size") + .set_weight_column("storage_slots", "storage_slots_size") + .set_weight_column("proof_set", "proof_set_size") + .set_weight_column("script_data", "script_data_size") + .set_weight_column("raw_payload", "raw_payload_size"); + + tables + .add_table("receipts", vec!["block_number", "transaction_index", "index"]) + .set_weight_column("data", "data_size"); + + tables + .add_table("inputs", vec!["block_number", "transaction_index", "index"]) + .set_weight_column("coin_predicate", "coin_predicate_size") + .set_weight_column("message_predicate", "message_predicate_size"); + tables.add_table("outputs", vec!["block_number", "transaction_index", "index"]); + + tables +}); field_selection! { block: BlockFieldSelection, @@ -59,7 +49,6 @@ field_selection! { output: OutputFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -94,7 +83,6 @@ item_field_selection! { }} } - item_field_selection! { TransactionFieldSelection { index, @@ -187,7 +175,6 @@ item_field_selection! { }} } - item_field_selection! { ReceiptFieldSelection { index, @@ -256,7 +243,6 @@ item_field_selection! { }} } - item_field_selection! { InputFieldSelection { transaction_index, @@ -380,7 +366,6 @@ item_field_selection! { } } - item_field_selection! { OutputFieldSelection { transaction_index, @@ -474,10 +459,8 @@ item_field_selection! { } } - type Bytes = String; - request! { pub struct ReceiptRequest { pub r#type: Option>, @@ -486,7 +469,6 @@ request! { } } - impl ReceiptRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("receipt_type", self.r#type.clone()); @@ -498,13 +480,12 @@ impl ReceiptRequest { scan.join( "transactions", vec!["block_number", "index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct TransactionRequest { pub r#type: Option>, @@ -514,7 +495,6 @@ request! { } } - impl TransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("type", self.r#type.as_deref()); @@ -545,7 +525,6 @@ impl TransactionRequest { } } - request! { pub struct InputRequest { pub r#type: Option>, @@ -558,7 +537,6 @@ request! { } } - impl InputRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("type", self.r#type.as_deref()); @@ -580,7 +558,6 @@ impl InputRequest { } } - request! { pub struct OutputRequest { pub r#type: Option>, @@ -588,7 +565,6 @@ request! { } } - impl OutputRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("type", self.r#type.as_deref()); @@ -605,7 +581,6 @@ impl OutputRequest { } } - request! { pub struct FuelQuery { pub from_block: BlockNumber, @@ -620,7 +595,6 @@ request! { } } - impl FuelQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/hyperliquid_fills.rs b/crates/query/src/query/hyperliquid_fills.rs index 0e1c2dc3..b89e5ad1 100644 --- a/crates/query/src/query/hyperliquid_fills.rs +++ b/crates/query/src/query/hyperliquid_fills.rs @@ -1,34 +1,32 @@ -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{ScanBuilder, TableSet}; -use crate::query::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, PredicateBuilder}; -use crate::{BlockNumber, Plan}; -use serde::{Deserialize, Serialize}; use std::sync::LazyLock; +use serde::{Deserialize, Serialize}; + +use crate::{ + json::{exp::Exp, lang::*}, + plan::{ScanBuilder, TableSet}, + query::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + PredicateBuilder + }, + BlockNumber, Plan +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]); + tables.add_table("blocks", vec!["number"]); - tables.add_table("fills", vec![ - "block_number", - "fill_index" - ]); + tables.add_table("fills", vec!["block_number", "fill_index"]); tables }); - field_selection! { block: BlockFieldSelection, fill: FillFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -45,7 +43,6 @@ item_field_selection! { }} } - item_field_selection! { FillFieldSelection { fill_index, @@ -94,10 +91,8 @@ item_field_selection! { }} } - type Bytes = String; - request! { pub struct FillRequest { pub user: Option>, @@ -109,7 +104,6 @@ request! { } } - impl FillRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("user", self.user.as_deref()); @@ -120,10 +114,9 @@ impl FillRequest { p.col_in_list("builder", self.builder.as_deref()); } - fn relations(&self, _scan: &mut ScanBuilder) { } + fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct HyperliquidFillsQuery { pub from_block: BlockNumber, @@ -135,7 +128,6 @@ request! { } } - impl HyperliquidFillsQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/hyperliquid_replica_cmds.rs b/crates/query/src/query/hyperliquid_replica_cmds.rs index 4165a3ba..560863f2 100644 --- a/crates/query/src/query/hyperliquid_replica_cmds.rs +++ b/crates/query/src/query/hyperliquid_replica_cmds.rs @@ -1,37 +1,36 @@ -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{ScanBuilder, TableSet}; -use crate::query::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, PredicateBuilder}; -use crate::{BlockNumber, Plan}; +use std::sync::LazyLock; + use arrow::datatypes::UInt32Type; use serde::{Deserialize, Serialize}; -use std::sync::LazyLock; +use crate::{ + json::{exp::Exp, lang::*}, + plan::{ScanBuilder, TableSet}, + query::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + PredicateBuilder + }, + BlockNumber, Plan +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]); + tables.add_table("blocks", vec!["number"]); - tables.add_table("actions", vec![ - "block_number", - "action_index" - ]) - .set_weight_column("action", "action_size") - .set_weight_column("response", "response_size"); + tables + .add_table("actions", vec!["block_number", "action_index"]) + .set_weight_column("action", "action_size") + .set_weight_column("response", "response_size"); tables }); - field_selection! { block: BlockFieldSelection, action: ActionFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -56,7 +55,6 @@ item_field_selection! { }} } - item_field_selection! { ActionFieldSelection { action_index, @@ -81,19 +79,16 @@ item_field_selection! { }} } - type Bytes = String; type AssetIndex = u32; - #[derive(Deserialize, Serialize, Clone, Debug, Eq, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Status { Ok, - Err, + Err } - request! { pub struct ActionRequest { pub action_type: Option>, @@ -103,22 +98,23 @@ request! { } } - impl ActionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("action_type", self.action_type.as_deref()); p.col_in_list("user", self.user.as_deref()); p.col_in_list("vault_address", self.vault_address.as_deref()); - p.col_eq("status", self.status.as_ref().map(|val| match val { - Status::Ok => "ok", - Status::Err => "err" - })); + p.col_eq( + "status", + self.status.as_ref().map(|val| match val { + Status::Ok => "ok", + Status::Err => "err" + }) + ); } - fn relations(&self, _scan: &mut ScanBuilder) { } + fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct OrderActionRequest { pub contains_asset: Option>, @@ -129,23 +125,24 @@ request! { } } - impl OrderActionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_primitive_list_contains_any::("order_asset", self.contains_asset.as_deref()); p.col_string_list_contains_any("order_cloid", self.contains_cloid.as_deref()); p.col_in_list("user", self.user.as_deref()); p.col_in_list("vault_address", self.vault_address.as_deref()); - p.col_eq("status", self.status.as_ref().map(|val| match val { - Status::Ok => "ok", - Status::Err => "err" - })); + p.col_eq( + "status", + self.status.as_ref().map(|val| match val { + Status::Ok => "ok", + Status::Err => "err" + }) + ); } - fn relations(&self, _scan: &mut ScanBuilder) { } + fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct CancelActionRequest { pub contains_asset: Option>, @@ -155,22 +152,23 @@ request! { } } - impl CancelActionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_primitive_list_contains_any::("cancel_asset", self.contains_asset.as_deref()); p.col_in_list("user", self.user.as_deref()); p.col_in_list("vault_address", self.vault_address.as_deref()); - p.col_eq("status", self.status.as_ref().map(|val| match val { - Status::Ok => "ok", - Status::Err => "err" - })); + p.col_eq( + "status", + self.status.as_ref().map(|val| match val { + Status::Ok => "ok", + Status::Err => "err" + }) + ); } - fn relations(&self, _scan: &mut ScanBuilder) { } + fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct CancelByCloidActionRequest { pub contains_asset: Option>, @@ -181,23 +179,24 @@ request! { } } - impl CancelByCloidActionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_primitive_list_contains_any::("asset", self.contains_asset.as_deref()); p.col_string_list_contains_any("cloid", self.contains_cloid.as_deref()); p.col_in_list("user", self.user.as_deref()); p.col_in_list("vault_address", self.vault_address.as_deref()); - p.col_eq("status", self.status.as_ref().map(|val| match val { - Status::Ok => "ok", - Status::Err => "err" - })); + p.col_eq( + "status", + self.status.as_ref().map(|val| match val { + Status::Ok => "ok", + Status::Err => "err" + }) + ); } - fn relations(&self, _scan: &mut ScanBuilder) { } + fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct BatchModifyActionRequest { pub contains_asset: Option>, @@ -208,23 +207,24 @@ request! { } } - impl BatchModifyActionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_primitive_list_contains_any::("batch_modify_asset", self.contains_asset.as_deref()); p.col_string_list_contains_any("batch_modify_cloid", self.contains_cloid.as_deref()); p.col_in_list("user", self.user.as_deref()); p.col_in_list("vault_address", self.vault_address.as_deref()); - p.col_eq("status", self.status.as_ref().map(|val| match val { - Status::Ok => "ok", - Status::Err => "err" - })); + p.col_eq( + "status", + self.status.as_ref().map(|val| match val { + Status::Ok => "ok", + Status::Err => "err" + }) + ); } - fn relations(&self, _scan: &mut ScanBuilder) { } + fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct HyperliquidReplicaCmdsQuery { pub from_block: BlockNumber, @@ -240,7 +240,6 @@ request! { } } - impl HyperliquidReplicaCmdsQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/mod.rs b/crates/query/src/query/mod.rs index 2ea1e707..a173c827 100644 --- a/crates/query/src/query/mod.rs +++ b/crates/query/src/query/mod.rs @@ -1,19 +1,17 @@ -use crate::plan::Plan; -use crate::primitives::BlockNumber; use serde::{Deserialize, Serialize}; +use crate::{plan::Plan, primitives::BlockNumber}; pub mod bitcoin; pub mod eth; -pub mod solana; -pub mod substrate; pub mod fuel; pub mod hyperliquid_fills; pub mod hyperliquid_replica_cmds; +pub mod solana; +pub mod substrate; pub mod tron; mod util; - #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] #[serde(tag = "type")] pub enum Query { @@ -35,14 +33,13 @@ pub enum Query { Tron(tron::TronQuery) } - impl Query { pub fn from_json_bytes(bytes: &[u8]) -> anyhow::Result { let query: Self = serde_json::from_slice(bytes)?; query.validate()?; Ok(query) } - + pub fn from_json_value(json: serde_json::Value) -> anyhow::Result { let query: Self = serde_json::from_value(json)?; query.validate()?; @@ -62,7 +59,7 @@ impl Query { Query::Fuel(q) => q.validate(), Query::HyperliquidFills(q) => q.validate(), Query::HyperliquidReplicaCmds(q) => q.validate(), - Query::Tron(q) => q.validate(), + Query::Tron(q) => q.validate() } } @@ -75,8 +72,9 @@ impl Query { Query::Fuel(q) => q.parent_block_hash.as_ref(), Query::HyperliquidFills(q) => q.parent_block_hash.as_ref(), Query::HyperliquidReplicaCmds(q) => q.parent_block_hash.as_ref(), - Query::Tron(q) => q.parent_block_hash.as_ref(), - }.map(|s| s.as_str()) + Query::Tron(q) => q.parent_block_hash.as_ref() + } + .map(|s| s.as_str()) } pub fn first_block(&self) -> BlockNumber { @@ -88,7 +86,7 @@ impl Query { Query::Fuel(q) => q.from_block, Query::HyperliquidFills(q) => q.from_block, Query::HyperliquidReplicaCmds(q) => q.from_block, - Query::Tron(q) => q.from_block, + Query::Tron(q) => q.from_block } } @@ -101,7 +99,7 @@ impl Query { Query::Fuel(q) => q.from_block = block_number, Query::HyperliquidFills(q) => q.from_block = block_number, Query::HyperliquidReplicaCmds(q) => q.from_block = block_number, - Query::Tron(q) => q.from_block = block_number, + Query::Tron(q) => q.from_block = block_number } } @@ -114,7 +112,7 @@ impl Query { Query::Fuel(q) => q.to_block, Query::HyperliquidFills(q) => q.to_block, Query::HyperliquidReplicaCmds(q) => q.to_block, - Query::Tron(q) => q.to_block, + Query::Tron(q) => q.to_block } } @@ -128,7 +126,7 @@ impl Query { Query::Fuel(q) => q.to_block = block_number, Query::HyperliquidFills(q) => q.to_block = block_number, Query::HyperliquidReplicaCmds(q) => q.to_block = block_number, - Query::Tron(q) => q.to_block = block_number, + Query::Tron(q) => q.to_block = block_number } } @@ -141,7 +139,7 @@ impl Query { Query::Fuel(q) => q.compile(), Query::HyperliquidFills(q) => q.compile(), Query::HyperliquidReplicaCmds(q) => q.compile(), - Query::Tron(q) => q.compile(), + Query::Tron(q) => q.compile() } } -} \ No newline at end of file +} diff --git a/crates/query/src/query/solana.rs b/crates/query/src/query/solana.rs index 64ec6df5..2c4703bd 100644 --- a/crates/query/src/query/solana.rs +++ b/crates/query/src/query/solana.rs @@ -1,86 +1,71 @@ -use super::util::{check_hex, compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, parse_hex, parse_static_hex, request, PredicateBuilder}; -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{Plan, ScanBuilder, TableSet}; -use crate::primitives::BlockNumber; -use crate::scan::{col_in_list, or, RowPredicateRef}; +use std::sync::{Arc, LazyLock}; + use anyhow::{anyhow, ensure}; use arrow::array::{ArrayRef, FixedSizeBinaryBuilder, UInt16Array, UInt32Array, UInt64Array, UInt8Array}; use serde::{Deserialize, Serialize}; -use std::sync::{Arc, LazyLock}; +use super::util::{ + check_hex, compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, parse_hex, + parse_static_hex, request, PredicateBuilder +}; +use crate::{ + json::{exp::Exp, lang::*}, + plan::{Plan, ScanBuilder, TableSet}, + primitives::BlockNumber, + scan::{col_in_list, or, RowPredicateRef} +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]); - - tables.add_table("transactions", vec![ - "block_number", - "transaction_index" - ]) - .add_child("logs", vec!["block_number", "transaction_index"]) - .add_child("balances", vec!["block_number", "transaction_index"]) - .add_child("token_balances", vec!["block_number", "transaction_index"]) - .set_weight_column("account_keys", "account_keys_size") - .set_weight_column("address_table_lookups", "address_table_lookups_size") - .set_weight_column("signatures", "signatures_size") - .set_weight_column("loaded_addresses", "loaded_addresses_size"); - - tables.add_table("instructions", vec![ - "block_number", - "transaction_index", - "instruction_address" - ]) - .set_weight_column("data", "data_size") - .set_weight_column("a0", "accounts_size") - .set_weight("a1", 0) - .set_weight("a2", 0) - .set_weight("a3", 0) - .set_weight("a4", 0) - .set_weight("a5", 0) - .set_weight("a6", 0) - .set_weight("a7", 0) - .set_weight("a8", 0) - .set_weight("a9", 0) - .set_weight("a10", 0) - .set_weight("a11", 0) - .set_weight("a12", 0) - .set_weight("a13", 0) - .set_weight("a14", 0) - .set_weight("a15", 0) - .set_weight("rest_accounts", 0); - - tables.add_table("logs", vec![ - "block_number", - "transaction_index", - "log_index" - ]) - .set_weight_column("message", "message_size"); - - tables.add_table("balances", vec![ - "block_number", - "transaction_index", - "account" - ]); - - tables.add_table("token_balances", vec![ - "block_number", - "transaction_index", - "account" - ]); - - tables.add_table("rewards", vec![ - "block_number", - "pubkey", - "reward_type" - ]); + tables.add_table("blocks", vec!["number"]); tables -}); + .add_table("transactions", vec!["block_number", "transaction_index"]) + .add_child("logs", vec!["block_number", "transaction_index"]) + .add_child("balances", vec!["block_number", "transaction_index"]) + .add_child("token_balances", vec!["block_number", "transaction_index"]) + .set_weight_column("account_keys", "account_keys_size") + .set_weight_column("address_table_lookups", "address_table_lookups_size") + .set_weight_column("signatures", "signatures_size") + .set_weight_column("loaded_addresses", "loaded_addresses_size"); + + tables + .add_table( + "instructions", + vec!["block_number", "transaction_index", "instruction_address"] + ) + .set_weight_column("data", "data_size") + .set_weight_column("a0", "accounts_size") + .set_weight("a1", 0) + .set_weight("a2", 0) + .set_weight("a3", 0) + .set_weight("a4", 0) + .set_weight("a5", 0) + .set_weight("a6", 0) + .set_weight("a7", 0) + .set_weight("a8", 0) + .set_weight("a9", 0) + .set_weight("a10", 0) + .set_weight("a11", 0) + .set_weight("a12", 0) + .set_weight("a13", 0) + .set_weight("a14", 0) + .set_weight("a15", 0) + .set_weight("rest_accounts", 0); + tables + .add_table("logs", vec!["block_number", "transaction_index", "log_index"]) + .set_weight_column("message", "message_size"); + + tables.add_table("balances", vec!["block_number", "transaction_index", "account"]); + + tables.add_table("token_balances", vec!["block_number", "transaction_index", "account"]); + + tables.add_table("rewards", vec!["block_number", "pubkey", "reward_type"]); + + tables +}); field_selection! { block: BlockFieldSelection, @@ -92,7 +77,6 @@ field_selection! { reward: RewardFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -113,7 +97,6 @@ item_field_selection! { }} } - item_field_selection! { TransactionFieldSelection { transaction_index, @@ -158,7 +141,6 @@ item_field_selection! { } } - item_field_selection! { InstructionFieldSelection { transaction_index, @@ -221,7 +203,6 @@ item_field_selection! { } } - item_field_selection! { LogFieldSelection { transaction_index, @@ -242,7 +223,6 @@ item_field_selection! { }} } - item_field_selection! { BalanceFieldSelection { transaction_index, @@ -259,7 +239,6 @@ item_field_selection! { }} } - item_field_selection! { TokenBalanceFieldSelection { transaction_index, @@ -292,7 +271,6 @@ item_field_selection! { }} } - item_field_selection! { RewardFieldSelection { pubkey, @@ -311,11 +289,9 @@ item_field_selection! { }} } - type Bytes = String; type Base58Bytes = String; - request! { pub struct InstructionRequest { pub program_id: Option>, @@ -352,15 +328,46 @@ request! { } } - impl InstructionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("program_id", self.program_id.as_deref()); self.discriminator_predicate(p); - p.col_in_list("d1", self.d1.as_ref().map(|list| list.iter().filter_map(|s| parse_static_hex::<1>(s)).map(u8::from_be_bytes).collect::>())); - p.col_in_list("d2", self.d2.as_ref().map(|list| list.iter().filter_map(|s| parse_static_hex::<2>(s)).map(u16::from_be_bytes).collect::>())); - p.col_in_list("d4", self.d4.as_ref().map(|list| list.iter().filter_map(|s| parse_static_hex::<4>(s)).map(u32::from_be_bytes).collect::>())); - p.col_in_list("d8", self.d8.as_ref().map(|list| list.iter().filter_map(|s| parse_static_hex::<8>(s)).map(u64::from_be_bytes).collect::>())); + p.col_in_list( + "d1", + self.d1.as_ref().map(|list| { + list.iter() + .filter_map(|s| parse_static_hex::<1>(s)) + .map(u8::from_be_bytes) + .collect::>() + }) + ); + p.col_in_list( + "d2", + self.d2.as_ref().map(|list| { + list.iter() + .filter_map(|s| parse_static_hex::<2>(s)) + .map(u16::from_be_bytes) + .collect::>() + }) + ); + p.col_in_list( + "d4", + self.d4.as_ref().map(|list| { + list.iter() + .filter_map(|s| parse_static_hex::<4>(s)) + .map(u32::from_be_bytes) + .collect::>() + }) + ); + p.col_in_list( + "d8", + self.d8.as_ref().map(|list| { + list.iter() + .filter_map(|s| parse_static_hex::<8>(s)) + .map(u64::from_be_bytes) + .collect::>() + }) + ); p.bloom_filter("accounts_bloom", 64, 7, self.mentions_account.as_deref()); p.col_in_list("a0", self.a0.as_deref()); p.col_in_list("a1", self.a1.as_deref()); @@ -382,25 +389,30 @@ impl InstructionRequest { } fn discriminator_predicate(&self, p: &mut PredicateBuilder) { - let Some(list) = self.discriminator.as_ref() else { return }; - - let list: Vec> = list.iter().filter_map(|s| { - let d = parse_hex(s)?; - if d.len() > 16 { - None - } else { - Some(d) - } - }).collect(); + let Some(list) = self.discriminator.as_ref() else { + return; + }; + + let list: Vec> = list + .iter() + .filter_map(|s| { + let d = parse_hex(s)?; + if d.len() > 16 { + None + } else { + Some(d) + } + }) + .collect(); - if list.is_empty() { + if list.is_empty() { p.mark_as_never(); - return + return; } if list.iter().any(|d| d.is_empty()) { // empty prefix always matches - return + return; } let mut ds: Vec>> = vec![vec![]; 17]; @@ -412,13 +424,14 @@ impl InstructionRequest { for (i, list) in ds.into_iter().enumerate() { if list.is_empty() { - continue + continue; } macro_rules! disc { ($t:ty, $array:ty) => { Arc::new(<$array>::from_iter_values( - list.into_iter().map(|d| <$t>::from_be_bytes(d.try_into().unwrap())) + list.into_iter() + .map(|d| <$t>::from_be_bytes(d.try_into().unwrap())) )) }; } @@ -457,9 +470,7 @@ impl InstructionRequest { _ => unreachable!() }; - predicates.push( - col_in_list(col, array) - ) + predicates.push(col_in_list(col, array)) } p.add(or(predicates)); @@ -470,21 +481,21 @@ impl InstructionRequest { scan.join( "transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_balances { scan.join( "balances", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_token_balances { scan.join( "token_balances", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_instructions { @@ -504,13 +515,12 @@ impl InstructionRequest { scan.join( "logs", vec!["block_number", "transaction_index", "instruction_address"], - vec!["block_number", "transaction_index", "instruction_address"], + vec!["block_number", "transaction_index", "instruction_address"] ); } } } - request! { pub struct TransactionRequest { pub fee_payer: Option>, @@ -522,7 +532,6 @@ request! { } } - impl TransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("fee_payer", self.fee_payer.as_deref()); @@ -561,7 +570,6 @@ impl TransactionRequest { } } - request! { pub struct LogRequest { pub program_id: Option>, @@ -571,7 +579,6 @@ request! { } } - impl LogRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("program_id", self.program_id.as_deref()); @@ -597,7 +604,6 @@ impl LogRequest { } } - request! { pub struct BalanceRequest { pub account: Option>, @@ -606,7 +612,6 @@ request! { } } - impl BalanceRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("account", self.account.as_deref()); @@ -630,7 +635,6 @@ impl BalanceRequest { } } - request! { pub struct TokenBalanceRequest { pub account: Option>, @@ -647,7 +651,6 @@ request! { } } - impl TokenBalanceRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("account", self.account.as_deref()); @@ -678,27 +681,25 @@ impl TokenBalanceRequest { scan.join( "balances", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.transaction_token_balances { scan.join( "token_balances", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct RewardRequest { pub pubkey: Option>, } } - impl RewardRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("pubkey", self.pubkey.as_deref()); @@ -707,7 +708,6 @@ impl RewardRequest { fn relations(&self, _scan: &mut ScanBuilder) {} } - request! { pub struct SolanaQuery { pub from_block: BlockNumber, @@ -724,11 +724,18 @@ request! { } } - impl SolanaQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); - ensure_item_count!(self, transactions, instructions, logs, balances, token_balances, rewards); + ensure_item_count!( + self, + transactions, + instructions, + logs, + balances, + token_balances, + rewards + ); for (i, tx) in self.transactions.iter().enumerate() { let len = tx.mentions_account.as_ref().map_or(0, |list| list.len()); ensure!( @@ -773,16 +780,22 @@ impl SolanaQuery { if let Some(ds) = ins.discriminator.as_ref() { for (dix, d) in ds.iter().enumerate() { - check_hex(d).and_then(|_| { - if d.len() > 34 { - Err("discriminator can't be longer than 16 bytes") - } else { - Ok(()) - } - }).map_err(|msg| anyhow!( - "invalid discriminator at .instructions[{}].discriminator[{}]: {}", - i, dix, msg - ))?; + check_hex(d) + .and_then(|_| { + if d.len() > 34 { + Err("discriminator can't be longer than 16 bytes") + } else { + Ok(()) + } + }) + .map_err(|msg| { + anyhow!( + "invalid discriminator at .instructions[{}].discriminator[{}]: {}", + i, + dix, + msg + ) + })?; } } } diff --git a/crates/query/src/query/substrate.rs b/crates/query/src/query/substrate.rs index 84e2a118..40256ef6 100644 --- a/crates/query/src/query/substrate.rs +++ b/crates/query/src/query/substrate.rs @@ -1,43 +1,37 @@ -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{Plan, ScanBuilder, TableSet}; -use crate::primitives::BlockNumber; -use crate::query::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, to_lowercase_list, PredicateBuilder}; -use serde::{Deserialize, Serialize}; use std::sync::LazyLock; +use serde::{Deserialize, Serialize}; + +use crate::{ + json::{exp::Exp, lang::*}, + plan::{Plan, ScanBuilder, TableSet}, + primitives::BlockNumber, + query::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + to_lowercase_list, PredicateBuilder + } +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]) - .set_weight("digest", 32 * 4); - - tables.add_table("events", vec![ - "block_number", - "index" - ]) - .set_weight_column("args", "args_size"); - - tables.add_table("calls", vec![ - "block_number", - "extrinsic_index", - "address" - ]) - .set_weight_column("args", "args_size"); - - tables.add_table("extrinsics", vec![ - "block_number", - "index" - ]) - .add_child("calls", vec!["block_number", "extrinsic_index"]) - .set_weight("signature", 4 * 32); + tables.add_table("blocks", vec!["number"]).set_weight("digest", 32 * 4); tables -}); + .add_table("events", vec!["block_number", "index"]) + .set_weight_column("args", "args_size"); + tables + .add_table("calls", vec!["block_number", "extrinsic_index", "address"]) + .set_weight_column("args", "args_size"); + + tables + .add_table("extrinsics", vec!["block_number", "index"]) + .add_child("calls", vec!["block_number", "extrinsic_index"]) + .set_weight("signature", 4 * 32); + + tables +}); field_selection! { block: BlockFieldSelection, @@ -46,7 +40,6 @@ field_selection! { event: EventFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -79,7 +72,6 @@ item_field_selection! { }} } - item_field_selection! { ExtrinsicFieldSelection { index, @@ -104,7 +96,6 @@ item_field_selection! { }} } - item_field_selection! { CallFieldSelection { extrinsic_index, @@ -127,7 +118,6 @@ item_field_selection! { }} } - item_field_selection! { EventFieldSelection { index, @@ -151,10 +141,8 @@ item_field_selection! { }} } - type Bytes = String; - request! { pub struct EventRequest { pub name: Option>, @@ -164,7 +152,6 @@ request! { } } - impl EventRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("name", self.name.as_deref()); @@ -197,7 +184,6 @@ impl EventRequest { } } - request! { pub struct CallRequest { pub name: Option>, @@ -208,7 +194,6 @@ request! { } } - impl CallRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("name", self.name.as_deref()); @@ -241,7 +226,6 @@ impl CallRequest { } } - request! { pub struct EvmLogRequest { pub address: Option>, @@ -255,7 +239,6 @@ request! { } } - impl EvmLogRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("name", Some("EVM.Log")); @@ -293,7 +276,6 @@ impl EvmLogRequest { } } - request! { pub struct EthereumTransactionRequest { pub to: Option>, @@ -304,7 +286,6 @@ request! { } } - impl EthereumTransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("name", Some("Ethereum.transact")); @@ -335,7 +316,6 @@ impl EthereumTransactionRequest { } } - request! { pub struct ContractsContractEmittedRequest { pub contract_address: Option>, @@ -345,7 +325,6 @@ request! { } } - impl ContractsContractEmittedRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("name", Some("Contracts.ContractEmitted")); @@ -379,7 +358,6 @@ impl ContractsContractEmittedRequest { } } - request! { pub struct GearMessageEnqueuedRequest { pub program_id: Option>, @@ -389,7 +367,6 @@ request! { } } - impl GearMessageEnqueuedRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("name", Some("Gear.UserMessageEnqueued")); @@ -423,7 +400,6 @@ impl GearMessageEnqueuedRequest { } } - request! { pub struct GearUserMessageSentRequest { pub program_id: Option>, @@ -433,7 +409,6 @@ request! { } } - impl GearUserMessageSentRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("name", Some("Gear.UserMessageSent")); @@ -467,7 +442,6 @@ impl GearUserMessageSentRequest { } } - request! { pub struct ReviveContractEmittedRequest { pub contract: Option>, @@ -481,7 +455,6 @@ request! { } } - impl ReviveContractEmittedRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("name", Some("Revive.ContractEmitted")); @@ -519,7 +492,6 @@ impl ReviveContractEmittedRequest { } } - request! { pub struct SubstrateQuery { pub from_block: BlockNumber, @@ -538,7 +510,6 @@ request! { } } - impl SubstrateQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/tron.rs b/crates/query/src/query/tron.rs index 006a5a0c..f594abe0 100644 --- a/crates/query/src/query/tron.rs +++ b/crates/query/src/query/tron.rs @@ -1,44 +1,41 @@ -use crate::json::exp::Exp; -use crate::json::lang::*; -use crate::plan::{Plan, ScanBuilder, TableSet}; -use crate::primitives::BlockNumber; -use crate::query::util::{compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, to_lowercase_list, PredicateBuilder}; -use serde::{Deserialize, Serialize}; use std::sync::LazyLock; +use serde::{Deserialize, Serialize}; + +use crate::{ + json::{exp::Exp, lang::*}, + plan::{Plan, ScanBuilder, TableSet}, + primitives::BlockNumber, + query::util::{ + compile_plan, ensure_block_range, ensure_item_count, field_selection, item_field_selection, request, + to_lowercase_list, PredicateBuilder + } +}; static TABLES: LazyLock = LazyLock::new(|| { let mut tables = TableSet::new(); - tables.add_table("blocks", vec![ - "number" - ]); - - tables.add_table("transactions", vec![ - "block_number", - "transaction_index" - ]) - .add_child("logs", vec!["block_number", "transaction_index"]) - .add_child("internal_transactions", vec!["block_number", "transaction_index"]) - .set_weight_column("raw_data_hex", "raw_data_hex_size"); - - tables.add_table("logs", vec![ - "block_number", - "transaction_index", - "log_index" - ]) - .set_weight_column("data", "data_size"); - - tables.add_table("internal_transactions", vec![ - "block_number", - "transaction_index", - "internal_transaction_index" - ]) - .set_result_item_name("internalTransactions"); + tables.add_table("blocks", vec!["number"]); tables -}); + .add_table("transactions", vec!["block_number", "transaction_index"]) + .add_child("logs", vec!["block_number", "transaction_index"]) + .add_child("internal_transactions", vec!["block_number", "transaction_index"]) + .set_weight_column("raw_data_hex", "raw_data_hex_size"); + tables + .add_table("logs", vec!["block_number", "transaction_index", "log_index"]) + .set_weight_column("data", "data_size"); + + tables + .add_table( + "internal_transactions", + vec!["block_number", "transaction_index", "internal_transaction_index"] + ) + .set_result_item_name("internalTransactions"); + + tables +}); field_selection! { block: BlockFieldSelection, @@ -47,7 +44,6 @@ field_selection! { internal_transaction: InternalTransactionFieldSelection, } - item_field_selection! { BlockFieldSelection { number, @@ -72,7 +68,6 @@ item_field_selection! { }} } - item_field_selection! { TransactionFieldSelection { transaction_index, @@ -139,7 +134,6 @@ item_field_selection! { }} } - item_field_selection! { LogFieldSelection { transaction_index, @@ -167,7 +161,6 @@ item_field_selection! { }} } - item_field_selection! { InternalTransactionFieldSelection { transaction_index, @@ -194,10 +187,8 @@ item_field_selection! { }} } - type Bytes = String; - request! { pub struct TransactionRequest { pub r#type: Option>, @@ -206,7 +197,6 @@ request! { } } - impl TransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("type", self.r#type.as_deref()); @@ -217,20 +207,19 @@ impl TransactionRequest { scan.join( "logs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.internal_transactions { scan.join( "internal_transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct TransferTransactionRequest { pub owner: Option>, @@ -240,7 +229,6 @@ request! { } } - impl TransferTransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("type", Some("TransferContract")); @@ -253,20 +241,19 @@ impl TransferTransactionRequest { scan.join( "logs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.internal_transactions { scan.join( "internal_transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct TransferAssetTransactionRequest { pub owner: Option>, @@ -277,7 +264,6 @@ request! { } } - impl TransferAssetTransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("type", Some("TransferAssetContract")); @@ -291,20 +277,19 @@ impl TransferAssetTransactionRequest { scan.join( "logs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.internal_transactions { scan.join( "internal_transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct TriggerSmartContractTransactionRequest { pub owner: Option>, @@ -315,7 +300,6 @@ request! { } } - impl TriggerSmartContractTransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_eq("type", Some("TriggerSmartContract")); @@ -329,20 +313,19 @@ impl TriggerSmartContractTransactionRequest { scan.join( "logs", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } if self.internal_transactions { scan.join( "internal_transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct LogRequest { pub address: Option>, @@ -354,7 +337,6 @@ request! { } } - impl LogRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("address", to_lowercase_list(&self.address)); @@ -369,13 +351,12 @@ impl LogRequest { scan.join( "transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct InternalTransactionRequest { pub caller: Option>, @@ -384,7 +365,6 @@ request! { } } - impl InternalTransactionRequest { fn predicate(&self, p: &mut PredicateBuilder) { p.col_in_list("caller_address", to_lowercase_list(&self.caller)); @@ -396,13 +376,12 @@ impl InternalTransactionRequest { scan.join( "transactions", vec!["block_number", "transaction_index"], - vec!["block_number", "transaction_index"], + vec!["block_number", "transaction_index"] ); } } } - request! { pub struct TronQuery { pub from_block: BlockNumber, @@ -419,7 +398,6 @@ request! { } } - impl TronQuery { pub fn validate(&self) -> anyhow::Result<()> { ensure_block_range!(self); diff --git a/crates/query/src/query/util.rs b/crates/query/src/query/util.rs index 9e3b970b..3ea126a1 100644 --- a/crates/query/src/query/util.rs +++ b/crates/query/src/query/util.rs @@ -1,9 +1,14 @@ -use crate::primitives::Name; -use crate::scan::{and, bloom_filter, col_eq, col_gt_eq, col_in_list, col_lt_eq, col_primitive_list_contains_any, col_string_list_contains_any, IntoArrowArray, IntoArrowScalar, RowPredicateRef}; -use arrow::array::StringArray; -use arrow::datatypes::ArrowPrimitiveType; use std::hash::Hash; +use arrow::{array::StringArray, datatypes::ArrowPrimitiveType}; + +use crate::{ + primitives::Name, + scan::{ + and, bloom_filter, col_eq, col_gt_eq, col_in_list, col_lt_eq, col_primitive_list_contains_any, + col_string_list_contains_any, IntoArrowArray, IntoArrowScalar, RowPredicateRef + } +}; macro_rules! item_field_selection { ( @@ -33,7 +38,6 @@ macro_rules! item_field_selection { } pub(crate) use item_field_selection; - macro_rules! field_selection { ( $($item_name:ident: $field_selection:ty ,)* @@ -50,12 +54,10 @@ macro_rules! field_selection { } pub(crate) use field_selection; - pub fn is_default(value: &T) -> bool { value.eq(&T::default()) } - macro_rules! request { ($( pub struct $name:ident { @@ -79,7 +81,6 @@ macro_rules! request { } pub(crate) use request; - macro_rules! ensure_block_range { ($query:ident) => { if let Some(to_block) = $query.to_block { @@ -90,7 +91,6 @@ macro_rules! ensure_block_range { } pub(crate) use ensure_block_range; - macro_rules! ensure_item_count { ($query:ident, $i:ident $(, $is:ident)*) => {{ let num_items = $query.$i.len() $(+ $query.$is.len())*; @@ -103,13 +103,11 @@ macro_rules! ensure_item_count { } pub(crate) use ensure_item_count; - pub struct PredicateBuilder { conditions: Vec, is_never: bool } - impl PredicateBuilder { pub fn new() -> Self { Self { @@ -127,7 +125,8 @@ impl PredicateBuilder { } pub fn col_in_list(&mut self, name: Name, maybe_list: Option) -> &mut Self - where L: IntoArrowArray + where + L: IntoArrowArray { if let Some(list) = maybe_list { let values = list.into_array(); @@ -157,13 +156,12 @@ impl PredicateBuilder { } pub fn bloom_filter( - &mut self, + &mut self, name: Name, byte_size: usize, num_hashes: usize, maybe_list: Option<&[T]> - ) -> &mut Self - { + ) -> &mut Self { if let Some(list) = maybe_list { if list.len() == 0 { self.is_never = true @@ -173,20 +171,16 @@ impl PredicateBuilder { } self } - + pub fn add(&mut self, condition: RowPredicateRef) -> &mut Self { self.conditions.push(condition); self } - pub fn col_primitive_list_contains_any( - &mut self, - name: Name, - maybe_list: Option<&[T::Native]>, - ) -> &mut Self + pub fn col_primitive_list_contains_any(&mut self, name: Name, maybe_list: Option<&[T::Native]>) -> &mut Self where T: ArrowPrimitiveType, - T::Native: Eq + Hash, + T::Native: Eq + Hash { if let Some(list) = maybe_list { if list.is_empty() { @@ -199,11 +193,7 @@ impl PredicateBuilder { self } - pub fn col_string_list_contains_any>( - &mut self, - name: Name, - maybe_list: Option<&[S]>, - ) -> &mut Self { + pub fn col_string_list_contains_any>(&mut self, name: Name, maybe_list: Option<&[S]>) -> &mut Self { if let Some(list) = maybe_list { if list.is_empty() { self.is_never = true; @@ -218,11 +208,11 @@ impl PredicateBuilder { pub fn is_never(&self) -> bool { self.is_never } - + pub fn mark_as_never(&mut self) { self.is_never = true } - + pub fn build(self) -> Option { if self.conditions.len() > 0 { Some(and(self.conditions)) @@ -232,7 +222,6 @@ impl PredicateBuilder { } } - macro_rules! compile_plan { ( $this:ident, @@ -279,27 +268,25 @@ macro_rules! compile_plan { } pub(crate) use compile_plan; - pub fn check_hex(s: &str) -> Result<(), &'static str> { if !s.starts_with("0x") { - return Err("binary hex string should start with '0x'") + return Err("binary hex string should start with '0x'"); } if s.len() % 2 != 0 { - return Err("binary hex string should have an even length") + return Err("binary hex string should have an even length"); } if !faster_hex::hex_check(s[2..].as_bytes()) { - return Err("contains non-hex character") + return Err("contains non-hex character"); } Ok(()) } - pub fn parse_hex(s: &str) -> Option> { if !s.starts_with("0x") { - return None + return None; } if s.len() % 2 != 0 { - return None + return None; } let mut bytes = vec![0; s.len() / 2 - 1]; faster_hex::hex_decode(s[2..].as_bytes(), &mut bytes) @@ -307,13 +294,12 @@ pub fn parse_hex(s: &str) -> Option> { .map(|_| bytes) } - pub fn parse_static_hex(s: &str) -> Option<[u8; N]> { if !s.starts_with("0x") { - return None + return None; } if s.len() != 2 + N * 2 { - return None + return None; } let mut bytes: [u8; N] = [0; N]; faster_hex::hex_decode(s[2..].as_bytes(), &mut bytes) @@ -321,11 +307,7 @@ pub fn parse_static_hex(s: &str) -> Option<[u8; N]> { .map(|_| bytes) } - pub fn to_lowercase_list(list: &Option>) -> Option { - list.as_ref().map(|v| { - StringArray::from_iter_values( - v.iter().map(|s| s.to_ascii_lowercase()) - ) - }) + list.as_ref() + .map(|v| StringArray::from_iter_values(v.iter().map(|s| s.to_ascii_lowercase()))) } diff --git a/crates/query/src/scan/array_predicate.rs b/crates/query/src/scan/array_predicate.rs index f9097b58..041bba37 100644 --- a/crates/query/src/scan/array_predicate.rs +++ b/crates/query/src/scan/array_predicate.rs @@ -1,19 +1,20 @@ -use crate::scan::arrow::IntoArrowScalar; +use std::{collections::HashSet, hash::Hash, ops::BitAnd, sync::Arc}; + use anyhow::{anyhow, bail, ensure}; -use arrow::array::{Array, ArrayRef, AsArray, BooleanArray, Datum, PrimitiveArray, Scalar}; -use arrow::buffer::{BooleanBuffer, Buffer}; -use arrow::compute::{cast_with_options, CastOptions}; -use arrow::datatypes::{ArrowNativeType, ArrowNativeTypeOp, ArrowPrimitiveType, DataType, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type}; -use std::collections::HashSet; -use std::hash::Hash; -use std::ops::BitAnd; -use std::sync::Arc; -use crate::scan::IntoArrowArray; +use arrow::{ + array::{Array, ArrayRef, AsArray, BooleanArray, Datum, PrimitiveArray, Scalar}, + buffer::{BooleanBuffer, Buffer}, + compute::{cast_with_options, CastOptions}, + datatypes::{ + ArrowNativeType, ArrowNativeTypeOp, ArrowPrimitiveType, DataType, Int16Type, Int32Type, Int64Type, Int8Type, + UInt16Type, UInt32Type, UInt64Type, UInt8Type + } +}; +use crate::scan::{arrow::IntoArrowScalar, IntoArrowArray}; pub type ArrayPredicateRef = Arc; - pub trait ArrayPredicate: Sync + Send { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result; @@ -26,32 +27,26 @@ pub trait ArrayPredicate: Sync + Send { } } - #[derive(Clone)] pub struct ArrayStats { pub min: ArrayRef, pub max: ArrayRef } - pub struct And { predicates: Vec } - impl And { pub fn new(predicates: Vec) -> Self { - Self { - predicates - } + Self { predicates } } } - impl ArrayPredicate for And { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { if self.predicates.len() == 0 { - return Ok(zero_mask(arr.len(), true)) + return Ok(zero_mask(arr.len(), true)); } let mut result_mask = self.predicates[0].evaluate(arr)?; for i in 1..self.predicates.len() { @@ -67,7 +62,7 @@ impl ArrayPredicate for And { fn evaluate_stats(&self, stats: &ArrayStats) -> anyhow::Result { if self.predicates.len() == 0 { - return Ok(zero_mask(stats.min.len(), true)) + return Ok(zero_mask(stats.min.len(), true)); } let mut result_mask = self.predicates[0].evaluate_stats(stats)?; for i in 1..self.predicates.len() { @@ -78,25 +73,20 @@ impl ArrayPredicate for And { } } - pub struct Or { predicates: Vec } - impl Or { pub fn new(predicates: Vec) -> Self { - Or { - predicates - } + Or { predicates } } } - impl ArrayPredicate for Or { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { if self.predicates.len() == 0 { - return Ok(zero_mask(arr.len(), false)) + return Ok(zero_mask(arr.len(), false)); } let mut result_mask = self.predicates[0].evaluate(arr)?; for i in 1..self.predicates.len() { @@ -112,7 +102,7 @@ impl ArrayPredicate for Or { fn evaluate_stats(&self, stats: &ArrayStats) -> anyhow::Result { if self.predicates.len() == 0 { - return Ok(zero_mask(stats.min.len(), false)) + return Ok(zero_mask(stats.min.len(), false)); } let mut result_mask = self.predicates[0].evaluate_stats(stats)?; for i in 1..self.predicates.len() { @@ -123,7 +113,6 @@ impl ArrayPredicate for Or { } } - pub fn or(predicates: Vec) -> ArrayPredicateRef { if predicates.len() == 1 { predicates.into_iter().next().unwrap() @@ -132,7 +121,6 @@ pub fn or(predicates: Vec) -> ArrayPredicateRef { } } - macro_rules! cast_scalar { ($value:ident, $scalar:expr, $arr:ident, Less: $less:literal, Greater: $greater:literal) => { let scalar = $scalar; @@ -141,17 +129,15 @@ macro_rules! cast_scalar { CastResult::Same => scalar, CastResult::Cast(value) => value, CastResult::Less => return Ok(zero_mask($arr.len(), $less)), - CastResult::Greater => return Ok(zero_mask($arr.len(), $greater)), + CastResult::Greater => return Ok(zero_mask($arr.len(), $greater)) }; }; } - pub struct Eq { value: Scalar } - impl Eq { pub fn new(value: T) -> Self { Self { @@ -160,7 +146,6 @@ impl Eq { } } - impl ArrayPredicate for Eq { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { cast_scalar!(value, &self.value, arr, Less: false, Greater: false); @@ -181,13 +166,11 @@ impl ArrayPredicate for Eq { } } - /// value >= item pub struct GtEq { value: Scalar } - impl GtEq { pub fn new(value: T) -> Self { Self { @@ -196,7 +179,6 @@ impl GtEq { } } - impl ArrayPredicate for GtEq { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { cast_scalar!(value, &self.value, arr, Less: false, Greater: true); @@ -216,13 +198,11 @@ impl ArrayPredicate for GtEq { } } - /// value <= item pub struct LtEq { value: Scalar } - impl LtEq { pub fn new(value: T) -> Self { Self { @@ -231,7 +211,6 @@ impl LtEq { } } - impl ArrayPredicate for LtEq { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { cast_scalar!(value, &self.value, arr, Less: true, Greater: false); @@ -251,7 +230,6 @@ impl ArrayPredicate for LtEq { } } - pub fn zero_mask(len: usize, is_set: bool) -> BooleanArray { let buf = if is_set { BooleanBuffer::new_set(len) @@ -261,7 +239,6 @@ pub fn zero_mask(len: usize, is_set: bool) -> BooleanArray { BooleanArray::from(buf) } - enum CastResult { Less, Greater, @@ -269,34 +246,36 @@ enum CastResult { Cast(Scalar) } - fn cast_scalar(scalar: &Scalar, target_domain: &DataType) -> anyhow::Result { let array = scalar.get().0; if array.data_type() == target_domain { - return Ok(CastResult::Same) + return Ok(CastResult::Same); } if array.data_type().is_integer() && target_domain.is_integer() { - return Ok(tower_cast(array, target_domain)) + return Ok(tower_cast(array, target_domain)); } - let new_array = cast_with_options(array, target_domain, &CastOptions { - safe: false, - ..CastOptions::default() - })?; + let new_array = cast_with_options( + array, + target_domain, + &CastOptions { + safe: false, + ..CastOptions::default() + } + )?; Ok(CastResult::Cast(Scalar::new(new_array))) } - fn tower_cast(array: &dyn Array, target_domain: &DataType) -> CastResult { macro_rules! cast { ($from:ty, $to:ty, $common:ty) => { tower_cast_impl::<$from, $to, $common>(array) }; } - + match (array.data_type(), target_domain) { (DataType::UInt64, DataType::UInt32) => cast!(UInt64Type, UInt32Type, u64), (DataType::UInt64, DataType::UInt16) => cast!(UInt64Type, UInt16Type, u64), @@ -366,14 +345,14 @@ fn tower_cast(array: &dyn Array, target_domain: &DataType) -> CastResult { } } - fn tower_cast_impl(array: &dyn Array) -> CastResult - where FROM: ArrowPrimitiveType, - TO: ArrowPrimitiveType, - TO::Native: TryFrom, - C: From, - C: From, - C: Ord +where + FROM: ArrowPrimitiveType, + TO: ArrowPrimitiveType, + TO::Native: TryFrom, + C: From, + C: From, + C: Ord { let value = array.as_primitive::().value(0); let target_value = if let Ok(val) = TO::Native::try_from(value) { @@ -387,33 +366,24 @@ fn tower_cast_impl(array: &dyn Array) -> CastResult } else { assert!(value < min); CastResult::Less - } + }; }; - let scalar = Scalar::new( - Arc::new( - PrimitiveArray::::from_value(target_value, 1) - ) as Arc - ); + let scalar = Scalar::new(Arc::new(PrimitiveArray::::from_value(target_value, 1)) as Arc); CastResult::Cast(scalar) } - pub struct InList { list: sqd_polars::prelude::Series } - impl InList { pub fn new(values: L) -> Self { let arr = values.into_array(); let list = sqd_polars::arrow::array_series("value_list", &arr).unwrap(); - Self { - list - } + Self { list } } } - impl ArrayPredicate for InList { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { let series = sqd_polars::arrow::array_series("values", arr)?; @@ -423,7 +393,6 @@ impl ArrayPredicate for InList { } } - fn bitwise_and(value: &[u8; N], other: &[u8; N]) -> [u8; N] { let mut arr = [0; N]; for i in 0..N { @@ -432,12 +401,10 @@ fn bitwise_and(value: &[u8; N], other: &[u8; N]) -> [u8; N] { arr } - pub struct BloomFilter { value: Buffer } - impl BloomFilter { pub fn new(byte_size: usize, num_hashes: usize, value: T) -> Self { let mut bloom = sqd_bloom_filter::BloomFilter::new(byte_size, num_hashes); @@ -450,7 +417,7 @@ impl BloomFilter { #[inline(never)] fn eval_static(&self, blooms: &[u8]) -> BooleanBuffer where - T: ArrowNativeType + BitAnd + T: ArrowNativeType + BitAnd { let value = to_typed_fixed_slice::(&self.value); let blooms = to_typed_slice::(blooms); @@ -470,7 +437,7 @@ impl BloomFilter { #[inline(never)] fn eval_dynamic(&self, blooms: &[u8]) -> BooleanBuffer where - T: ArrowNativeType + BitAnd + T: ArrowNativeType + BitAnd { let value = to_typed_slice::(&self.value); let blooms = to_typed_slice::(blooms); @@ -483,25 +450,22 @@ impl BloomFilter { } } - fn to_typed_slice(value: &[u8]) -> &[T] { let (prefix, offsets, suffix) = unsafe { value.align_to::() }; assert!(prefix.is_empty() && suffix.is_empty()); offsets } - fn to_typed_fixed_slice(value: &[u8]) -> &[T; N] { let slice = to_typed_slice::(value); slice.try_into().unwrap() } - impl ArrayPredicate for BloomFilter { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { - let arr = arr.as_fixed_size_binary_opt().ok_or_else(|| { - anyhow!("expected fixed sized binary array, but got {}", arr.data_type()) - })?; + let arr = arr + .as_fixed_size_binary_opt() + .ok_or_else(|| anyhow!("expected fixed sized binary array, but got {}", arr.data_type()))?; let size = self.value.len(); @@ -517,12 +481,14 @@ impl ArrayPredicate for BloomFilter { let mask = match size { 64 => self.eval_static::(values), - _ => if size % 16 == 0 { - self.eval_dynamic::(values) - } else if size % 8 == 0 { - self.eval_dynamic::(values) - } else { - self.eval_dynamic::(values) + _ => { + if size % 16 == 0 { + self.eval_dynamic::(values) + } else if size % 8 == 0 { + self.eval_dynamic::(values) + } else { + self.eval_dynamic::(values) + } } }; @@ -530,32 +496,29 @@ impl ArrayPredicate for BloomFilter { } } - pub struct PrimitiveListContainsAny { - values: HashSet, + values: HashSet } - impl PrimitiveListContainsAny where - T::Native: std::cmp::Eq + Hash, + T::Native: std::cmp::Eq + Hash { pub fn new(values: &[T::Native]) -> Self { Self { - values: values.iter().copied().collect(), + values: values.iter().copied().collect() } } } - impl ArrayPredicate for PrimitiveListContainsAny where - T::Native: std::cmp::Eq + Hash, + T::Native: std::cmp::Eq + Hash { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { - let list_array = arr.as_list_opt::().ok_or_else(|| { - anyhow!("expected List array, but got {}", arr.data_type()) - })?; + let list_array = arr + .as_list_opt::() + .ok_or_else(|| anyhow!("expected List array, but got {}", arr.data_type()))?; let values_array = list_array.values().as_primitive_opt::().ok_or_else(|| { anyhow!( @@ -583,26 +546,23 @@ where } } - pub struct StringListContainsAny { - values: HashSet, + values: HashSet } - impl StringListContainsAny { pub fn new>(values: &[S]) -> Self { Self { - values: values.iter().map(|s| s.as_ref().to_string()).collect(), + values: values.iter().map(|s| s.as_ref().to_string()).collect() } } } - impl ArrayPredicate for StringListContainsAny { fn evaluate(&self, arr: &dyn Array) -> anyhow::Result { - let list_array = arr.as_list_opt::().ok_or_else(|| { - anyhow!("expected List array, but got {}", arr.data_type()) - })?; + let list_array = arr + .as_list_opt::() + .ok_or_else(|| anyhow!("expected List array, but got {}", arr.data_type()))?; let values_array = list_array.values().as_string_opt::().ok_or_else(|| { anyhow!( @@ -630,24 +590,16 @@ impl ArrayPredicate for StringListContainsAny { } } - #[cfg(feature = "_bench")] mod bench { - use crate::scan::array_predicate::{ArrayPredicate, BloomFilter}; - use arrow::array::FixedSizeBinaryArray; - use arrow::buffer::MutableBuffer; + use arrow::{array::FixedSizeBinaryArray, buffer::MutableBuffer}; + use crate::scan::array_predicate::{ArrayPredicate, BloomFilter}; #[divan::bench] fn bloom_filter(bench: divan::Bencher) { let pred = BloomFilter::new(64, 7, "hello"); - let array = FixedSizeBinaryArray::new( - 64, - MutableBuffer::from_len_zeroed(64 * 200_000).into(), - None - ); - bench.bench(|| { - pred.evaluate(&array).unwrap() - }) + let array = FixedSizeBinaryArray::new(64, MutableBuffer::from_len_zeroed(64 * 200_000).into(), None); + bench.bench(|| pred.evaluate(&array).unwrap()) } -} \ No newline at end of file +} diff --git a/crates/query/src/scan/arrow.rs b/crates/query/src/scan/arrow.rs index 6b3443fa..15cdcc99 100644 --- a/crates/query/src/scan/arrow.rs +++ b/crates/query/src/scan/arrow.rs @@ -1,31 +1,30 @@ -use arrow::array::{ArrayRef, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int8Array, Scalar, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array}; use std::sync::Arc; +use arrow::array::{ + ArrayRef, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int8Array, Scalar, + StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array +}; pub trait IntoArrowScalar: Sized { fn into_scalar(self) -> Scalar; } - impl IntoArrowScalar for Scalar { fn into_scalar(self) -> Scalar { self } } - pub trait IntoArrowArray { fn into_array(self) -> ArrayRef; } - impl IntoArrowArray for ArrayRef { fn into_array(self) -> ArrayRef { self } } - macro_rules! imp { ($t:ty, $arr_type:ty) => { impl IntoArrowScalar for $t { @@ -44,7 +43,6 @@ macro_rules! imp { }; } - imp!(bool, BooleanArray); imp!(u8, UInt8Array); imp!(u16, UInt16Array); @@ -58,7 +56,6 @@ imp!(&[u8], BinaryArray); imp!(&str, StringArray); imp!(String, StringArray); - impl IntoArrowArray for &[String] { fn into_array(self) -> ArrayRef { let arr = StringArray::from_iter(self.iter().map(Some)); @@ -66,7 +63,6 @@ impl IntoArrowArray for &[String] { } } - macro_rules! impl_into_ref { ($($t:ty),*) => { $( @@ -79,7 +75,6 @@ macro_rules! impl_into_ref { }; } - impl_into_ref!( UInt8Array, UInt16Array, @@ -88,4 +83,4 @@ impl_into_ref!( BooleanArray, StringArray, FixedSizeBinaryArray -); \ No newline at end of file +); diff --git a/crates/query/src/scan/chunk.rs b/crates/query/src/scan/chunk.rs index 7cd4e482..cf70e4ed 100644 --- a/crates/query/src/scan/chunk.rs +++ b/crates/query/src/scan/chunk.rs @@ -1,6 +1,4 @@ -use crate::primitives::Name; -use crate::scan::scan::Scan; - +use crate::{primitives::Name, scan::scan::Scan}; pub trait Chunk: Send + Sync { fn scan_table(&self, name: Name) -> anyhow::Result>; diff --git a/crates/query/src/scan/errors.rs b/crates/query/src/scan/errors.rs index f45ee7da..62f46f71 100644 --- a/crates/query/src/scan/errors.rs +++ b/crates/query/src/scan/errors.rs @@ -2,7 +2,7 @@ use crate::primitives::Name; #[derive(Debug)] pub struct TableDoesNotExist { - pub table_name: Name, + pub table_name: Name } impl TableDoesNotExist { @@ -22,25 +22,21 @@ impl std::error::Error for TableDoesNotExist {} #[derive(Debug)] pub struct ColumnDoesNotExist { pub column_name: Name, - pub table_name: String, + pub table_name: String } impl ColumnDoesNotExist { pub fn new(table_name: String, column_name: Name) -> Self { Self { table_name, - column_name, + column_name } } } impl std::fmt::Display for ColumnDoesNotExist { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "column '{}' is not found in '{}'", - self.column_name, self.table_name - ) + write!(f, "column '{}' is not found in '{}'", self.column_name, self.table_name) } } diff --git a/crates/query/src/scan/mod.rs b/crates/query/src/scan/mod.rs index 0f33b477..aa90e2c5 100644 --- a/crates/query/src/scan/mod.rs +++ b/crates/query/src/scan/mod.rs @@ -1,6 +1,7 @@ pub mod array_predicate; mod arrow; mod chunk; +mod errors; #[cfg(feature = "parquet")] pub mod parquet; mod reader; @@ -9,10 +10,8 @@ mod row_predicate_dsl; pub(crate) mod scan; #[cfg(feature = "storage")] mod storage; -mod errors; mod util; - pub use arrow::*; pub use chunk::*; pub use errors::*; diff --git a/crates/query/src/scan/parquet/chunk.rs b/crates/query/src/scan/parquet/chunk.rs index fe00e044..75a5dd66 100644 --- a/crates/query/src/scan/parquet/chunk.rs +++ b/crates/query/src/scan/parquet/chunk.rs @@ -1,18 +1,18 @@ -use crate::scan::chunk::Chunk; -use crate::scan::parquet::file::ParquetFile; -use crate::scan::scan::Scan; -use crate::TableDoesNotExist; +use std::sync::Arc; + use anyhow::anyhow; use sqd_primitives::Name; -use std::sync::Arc; +use crate::{ + scan::{chunk::Chunk, parquet::file::ParquetFile, scan::Scan}, + TableDoesNotExist +}; pub struct ParquetChunk { path: String, tables: dashmap::DashMap> } - impl ParquetChunk { pub fn new>(path: P) -> Self { Self { @@ -22,25 +22,24 @@ impl ParquetChunk { } } - impl Chunk for ParquetChunk { fn scan_table(&self, name: Name) -> anyhow::Result> { let entry = self.tables.entry(name); - let file = entry.or_try_insert_with(|| { - let file_path = format!("{}/{}.parquet", self.path, name); - ParquetFile::open(file_path) - .map_err(|err| { - if let Some(err) = err.downcast_ref::() { - if err.kind() == std::io::ErrorKind::NotFound { - return anyhow!(TableDoesNotExist::new(name)) + let file = entry + .or_try_insert_with(|| { + let file_path = format!("{}/{}.parquet", self.path, name); + ParquetFile::open(file_path) + .map_err(|err| { + if let Some(err) = err.downcast_ref::() { + if err.kind() == std::io::ErrorKind::NotFound { + return anyhow!(TableDoesNotExist::new(name)); + } } - } - err - }) - .map(Arc::new) - }).map(|r| { - r.value().clone() - })?; + err + }) + .map(Arc::new) + }) + .map(|r| r.value().clone())?; Ok(Scan::new(file)) } -} \ No newline at end of file +} diff --git a/crates/query/src/scan/parquet/file.rs b/crates/query/src/scan/parquet/file.rs index 5ba1a937..471c114f 100644 --- a/crates/query/src/scan/parquet/file.rs +++ b/crates/query/src/scan/parquet/file.rs @@ -1,49 +1,55 @@ -use crate::primitives::{Name, RowIndex, RowRangeList}; -use crate::scan::parquet::io::MmapIO; -use crate::scan::parquet::metadata::ParquetMetadata; -use crate::scan::reader::TableReader; -use crate::scan::row_predicate::{RowPredicate, RowPredicateRef}; -use crate::scan::util::{add_row_index, build_row_index_array}; -use crate::ColumnDoesNotExist; -use arrow::array::{new_null_array, RecordBatch}; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use parquet::arrow::arrow_reader::{ - ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, - RowSelector, +use std::{cmp::Ordering, collections::HashSet, ops::Not, path::PathBuf, sync::Arc}; + +use arrow::{ + array::{new_null_array, RecordBatch}, + datatypes::{DataType, Field, Schema, SchemaRef} +}; +use parquet::{ + arrow::{ + arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowSelection, RowSelector + }, + ProjectionMask + }, + file::metadata::RowGroupMetaData }; -use parquet::arrow::ProjectionMask; -use parquet::file::metadata::RowGroupMetaData; use rayon::prelude::*; -use std::cmp::Ordering; -use std::collections::HashSet; -use std::ops::Not; -use std::path::PathBuf; -use std::sync::Arc; + +use crate::{ + primitives::{Name, RowIndex, RowRangeList}, + scan::{ + parquet::{io::MmapIO, metadata::ParquetMetadata}, + reader::TableReader, + row_predicate::{RowPredicate, RowPredicateRef}, + util::{add_row_index, build_row_index_array} + }, + ColumnDoesNotExist +}; #[derive(Clone)] pub struct ParquetFile { io: MmapIO, metadata: Arc, - table_name: String, + table_name: String } impl ParquetFile { pub fn open(file: impl Into) -> anyhow::Result { let path = file.into(); - let table_name = path.file_stem() + let table_name = path + .file_stem() .map(|s| s.to_string_lossy().into_owned()) .unwrap_or_default(); let io = MmapIO::open(&path)?; - let metadata = - ArrowReaderMetadata::load(&io, ArrowReaderOptions::new().with_page_index(true))?; + let metadata = ArrowReaderMetadata::load(&io, ArrowReaderOptions::new().with_page_index(true))?; Ok(Self { io, metadata: Arc::new(ParquetMetadata::new(metadata)), - table_name, + table_name }) } } @@ -83,8 +89,7 @@ impl TableReader for ParquetFile { row_selection: Option<&RowRangeList>, with_row_index: bool, default_null_columns: Option<&HashSet> - ) -> anyhow::Result> - { + ) -> anyhow::Result> { // Stage 1: Row group stats pruning let mut maybe_new_row_selection = None; @@ -116,9 +121,7 @@ impl TableReader for ParquetFile { maybe_row_group_offsets = Some(row_group_offsets); selected_row_groups } else { - (0..parquet_metadata.num_row_groups()) - .map(|idx| (idx, None)) - .collect() + (0..parquet_metadata.num_row_groups()).map(|idx| (idx, None)).collect() }; // Stage 2: Page stats pruning @@ -135,11 +138,7 @@ impl TableReader for ParquetFile { let _ = std::mem::replace(sel_ptr, Some(new_selection)); } } - row_groups.retain(|rg| { - rg.1.as_ref() - .map(|ranges| !ranges.is_empty()) - .unwrap_or(true) - }) + row_groups.retain(|rg| rg.1.as_ref().map(|ranges| !ranges.is_empty()).unwrap_or(true)) } } @@ -160,12 +159,7 @@ impl TableReader for ParquetFile { let mut indices = Vec::with_capacity(columns.len() + predicate_columns.len()); - let fields = self - .metadata - .metadata() - .parquet_schema() - .root_schema() - .get_fields(); + let fields = self.metadata.metadata().parquet_schema().root_schema().get_fields(); for name in columns.iter().chain(predicate_columns.iter()).copied() { match fields.iter().position(|f| f.name() == name) { @@ -183,8 +177,7 @@ impl TableReader for ParquetFile { } } - let projection_mask = - ProjectionMask::roots(parquet_metadata.file_metadata().schema_descr(), indices); + let projection_mask = ProjectionMask::roots(parquet_metadata.file_metadata().schema_descr(), indices); (projection_mask, predicate_columns) } else { @@ -193,10 +186,8 @@ impl TableReader for ParquetFile { tracing::trace!("missing columnds: {missing_null_columns:?}"); - let maybe_row_index_offsets = with_row_index.then(|| { - maybe_row_group_offsets - .unwrap_or_else(|| build_row_group_offsets(parquet_metadata.row_groups())) - }); + let maybe_row_index_offsets = with_row_index + .then(|| maybe_row_group_offsets.unwrap_or_else(|| build_row_group_offsets(parquet_metadata.row_groups()))); // Stage 4: Parallel row group reading let results: Vec<_> = row_groups @@ -209,20 +200,13 @@ impl TableReader for ParquetFile { projection_mask.clone(), predicate.clone().map(|p| (p, &predicate_columns)), maybe_row_selection, - maybe_row_index_offsets - .as_ref() - .map(|offsets| offsets[row_group_idx]), - 1_000_000_000, + maybe_row_index_offsets.as_ref().map(|offsets| offsets[row_group_idx]), + 1_000_000_000 ) }) .collect(); - let mut record_batches = Vec::with_capacity( - results - .iter() - .map(|r| r.as_ref().map_or(0, |bs| bs.len())) - .sum(), - ); + let mut record_batches = Vec::with_capacity(results.iter().map(|r| r.as_ref().map_or(0, |bs| bs.len())).sum()); for r in results { record_batches.extend(r?) @@ -263,7 +247,7 @@ fn read_row_group( maybe_predicate: Option<(RowPredicateRef, &HashSet)>, maybe_row_selection: Option, maybe_row_index_offset: Option, - record_batch_size: usize, + record_batch_size: usize ) -> anyhow::Result> { let mut reader = ParquetRecordBatchReaderBuilder::new_with_metadata(io, metadata.clone()); @@ -320,7 +304,7 @@ fn build_row_group_offsets(row_groups: &[RowGroupMetaData]) -> Vec { fn apply_predicate( mut batch: RecordBatch, predicate: &dyn RowPredicate, - predicate_columns: &HashSet, + predicate_columns: &HashSet ) -> anyhow::Result { let mask = predicate.evaluate(&batch)?; @@ -356,13 +340,13 @@ fn to_parquet_row_selection(ranges: &RowRangeList) -> RowSelection { match range.start.cmp(&last_end) { Ordering::Equal => match selectors.last_mut() { Some(last) => last.row_count = last.row_count.checked_add(len).unwrap(), - None => selectors.push(RowSelector::select(len)), + None => selectors.push(RowSelector::select(len)) }, Ordering::Greater => { selectors.push(RowSelector::skip((range.start - last_end) as usize)); selectors.push(RowSelector::select(len)) } - Ordering::Less => panic!("out of order"), + Ordering::Less => panic!("out of order") } last_end = range.end; } diff --git a/crates/query/src/scan/parquet/io.rs b/crates/query/src/scan/parquet/io.rs index cc423c9a..efa027bd 100644 --- a/crates/query/src/scan/parquet/io.rs +++ b/crates/query/src/scan/parquet/io.rs @@ -1,37 +1,28 @@ -use std::io::Cursor; -use std::sync::Arc; +use std::{io::Cursor, sync::Arc}; use bytes::Bytes; use memmap2::{Mmap, MmapOptions}; use parquet::file::reader::{ChunkReader, Length}; - #[derive(Clone)] pub struct MmapIO { mmap: Arc } - impl MmapIO { pub fn open(filename: impl AsRef) -> std::io::Result { let file = std::fs::File::open(filename)?; - let mmap = unsafe { - MmapOptions::new().map(&file) - }?; - Ok(MmapIO { - mmap: Arc::new(mmap) - }) + let mmap = unsafe { MmapOptions::new().map(&file) }?; + Ok(MmapIO { mmap: Arc::new(mmap) }) } } - impl AsRef<[u8]> for MmapIO { fn as_ref(&self) -> &[u8] { self.mmap.as_ref() } } - impl ChunkReader for MmapIO { type T = Cursor; @@ -48,9 +39,8 @@ impl ChunkReader for MmapIO { } } - impl Length for MmapIO { fn len(&self) -> u64 { self.mmap.len() as u64 } -} \ No newline at end of file +} diff --git a/crates/query/src/scan/parquet/metadata.rs b/crates/query/src/scan/parquet/metadata.rs index c56cd7f4..76f0bfff 100644 --- a/crates/query/src/scan/parquet/metadata.rs +++ b/crates/query/src/scan/parquet/metadata.rs @@ -1,16 +1,22 @@ -use std::collections::HashMap; -use std::sync::Arc; - -use arrow::array::{Array, ArrayBuilder, ArrayRef, AsArray, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, Int32Array, Int32Builder, Int64Array, Int64Builder, UInt32Array}; -use arrow::buffer::OffsetBuffer; -use arrow::datatypes::{DataType, Int32Type, Int64Type, UInt32Type, UInt64Type}; -use parquet::arrow::arrow_reader::ArrowReaderMetadata; -use parquet::file::page_index::index::Index; -use parquet::file::statistics::Statistics; - -use crate::primitives::Name; -use crate::scan::row_predicate::{ColumnStats, RowStats}; - +use std::{collections::HashMap, sync::Arc}; + +use arrow::{ + array::{ + Array, ArrayBuilder, ArrayRef, AsArray, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, Int32Array, + Int32Builder, Int64Array, Int64Builder, UInt32Array + }, + buffer::OffsetBuffer, + datatypes::{DataType, Int32Type, Int64Type, UInt32Type, UInt64Type} +}; +use parquet::{ + arrow::arrow_reader::ArrowReaderMetadata, + file::{page_index::index::Index, statistics::Statistics} +}; + +use crate::{ + primitives::Name, + scan::row_predicate::{ColumnStats, RowStats} +}; pub struct ParquetMetadata { metadata: ArrowReaderMetadata, @@ -18,7 +24,6 @@ pub struct ParquetMetadata { page_stats: Vec } - impl ParquetMetadata { pub fn new(metadata: ArrowReaderMetadata) -> Self { let num_row_groups = metadata.metadata().num_row_groups(); @@ -44,13 +49,11 @@ impl ParquetMetadata { } } - struct RowGroupStats { metadata: ArrowReaderMetadata, column_stats: parking_lot::Mutex>> } - impl RowGroupStats { pub fn new(metadata: ArrowReaderMetadata) -> Self { let num_columns = metadata.parquet_schema().num_columns(); @@ -61,20 +64,18 @@ impl RowGroupStats { } } - impl RowStats for RowGroupStats { - fn get_column_stats(&self, column: Name) -> anyhow::Result> { + fn get_column_stats(&self, column: Name) -> anyhow::Result> { let mut column_stats = self.column_stats.lock(); - let s = column_stats.entry(column).or_insert_with(|| { - self.build_column_stats(column) - }); + let s = column_stats + .entry(column) + .or_insert_with(|| self.build_column_stats(column)); Ok(s.clone()) } } - impl RowGroupStats { fn build_column_stats(&self, column_name: Name) -> Option { let arrow_column_index = self.metadata.schema().index_of(column_name).ok()?; @@ -94,70 +95,73 @@ impl RowGroupStats { let statistics = rg.column(parquet_col_idx).statistics()?; match statistics { Statistics::Boolean(s) => { - let min_max = boolean.get_or_insert_with(|| ( - BooleanBuilder::with_capacity(num_row_groups), - BooleanBuilder::with_capacity(num_row_groups) - )); + let min_max = boolean.get_or_insert_with(|| { + ( + BooleanBuilder::with_capacity(num_row_groups), + BooleanBuilder::with_capacity(num_row_groups) + ) + }); match (s.min_opt(), s.max_opt()) { (Some(min), Some(max)) => { min_max.0.append_value(*min); min_max.1.append_value(*max); - }, + } (None, None) => { min_max.0.append_null(); min_max.1.append_null(); - }, + } _ => return None } } Statistics::Int32(s) => { - let min_max = int32.get_or_insert_with(|| ( - Int32Builder::with_capacity(num_row_groups), - Int32Builder::with_capacity(num_row_groups) - )); + let min_max = int32.get_or_insert_with(|| { + ( + Int32Builder::with_capacity(num_row_groups), + Int32Builder::with_capacity(num_row_groups) + ) + }); match (s.min_opt(), s.max_opt()) { (Some(min), Some(max)) => { min_max.0.append_value(*min); min_max.1.append_value(*max); - }, + } (None, None) => { min_max.0.append_null(); min_max.1.append_null(); - }, + } _ => return None } } Statistics::Int64(s) => { - let min_max = int64.get_or_insert_with(|| ( - Int64Builder::with_capacity(num_row_groups), - Int64Builder::with_capacity(num_row_groups) - )); + let min_max = int64.get_or_insert_with(|| { + ( + Int64Builder::with_capacity(num_row_groups), + Int64Builder::with_capacity(num_row_groups) + ) + }); match (s.min_opt(), s.max_opt()) { (Some(min), Some(max)) => { min_max.0.append_value(*min); min_max.1.append_value(*max); - }, + } (None, None) => { min_max.0.append_null(); min_max.1.append_null(); - }, + } _ => return None } } Statistics::ByteArray(s) => { - let min_max = binary.get_or_insert_with(|| ( - BinaryBuilder::new(), - BinaryBuilder::new() - )); + let min_max = binary.get_or_insert_with(|| (BinaryBuilder::new(), BinaryBuilder::new())); match (s.min_opt(), s.max_opt()) { (Some(min), Some(max)) => { min_max.0.append_value(min); min_max.1.append_value(max); - }, + } (None, None) => { min_max.0.append_null(); min_max.1.append_null(); - }, + } _ => return None } } @@ -174,7 +178,7 @@ impl RowGroupStats { if min_max.0.len() == $num_row_groups { Some(( Arc::new(min_max.0.finish()) as ArrayRef, - Arc::new(min_max.1.finish()) as ArrayRef, + Arc::new(min_max.1.finish()) as ArrayRef )) } else { None @@ -183,51 +187,46 @@ impl RowGroupStats { }; } - None.or_else(|| { - complete_min_max!(binary, num_row_groups) - }).or_else(|| { - complete_min_max!(int32, num_row_groups) - }).or_else(|| { - complete_min_max!(int64, num_row_groups) - }).or_else(|| { - complete_min_max!(boolean, num_row_groups) - }).and_then(|min_max| { - let data_type = self.metadata.schema().field(arrow_column_index).data_type(); - let min = cast_stat_array(min_max.0, data_type)?; - let max = cast_stat_array(min_max.1, data_type)?; - Some(ColumnStats { - offsets: OffsetBuffer::new(offsets.finish().into_parts().1), - min, - max + None.or_else(|| complete_min_max!(binary, num_row_groups)) + .or_else(|| complete_min_max!(int32, num_row_groups)) + .or_else(|| complete_min_max!(int64, num_row_groups)) + .or_else(|| complete_min_max!(boolean, num_row_groups)) + .and_then(|min_max| { + let data_type = self.metadata.schema().field(arrow_column_index).data_type(); + let min = cast_stat_array(min_max.0, data_type)?; + let max = cast_stat_array(min_max.1, data_type)?; + Some(ColumnStats { + offsets: OffsetBuffer::new(offsets.finish().into_parts().1), + min, + max + }) }) - }) } } - fn cast_stat_array(array: ArrayRef, target_type: &DataType) -> Option { if array.data_type() == target_type { - return Some(array) + return Some(array); } match (array.data_type(), target_type) { - (DataType::Int32, DataType::UInt32) => Some(Arc::new( - arrow::compute::unary::<_, _, UInt32Type>(array.as_primitive::(), |x| x as u32)) - ), - (DataType::Int64, DataType::UInt64) => Some(Arc::new( - arrow::compute::unary::<_, _, UInt64Type>(array.as_primitive::(), |x| x as u64)) - ), + (DataType::Int32, DataType::UInt32) => Some(Arc::new(arrow::compute::unary::<_, _, UInt32Type>( + array.as_primitive::(), + |x| x as u32 + ))), + (DataType::Int64, DataType::UInt64) => Some(Arc::new(arrow::compute::unary::<_, _, UInt64Type>( + array.as_primitive::(), + |x| x as u64 + ))), _ => arrow::compute::cast(&array, target_type).ok() } } - struct PageStats { metadata: ArrowReaderMetadata, row_group_idx: usize, column_stats: parking_lot::Mutex>> } - impl PageStats { pub fn new(metadata: ArrowReaderMetadata, row_group_idx: usize) -> Self { let num_columns = metadata.parquet_schema().num_columns(); @@ -239,89 +238,70 @@ impl PageStats { } } - impl RowStats for PageStats { - fn get_column_stats(&self, column: Name) -> anyhow::Result> { + fn get_column_stats(&self, column: Name) -> anyhow::Result> { let mut column_stats = self.column_stats.lock(); - let s = column_stats.entry(column).or_insert_with(|| { - self.build_column_stats(column) - }); + let s = column_stats + .entry(column) + .or_insert_with(|| self.build_column_stats(column)); Ok(s.clone()) } } - impl PageStats { fn build_column_stats(&self, column_name: Name) -> Option { let arrow_col_idx = self.metadata.schema().index_of(column_name).ok()?; let parquet_col_idx = find_primitive_column(&self.metadata, column_name)?; - let offsets = self.metadata - .metadata() - .offset_index() - .map(|offset_index| { - let pages = &offset_index[self.row_group_idx][parquet_col_idx].page_locations(); - let mut offsets = UInt32Array::builder(pages.len() + 1); + let offsets = self.metadata.metadata().offset_index().map(|offset_index| { + let pages = &offset_index[self.row_group_idx][parquet_col_idx].page_locations(); + let mut offsets = UInt32Array::builder(pages.len() + 1); - for page in pages.iter() { - offsets.append_value(page.first_row_index as u32) - } + for page in pages.iter() { + offsets.append_value(page.first_row_index as u32) + } - let num_rows = self.metadata.metadata().row_group(self.row_group_idx).num_rows(); - offsets.append_value(num_rows as u32); - OffsetBuffer::new(offsets.finish().into_parts().1) - })?; + let num_rows = self.metadata.metadata().row_group(self.row_group_idx).num_rows(); + offsets.append_value(num_rows as u32); + OffsetBuffer::new(offsets.finish().into_parts().1) + })?; - let page_index = self.metadata + let page_index = self + .metadata .metadata() .column_index() - .map(|ci| { - &ci[self.row_group_idx][parquet_col_idx] - })?; + .map(|ci| &ci[self.row_group_idx][parquet_col_idx])?; let (min, max): (ArrayRef, ArrayRef) = match page_index { Index::NONE => return None, - Index::BYTE_ARRAY(s) => { - ( - Arc::new(BinaryArray::from_iter(s.indexes.iter().map(|p| p.min.clone()))), - Arc::new(BinaryArray::from_iter(s.indexes.iter().map(|p| p.max.clone()))) - ) - }, - Index::INT32(s) => { - ( - Arc::new(Int32Array::from_iter(s.indexes.iter().map(|p| p.min))), - Arc::new(Int32Array::from_iter(s.indexes.iter().map(|p| p.max))) - ) - }, - Index::INT64(s) => { - ( - Arc::new(Int64Array::from_iter(s.indexes.iter().map(|p| p.min))), - Arc::new(Int64Array::from_iter(s.indexes.iter().map(|p| p.max))) - ) - }, - Index::BOOLEAN(s) => { - ( - Arc::new(BooleanArray::from_iter(s.indexes.iter().map(|p| p.min))), - Arc::new(BooleanArray::from_iter(s.indexes.iter().map(|p| p.max))) - ) - }, + Index::BYTE_ARRAY(s) => ( + Arc::new(BinaryArray::from_iter(s.indexes.iter().map(|p| p.min.clone()))), + Arc::new(BinaryArray::from_iter(s.indexes.iter().map(|p| p.max.clone()))) + ), + Index::INT32(s) => ( + Arc::new(Int32Array::from_iter(s.indexes.iter().map(|p| p.min))), + Arc::new(Int32Array::from_iter(s.indexes.iter().map(|p| p.max))) + ), + Index::INT64(s) => ( + Arc::new(Int64Array::from_iter(s.indexes.iter().map(|p| p.min))), + Arc::new(Int64Array::from_iter(s.indexes.iter().map(|p| p.max))) + ), + Index::BOOLEAN(s) => ( + Arc::new(BooleanArray::from_iter(s.indexes.iter().map(|p| p.min))), + Arc::new(BooleanArray::from_iter(s.indexes.iter().map(|p| p.max))) + ), _ => return None }; let data_type = self.metadata.schema().field(arrow_col_idx).data_type(); let min = cast_stat_array(min, data_type)?; let max = cast_stat_array(max, data_type)?; - Some(ColumnStats { - offsets, - min, - max - }) + Some(ColumnStats { offsets, min, max }) } } - fn find_primitive_column(metadata: &ArrowReaderMetadata, name: Name) -> Option { for (idx, col) in metadata.parquet_schema().columns().iter().enumerate() { if col.name() == name && col.self_type().is_primitive() { @@ -329,4 +309,4 @@ fn find_primitive_column(metadata: &ArrowReaderMetadata, name: Name) -> Option; - pub trait RowPredicate: Sync + Send { fn projection(&self) -> &[Name]; @@ -25,12 +30,10 @@ pub trait RowPredicate: Sync + Send { } } - pub trait RowStats { fn get_column_stats(&self, column: Name) -> anyhow::Result>; } - #[derive(Clone)] pub struct ColumnStats { pub offsets: OffsetBuffer, @@ -38,13 +41,11 @@ pub struct ColumnStats { pub max: ArrayRef } - pub struct ColumnPredicate { column: [Name; 1], array_predicate: ArrayPredicateRef } - impl ColumnPredicate { pub fn new(column_name: Name, array_predicate: ArrayPredicateRef) -> Self { Self { @@ -54,7 +55,6 @@ impl ColumnPredicate { } } - impl RowPredicate for ColumnPredicate { fn projection(&self) -> &[Name] { &self.column @@ -73,43 +73,38 @@ impl RowPredicate for ColumnPredicate { } fn evaluate_stats(&self, row_stats: &dyn RowStats) -> anyhow::Result> { - row_stats.get_column_stats(self.column[0])?.map(|column_stats| { - let mask = self.array_predicate.evaluate_stats(&ArrayStats { - min: column_stats.min.clone(), - max: column_stats.max.clone() - })?; + row_stats + .get_column_stats(self.column[0])? + .map(|column_stats| { + let mask = self.array_predicate.evaluate_stats(&ArrayStats { + min: column_stats.min.clone(), + max: column_stats.max.clone() + })?; - let offsets = &column_stats.offsets; + let offsets = &column_stats.offsets; - let ranges = (0..offsets.len() - 1) - .filter(|&i| { - mask.value(i) && !mask.is_null(i) - }) - .map(|i| offsets[i]..offsets[i + 1]); + let ranges = (0..offsets.len() - 1) + .filter(|&i| mask.value(i) && !mask.is_null(i)) + .map(|i| offsets[i]..offsets[i + 1]); - Ok(RowRangeList::seal(ranges)) - }).transpose() + Ok(RowRangeList::seal(ranges)) + }) + .transpose() } } - pub struct AndPredicate { predicates: Vec, projection: Vec } - impl AndPredicate { pub fn new(predicates: Vec) -> Self { let projection = predicates_projection(&predicates); - Self { - predicates, - projection - } + Self { predicates, projection } } } - impl RowPredicate for AndPredicate { fn projection(&self) -> &[Name] { &self.projection @@ -117,7 +112,7 @@ impl RowPredicate for AndPredicate { fn evaluate(&self, batch: &RecordBatch) -> anyhow::Result { if self.predicates.len() == 0 { - return Ok(array_predicate::zero_mask(batch.num_rows(), true)) + return Ok(array_predicate::zero_mask(batch.num_rows(), true)); } let mut result_mask = self.predicates[0].evaluate(batch)?; for i in 1..self.predicates.len() { @@ -148,24 +143,18 @@ impl RowPredicate for AndPredicate { } } - pub struct OrPredicate { predicates: Vec, projection: Vec } - impl OrPredicate { pub fn new(predicates: Vec) -> Self { let projection = predicates_projection(&predicates); - Self { - predicates, - projection - } + Self { predicates, projection } } } - impl RowPredicate for OrPredicate { fn projection(&self) -> &[Name] { &self.projection @@ -173,7 +162,7 @@ impl RowPredicate for OrPredicate { fn evaluate(&self, batch: &RecordBatch) -> anyhow::Result { if self.predicates.len() == 0 { - return Ok(array_predicate::zero_mask(batch.num_rows(), false)) + return Ok(array_predicate::zero_mask(batch.num_rows(), false)); } let mut result_mask = self.predicates[0].evaluate(batch)?; for i in 1..self.predicates.len() { @@ -198,17 +187,16 @@ impl RowPredicate for OrPredicate { sel }) } else { - return Ok(None) + return Ok(None); } } else { - return Ok(None) + return Ok(None); } } Ok(selection) } } - fn predicates_projection(predicates: &[RowPredicateRef]) -> Vec { let n_columns = predicates.iter().map(|p| p.projection().len()).sum(); let mut projected_set: HashSet = HashSet::with_capacity(n_columns); @@ -220,4 +208,4 @@ fn predicates_projection(predicates: &[RowPredicateRef]) -> Vec { } } projection -} \ No newline at end of file +} diff --git a/crates/query/src/scan/row_predicate_dsl.rs b/crates/query/src/scan/row_predicate_dsl.rs index e668d556..8c7809fc 100644 --- a/crates/query/src/scan/row_predicate_dsl.rs +++ b/crates/query/src/scan/row_predicate_dsl.rs @@ -1,30 +1,28 @@ -use crate::primitives::Name; -use crate::scan::array_predicate::ArrayPredicateRef; -use crate::scan::arrow::IntoArrowScalar; -use crate::scan::row_predicate::{AndPredicate, ColumnPredicate, OrPredicate, RowPredicateRef}; -use crate::scan::{array_predicate, IntoArrowArray}; +use std::{hash::Hash, sync::Arc}; + use arrow::array::{Array, Scalar}; -use std::hash::Hash; -use std::sync::Arc; +use crate::{ + primitives::Name, + scan::{ + array_predicate, + array_predicate::ArrayPredicateRef, + arrow::IntoArrowScalar, + row_predicate::{AndPredicate, ColumnPredicate, OrPredicate, RowPredicateRef}, + IntoArrowArray + } +}; macro_rules! make_column_predicate { ($col:expr, $arr_predicate:expr) => { - Arc::new( - ColumnPredicate::new( - $col, - Arc::new($arr_predicate) - ) - ) + Arc::new(ColumnPredicate::new($col, Arc::new($arr_predicate))) }; } - pub fn col_eq(name: Name, value: T) -> RowPredicateRef { make_column_predicate!(name, array_predicate::Eq::new(value)) } - pub fn col_in_list(name: Name, values: L) -> RowPredicateRef { let values = values.into_array(); match values.len() { @@ -33,97 +31,79 @@ pub fn col_in_list(name: Name, values: L) -> RowPredicateRef make_column_predicate!( name, array_predicate::Or::new( - (0..values.len()).map(|i| { - let val = Scalar::new(values.slice(i, 1)); - Arc::new(array_predicate::Eq::new(val)) as ArrayPredicateRef - }).collect() + (0..values.len()) + .map(|i| { + let val = Scalar::new(values.slice(i, 1)); + Arc::new(array_predicate::Eq::new(val)) as ArrayPredicateRef + }) + .collect() ) ) - }, + } _ => { - make_column_predicate!( - name, - array_predicate::InList::new(values) - ) + make_column_predicate!(name, array_predicate::InList::new(values)) } } } - /// column <= value pub fn col_lt_eq(name: Name, value: T) -> RowPredicateRef { make_column_predicate!(name, array_predicate::GtEq::new(value)) } - /// column >= value pub fn col_gt_eq(name: Name, value: T) -> RowPredicateRef { make_column_predicate!(name, array_predicate::LtEq::new(value)) } - /// low <= column <= high pub fn col_between(name: Name, low: T, high: T) -> RowPredicateRef { - make_column_predicate!(name, array_predicate::And::new(vec![ - Arc::new(array_predicate::LtEq::new(low)), - Arc::new(array_predicate::GtEq::new(high)) - ])) + make_column_predicate!( + name, + array_predicate::And::new(vec![ + Arc::new(array_predicate::LtEq::new(low)), + Arc::new(array_predicate::GtEq::new(high)) + ]) + ) } - -pub fn bloom_filter( - name: Name, - bytes_size: usize, - num_hashes: usize, - values: L -) -> RowPredicateRef -where +pub fn bloom_filter(name: Name, bytes_size: usize, num_hashes: usize, values: L) -> RowPredicateRef +where L: IntoIterator { let array_pred = array_predicate::or( - values.into_iter().map(|val| { - Arc::new(array_predicate::BloomFilter::new( - bytes_size, - num_hashes, - val - )) as ArrayPredicateRef - }).collect() + values + .into_iter() + .map(|val| Arc::new(array_predicate::BloomFilter::new(bytes_size, num_hashes, val)) as ArrayPredicateRef) + .collect() ); Arc::new(ColumnPredicate::new(name, array_pred)) } - pub fn and(predicates: Vec) -> RowPredicateRef { if predicates.len() == 1 { predicates.into_iter().next().unwrap() } else { - Arc::new( - AndPredicate::new(predicates) - ) + Arc::new(AndPredicate::new(predicates)) } } - pub fn or(predicates: Vec) -> RowPredicateRef { if predicates.len() == 1 { predicates.into_iter().next().unwrap() } else { - Arc::new( - OrPredicate::new(predicates) - ) + Arc::new(OrPredicate::new(predicates)) } } - pub fn col_primitive_list_contains_any(name: Name, values: &[T::Native]) -> RowPredicateRef where T: arrow::datatypes::ArrowPrimitiveType, - T::Native: Eq + std::hash::Hash, + T::Native: Eq + std::hash::Hash { make_column_predicate!(name, array_predicate::PrimitiveListContainsAny::::new(values)) } - pub fn col_string_list_contains_any>(name: Name, values: &[S]) -> RowPredicateRef { make_column_predicate!(name, array_predicate::StringListContainsAny::new(values)) } diff --git a/crates/query/src/scan/scan.rs b/crates/query/src/scan/scan.rs index e2140f61..9ad48000 100644 --- a/crates/query/src/scan/scan.rs +++ b/crates/query/src/scan/scan.rs @@ -1,12 +1,12 @@ -use crate::primitives::{Name, RowRangeList}; -use crate::scan::reader::TableReader; -use crate::scan::RowPredicateRef; -use arrow::array::RecordBatch; -use arrow::datatypes::SchemaRef; +use std::{collections::HashSet, sync::Arc}; + +use arrow::{array::RecordBatch, datatypes::SchemaRef}; use sqd_polars::arrow::record_batch_vec_to_lazy_polars_df; -use std::collections::HashSet; -use std::sync::Arc; +use crate::{ + primitives::{Name, RowRangeList}, + scan::{reader::TableReader, RowPredicateRef} +}; pub struct Scan<'a> { reader: Arc, @@ -17,8 +17,7 @@ pub struct Scan<'a> { default_null_columns: Option> } - -impl <'a> Scan<'a> { +impl<'a> Scan<'a> { pub fn new(reader: Arc) -> Self { Self { reader, @@ -49,7 +48,8 @@ impl <'a> Scan<'a> { } pub fn with_columns(mut self, columns: I) -> Self - where I: IntoIterator + where + I: IntoIterator { if let Some(projection) = self.projection.as_mut() { projection.extend(columns); diff --git a/crates/query/src/scan/storage/chunk.rs b/crates/query/src/scan/storage/chunk.rs index 65c76d73..b11d60ce 100644 --- a/crates/query/src/scan/storage/chunk.rs +++ b/crates/query/src/scan/storage/chunk.rs @@ -1,15 +1,14 @@ -use crate::scan::scan::Scan; -use crate::scan::{Chunk, TableDoesNotExist}; use anyhow::ensure; use sqd_primitives::Name; use sqd_storage::db::ChunkReader; +use crate::scan::{scan::Scan, Chunk, TableDoesNotExist}; -impl <'a> Chunk for ChunkReader<'a> { +impl<'a> Chunk for ChunkReader<'a> { fn scan_table(&self, name: Name) -> anyhow::Result> { ensure!(self.tables().contains_key(name), TableDoesNotExist::new(name)); let table_reader = self.get_table_reader(name)?; let scan = Scan::new(table_reader); Ok(scan) } -} \ No newline at end of file +} diff --git a/crates/query/src/scan/storage/mod.rs b/crates/query/src/scan/storage/mod.rs index 4b553d75..83ffaa0e 100644 --- a/crates/query/src/scan/storage/mod.rs +++ b/crates/query/src/scan/storage/mod.rs @@ -1,2 +1,2 @@ +mod chunk; mod reader; -mod chunk; \ No newline at end of file diff --git a/crates/query/src/scan/storage/reader.rs b/crates/query/src/scan/storage/reader.rs index e06b9544..6d62c482 100644 --- a/crates/query/src/scan/storage/reader.rs +++ b/crates/query/src/scan/storage/reader.rs @@ -1,15 +1,19 @@ -use crate::primitives::{Name, RowRangeList}; -use crate::scan::reader::TableReader; -use crate::scan::row_predicate::{ColumnStats, RowStats}; -use crate::scan::util::{add_row_index, build_row_index_array}; -use crate::scan::RowPredicateRef; -use arrow::array::RecordBatch; -use arrow::datatypes::SchemaRef; -use sqd_storage::db::SnapshotTableReader; use std::collections::HashSet; +use arrow::{array::RecordBatch, datatypes::SchemaRef}; +use sqd_storage::db::SnapshotTableReader; -impl <'a> TableReader for SnapshotTableReader<'a> { +use crate::{ + primitives::{Name, RowRangeList}, + scan::{ + reader::TableReader, + row_predicate::{ColumnStats, RowStats}, + util::{add_row_index, build_row_index_array}, + RowPredicateRef + } +}; + +impl<'a> TableReader for SnapshotTableReader<'a> { fn read( &self, predicate: Option, @@ -17,8 +21,7 @@ impl <'a> TableReader for SnapshotTableReader<'a> { row_selection: Option<&RowRangeList>, with_row_index: bool, _default_null_columns: Option<&HashSet> - ) -> anyhow::Result> - { + ) -> anyhow::Result> { let mut maybe_new_row_selection = None; let mut maybe_new_projection = None; @@ -33,9 +36,10 @@ impl <'a> TableReader for SnapshotTableReader<'a> { } }); } - + if let Some(columns) = projection { - let new_columns = predicate.projection() + let new_columns = predicate + .projection() .iter() .filter(|col| !columns.contains(*col)) .count(); @@ -51,17 +55,10 @@ impl <'a> TableReader for SnapshotTableReader<'a> { let row_selection = maybe_new_row_selection.as_ref().or(row_selection); - let mut record_batch = self.read_table( - maybe_new_projection.as_ref().or(projection), - row_selection - )?; + let mut record_batch = self.read_table(maybe_new_projection.as_ref().or(projection), row_selection)?; if with_row_index { - let row_index = build_row_index_array( - 0, - record_batch.num_rows(), - row_selection - ); + let row_index = build_row_index_array(0, record_batch.num_rows(), row_selection); record_batch = add_row_index(&record_batch, row_index) } @@ -71,7 +68,8 @@ impl <'a> TableReader for SnapshotTableReader<'a> { if maybe_new_projection.is_some() { let projected_columns = projection.unwrap(); - let indexes: Vec = record_batch.schema() + let indexes: Vec = record_batch + .schema() .fields() .iter() .enumerate() @@ -81,11 +79,12 @@ impl <'a> TableReader for SnapshotTableReader<'a> { } else { None } - }).collect(); + }) + .collect(); record_batch = record_batch.project(&indexes)?; } - + record_batch = arrow::compute::filter_record_batch(&record_batch, &mask)?; } @@ -97,17 +96,14 @@ impl <'a> TableReader for SnapshotTableReader<'a> { } } - -impl <'a> RowStats for SnapshotTableReader<'a> { +impl<'a> RowStats for SnapshotTableReader<'a> { fn get_column_stats(&self, column: Name) -> anyhow::Result> { let index = self.schema().index_of(column)?; let stats = self.get_column_stats(index)?; - Ok(stats.map(|stats| { - ColumnStats { - offsets: stats.offsets, - min: stats.min, - max: stats.max - } + Ok(stats.map(|stats| ColumnStats { + offsets: stats.offsets, + min: stats.min, + max: stats.max })) } -} \ No newline at end of file +} diff --git a/crates/query/src/scan/util.rs b/crates/query/src/scan/util.rs index f1710a55..5863fa00 100644 --- a/crates/query/src/scan/util.rs +++ b/crates/query/src/scan/util.rs @@ -1,8 +1,11 @@ -use crate::primitives::{RowIndex, RowIndexArrowType, RowRangeList}; -use arrow::array::{ArrayRef, PrimitiveArray, RecordBatch, RecordBatchOptions, UInt32Array}; -use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef}; use std::sync::Arc; +use arrow::{ + array::{ArrayRef, PrimitiveArray, RecordBatch, RecordBatchOptions, UInt32Array}, + datatypes::{DataType, Field, SchemaBuilder, SchemaRef} +}; + +use crate::primitives::{RowIndex, RowIndexArrowType, RowRangeList}; pub fn build_row_index_array( offset: RowIndex, @@ -10,9 +13,7 @@ pub fn build_row_index_array( maybe_row_selection: Option<&RowRangeList> ) -> PrimitiveArray { if let Some(row_ranges) = maybe_row_selection { - let num_rows = row_ranges.iter() - .map(|r| r.end - r.start) - .sum::() as usize; + let num_rows = row_ranges.iter().map(|r| r.end - r.start).sum::() as usize; assert!(num_rows <= len); let mut array = UInt32Array::builder(num_rows); @@ -29,7 +30,6 @@ pub fn build_row_index_array( } } - pub fn add_row_index(batch: &RecordBatch, index: PrimitiveArray) -> RecordBatch { let mut schema_builder = SchemaBuilder::from(batch.schema().as_ref()); schema_builder.reverse(); @@ -47,5 +47,6 @@ pub fn add_row_index(batch: &RecordBatch, index: PrimitiveArray) -> anyhow::Result> { - let query = Query::from_json_bytes( - &std::fs::read(query_file)? - )?; + let query = Query::from_json_bytes(&std::fs::read(query_file)?)?; let data = Vec::with_capacity(4 * 1024 * 1024); let mut writer = JsonArrayWriter::new(data); if let Some(mut blocks) = query.compile().execute(chunk)? { @@ -15,14 +15,13 @@ fn execute_query(chunk: &dyn Chunk, query_file: impl AsRef) -> anyhow::Res Ok(writer.finish()?) } - -fn test_fixture(chunk: &dyn Chunk, query_file: PathBuf) { +fn test_fixture(chunk: &dyn Chunk, query_file: PathBuf) { let case_dir = query_file.parent().unwrap(); let result_file = case_dir.join("result.json"); let actual: serde_json::Value = match execute_query(chunk, &query_file) { Ok(bytes) => serde_json::from_slice(&bytes).unwrap(), - Err(err) => serde_json::Value::String(err.to_string()), + Err(err) => serde_json::Value::String(err.to_string()) }; let expected: serde_json::Value = match std::fs::read(&result_file) { @@ -31,7 +30,8 @@ fn test_fixture(chunk: &dyn Chunk, query_file: PathBuf) { serde_json::to_writer_pretty( std::fs::File::create(case_dir.join("actual.temp.json")).unwrap(), &actual - ).unwrap(); + ) + .unwrap(); return; } Err(err) => panic!("{:?}", err) @@ -41,23 +41,21 @@ fn test_fixture(chunk: &dyn Chunk, query_file: PathBuf) { serde_json::to_writer_pretty( std::fs::File::create(case_dir.join("actual.temp.json")).unwrap(), &actual - ).unwrap(); + ) + .unwrap(); panic!("actual != expected") } } - #[cfg(feature = "parquet")] mod parquet { use std::path::PathBuf; use rstest::rstest; - use sqd_query::ParquetChunk; use crate::test_fixture; - #[rstest] fn query(#[files("fixtures/*/queries/*/query.json")] query_file: PathBuf) { let case_dir = query_file.parent().unwrap(); @@ -67,23 +65,24 @@ mod parquet { } } - #[cfg(feature = "storage")] mod storage { - use crate::test_fixture; - use arrow::array::RecordBatchReader; - use arrow::datatypes::Schema; + use std::{collections::BTreeMap, fs::File}; + + use arrow::{array::RecordBatchReader, datatypes::Schema}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use sqd_data::solana::tables::SolanaChunkBuilder; use sqd_dataset::DatasetDescription; use sqd_storage::db::{Chunk, Database, DatabaseSettings, DatasetId, DatasetKind}; - use std::collections::BTreeMap; - use std::fs::File; + use crate::test_fixture; fn get_columns_with_stats(d: &DatasetDescription, name: &str, schema: &Schema) -> Vec { if let Some(table_desc) = d.tables.get(name) { - table_desc.options.column_options.iter() + table_desc + .options + .column_options + .iter() .filter_map(|(&name, opts)| { if opts.stats_enable { schema.index_of(name).ok() @@ -103,8 +102,7 @@ mod storage { kind: &str, desc: &DatasetDescription, chunk_path: &str - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { let dataset_id = DatasetId::from_str(name); let dataset_kind = DatasetKind::from_str(kind); @@ -117,19 +115,14 @@ mod storage { let item_name = item.to_str().unwrap(); if let Some(table) = item_name.strip_suffix(".parquet") { - let mut reader = ParquetRecordBatchReaderBuilder::try_new( - File::open(format!("{}/{}", chunk_path, item_name))? - )?.with_batch_size(500).build()?; + let mut reader = + ParquetRecordBatchReaderBuilder::try_new(File::open(format!("{}/{}", chunk_path, item_name))?)? + .with_batch_size(500) + .build()?; let mut builder = db.new_table_builder(reader.schema()); - - builder.set_stats( - get_columns_with_stats( - desc, - table, - &reader.schema() - ) - )?; + + builder.set_stats(get_columns_with_stats(desc, table, &reader.schema()))?; while let Some(record_batch) = reader.next().transpose()? { builder.write_record_batch(&record_batch)?; @@ -139,13 +132,16 @@ mod storage { } } - db.insert_chunk(dataset_id, &Chunk::V0 { - first_block: 0, - last_block: 0, - last_block_hash: "hello".to_string(), - parent_block_hash: "".to_string(), - tables - })?; + db.insert_chunk( + dataset_id, + &Chunk::V0 { + first_block: 0, + last_block: 0, + last_block_hash: "hello".to_string(), + parent_block_hash: "".to_string(), + tables + } + )?; Ok(()) } @@ -169,11 +165,10 @@ mod storage { .list_chunks(DatasetId::from_str("solana"), 0, None) .next() .expect("chunk must be present")?; - + let chunk_reader = snapshot.create_chunk_reader(chunk); - let queries = glob::glob("fixtures/solana/queries/*/query.json")? - .collect::, _>>()?; + let queries = glob::glob("fixtures/solana/queries/*/query.json")?.collect::, _>>()?; assert!(queries.len() > 0, "no solana queries found"); diff --git a/crates/storage/src/db/data.rs b/crates/storage/src/db/data.rs index 39f72a87..a4fda754 100644 --- a/crates/storage/src/db/data.rs +++ b/crates/storage/src/db/data.rs @@ -1,20 +1,19 @@ -use crate::db::table_id::TableId; +use std::{ + collections::BTreeMap, + fmt::{Debug, Display, Formatter} +}; + use borsh::{BorshDeserialize, BorshSerialize}; -use sqd_primitives::sid::SID; -use sqd_primitives::{BlockNumber, BlockRef}; -use std::collections::BTreeMap; -use std::fmt::{Debug, Display, Formatter}; +use sqd_primitives::{sid::SID, BlockNumber, BlockRef}; +use crate::db::table_id::TableId; pub type DatasetId = SID<48>; - pub type DatasetVersion = u64; - pub type DatasetKind = SID<16>; - #[derive(Debug, Clone, Eq, PartialEq, BorshSerialize, BorshDeserialize)] pub enum DatasetLabel { V0 { @@ -24,20 +23,19 @@ pub enum DatasetLabel { } } - impl DatasetLabel { pub fn kind(&self) -> DatasetKind { - match self { - DatasetLabel::V0 { kind, .. } => *kind + match self { + DatasetLabel::V0 { kind, .. } => *kind } } - + pub fn version(&self) -> DatasetVersion { match self { DatasetLabel::V0 { version, .. } => *version } } - + pub fn bump_version(&mut self) { match self { DatasetLabel::V0 { version, .. } => *version += 1 @@ -57,28 +55,23 @@ impl DatasetLabel { } } - #[derive(Clone, Eq, PartialEq, Debug)] pub struct Dataset { pub id: DatasetId, pub label: DatasetLabel } - #[derive(Copy, Clone, Hash, Ord, PartialOrd, Eq, PartialEq, BorshSerialize, BorshDeserialize)] pub struct ChunkId { bytes: [u8; 56] } - impl ChunkId { pub fn new(dataset_id: DatasetId, last_block: BlockNumber) -> Self { let mut bytes = [0; 56]; bytes[..48].copy_from_slice(dataset_id.as_ref()); bytes[48..].copy_from_slice(&last_block.to_be_bytes()); - Self { - bytes - } + Self { bytes } } pub fn new_for_chunk(dataset_id: DatasetId, chunk: &Chunk) -> Self { @@ -94,21 +87,18 @@ impl ChunkId { } } - impl AsRef<[u8]> for ChunkId { fn as_ref(&self) -> &[u8] { &self.bytes } } - impl Display for ChunkId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "{}/{}", self.dataset_id(), self.last_block()) } } - #[derive(BorshSerialize, BorshDeserialize, Debug, Clone, Eq, PartialEq)] pub enum Chunk { V0 { @@ -129,36 +119,35 @@ pub enum Chunk { } } - impl Chunk { pub fn first_block(&self) -> BlockNumber { match self { Chunk::V0 { first_block, .. } => *first_block, - Chunk::V1 { first_block, .. } => *first_block, + Chunk::V1 { first_block, .. } => *first_block } } pub fn last_block(&self) -> BlockNumber { match self { Chunk::V0 { last_block, .. } => *last_block, - Chunk::V1 { last_block, .. } => *last_block, + Chunk::V1 { last_block, .. } => *last_block } } pub fn last_block_hash(&self) -> &str { match self { Chunk::V0 { last_block_hash, .. } => last_block_hash, - Chunk::V1 { last_block_hash, .. } => last_block_hash, + Chunk::V1 { last_block_hash, .. } => last_block_hash } } pub fn parent_block_hash(&self) -> &str { match self { Chunk::V0 { parent_block_hash, .. } => parent_block_hash, - Chunk::V1 { parent_block_hash, .. } => parent_block_hash, + Chunk::V1 { parent_block_hash, .. } => parent_block_hash } } - + pub fn first_block_time(&self) -> Option { match self { Chunk::V0 { .. } => None, @@ -176,34 +165,41 @@ impl Chunk { pub fn tables(&self) -> &BTreeMap { match self { Chunk::V0 { tables, .. } => tables, - Chunk::V1 { tables, .. } => tables, + Chunk::V1 { tables, .. } => tables } } pub fn blocks_count(&self) -> u64 { match self { - Chunk::V0 { first_block, last_block, .. } => *last_block - *first_block + 1, - Chunk::V1 { first_block, last_block, .. } => *last_block - *first_block + 1, + Chunk::V0 { + first_block, + last_block, + .. + } => *last_block - *first_block + 1, + Chunk::V1 { + first_block, + last_block, + .. + } => *last_block - *first_block + 1 } } pub fn next_block(&self) -> BlockNumber { match self { Chunk::V0 { last_block, .. } => *last_block + 1, - Chunk::V1 { last_block, .. } => *last_block + 1, + Chunk::V1 { last_block, .. } => *last_block + 1 } } } - impl Display for Chunk { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!( - f, + f, "{}-{}-{}", self.first_block(), self.last_block(), self.last_block_hash() ) } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/db.rs b/crates/storage/src/db/db.rs index 1b75b312..53271858 100644 --- a/crates/storage/src/db/db.rs +++ b/crates/storage/src/db/db.rs @@ -1,17 +1,20 @@ -use super::data::{Dataset, DatasetId, DatasetKind, DatasetLabel}; -use super::read::snapshot::ReadSnapshot; -use crate::db::ops::{perform_dataset_compaction, CompactionStatus}; -use crate::db::read::datasets::list_all_datasets; -use crate::db::write::ops::deleted_deleted_tables; -use crate::db::write::table_builder::TableBuilder; -use crate::db::write::tx::Tx; -use crate::db::{Chunk, DatasetUpdate}; +use std::path::Path; + use anyhow::ensure; use arrow::datatypes::SchemaRef; use rocksdb::{ColumnFamilyDescriptor, Options as RocksOptions}; use sqd_primitives::Name; -use std::path::Path; +use super::{ + data::{Dataset, DatasetId, DatasetKind, DatasetLabel}, + read::snapshot::ReadSnapshot +}; +use crate::db::{ + ops::{perform_dataset_compaction, CompactionStatus}, + read::datasets::list_all_datasets, + write::{ops::deleted_deleted_tables, table_builder::TableBuilder, tx::Tx}, + Chunk, DatasetUpdate +}; pub(super) const CF_DATASETS: Name = "DATASETS"; pub(super) const CF_CHUNKS: Name = "CHUNKS"; @@ -19,7 +22,6 @@ pub(super) const CF_TABLES: Name = "TABLES"; pub(super) const CF_DIRTY_TABLES: Name = "DIRTY_TABLES"; pub(super) const CF_DELETED_TABLES: Name = "DELETED_TABLES"; - pub(super) type RocksDB = rocksdb::OptimisticTransactionDB; pub(super) type RocksTransaction<'a> = rocksdb::Transaction<'a, RocksDB>; pub(super) type RocksTransactionOptions = rocksdb::OptimisticTransactionOptions; @@ -27,16 +29,14 @@ pub(super) type RocksWriteBatch = rocksdb::WriteBatchWithTransaction; pub(super) type RocksIterator<'a, DB> = rocksdb::DBRawIteratorWithThreadMode<'a, DB>; pub(super) type RocksSnapshot<'a, DB> = rocksdb::SnapshotWithThreadMode<'a, DB>; - pub struct DatabaseSettings { chunk_cache_size: usize, data_cache_size: usize, with_rocksdb_stats: bool, direct_io: bool, - cache_index_and_filter_blocks: bool, + cache_index_and_filter_blocks: bool } - impl Default for DatabaseSettings { fn default() -> Self { Self { @@ -44,18 +44,17 @@ impl Default for DatabaseSettings { data_cache_size: 256, with_rocksdb_stats: false, direct_io: false, - cache_index_and_filter_blocks: false, + cache_index_and_filter_blocks: false } } } - impl DatabaseSettings { pub fn with_chunk_cache_size(mut self, mb: usize) -> Self { self.chunk_cache_size = mb; self } - + pub fn with_data_cache_size(mut self, mb: usize) -> Self { self.data_cache_size = mb; self @@ -65,7 +64,7 @@ impl DatabaseSettings { self.with_rocksdb_stats = on; self } - + pub fn with_direct_io(mut self, yes: bool) -> Self { self.direct_io = yes; self @@ -133,42 +132,44 @@ impl DatabaseSettings { options.set_block_based_table_factory(&block_based_table_factory); options } - + pub fn open(&self, path: impl AsRef) -> anyhow::Result { let options = self.db_options(); - - let db = RocksDB::open_cf_descriptors(&options, path, [ - ColumnFamilyDescriptor::new(CF_DATASETS, self.cf_default_options()), - ColumnFamilyDescriptor::new(CF_CHUNKS, self.chunks_cf_options()), - ColumnFamilyDescriptor::new(CF_TABLES, self.tables_cf_options()), - ColumnFamilyDescriptor::new(CF_DIRTY_TABLES, self.cf_default_options()), - ColumnFamilyDescriptor::new(CF_DELETED_TABLES, self.cf_default_options()) - ])?; - - Ok(Database { - db, - options - }) + + let db = RocksDB::open_cf_descriptors( + &options, + path, + [ + ColumnFamilyDescriptor::new(CF_DATASETS, self.cf_default_options()), + ColumnFamilyDescriptor::new(CF_CHUNKS, self.chunks_cf_options()), + ColumnFamilyDescriptor::new(CF_TABLES, self.tables_cf_options()), + ColumnFamilyDescriptor::new(CF_DIRTY_TABLES, self.cf_default_options()), + ColumnFamilyDescriptor::new(CF_DELETED_TABLES, self.cf_default_options()) + ] + )?; + + Ok(Database { db, options }) } } - pub struct Database { db: RocksDB, options: RocksOptions } - impl Database { pub fn create_dataset(&self, id: DatasetId, kind: DatasetKind) -> anyhow::Result<()> { Tx::new(&self.db).run(|tx| { let label = tx.find_label_for_update(id)?; ensure!(label.is_none(), "dataset {} already exists", id); - tx.write_label(id, &DatasetLabel::V0 { - kind, - version: 0, - finalized_head: None - }) + tx.write_label( + id, + &DatasetLabel::V0 { + kind, + version: 0, + finalized_head: None + } + ) }) } @@ -184,11 +185,14 @@ impl Database { ); Ok(()) } else { - tx.write_label(id, &DatasetLabel::V0 { - kind, - version: 0, - finalized_head: None - }) + tx.write_label( + id, + &DatasetLabel::V0 { + kind, + version: 0, + finalized_head: None + } + ) } }) } @@ -197,34 +201,16 @@ impl Database { TableBuilder::new(&self.db, schema) } - pub fn insert_chunk( - &self, - dataset_id: DatasetId, - chunk: &Chunk - ) -> anyhow::Result<()> - { - self.update_dataset(dataset_id, |tx| { - tx.insert_chunk(chunk) - }) + pub fn insert_chunk(&self, dataset_id: DatasetId, chunk: &Chunk) -> anyhow::Result<()> { + self.update_dataset(dataset_id, |tx| tx.insert_chunk(chunk)) } - - pub fn insert_fork( - &self, - dataset_id: DatasetId, - chunk: &Chunk - ) -> anyhow::Result<()> - { - self.update_dataset(dataset_id, |tx| { - tx.insert_fork(chunk) - }) + + pub fn insert_fork(&self, dataset_id: DatasetId, chunk: &Chunk) -> anyhow::Result<()> { + self.update_dataset(dataset_id, |tx| tx.insert_fork(chunk)) } - - pub fn update_dataset( - &self, - dataset_id: DatasetId, - mut cb: F - ) -> anyhow::Result - where + + pub fn update_dataset(&self, dataset_id: DatasetId, mut cb: F) -> anyhow::Result + where F: FnMut(&mut DatasetUpdate<'_>) -> anyhow::Result { Tx::new(&self.db).run(|tx| { @@ -240,9 +226,7 @@ impl Database { } pub fn get_all_datasets(&self) -> anyhow::Result> { - let cursor = self.db.raw_iterator_cf( - self.db.cf_handle(CF_DATASETS).unwrap() - ); + let cursor = self.db.raw_iterator_cf(self.db.cf_handle(CF_DATASETS).unwrap()); list_all_datasets(cursor).collect() } @@ -251,10 +235,15 @@ impl Database { dataset_id: DatasetId, max_chunk_size: Option, write_amplification_limit: Option, - compaction_len_limit: Option, - ) -> anyhow::Result - { - perform_dataset_compaction(&self.db, dataset_id, max_chunk_size, write_amplification_limit, compaction_len_limit) + compaction_len_limit: Option + ) -> anyhow::Result { + perform_dataset_compaction( + &self.db, + dataset_id, + max_chunk_size, + write_amplification_limit, + compaction_len_limit + ) } pub fn delete_dataset(&self, dataset_id: DatasetId) -> anyhow::Result<()> { @@ -289,18 +278,15 @@ impl Database { pub fn get_property(&self, cf: &str, name: &str) -> anyhow::Result> { let Some(cf_handle) = self.db.cf_handle(cf) else { - return Ok(None) + return Ok(None); }; let val = self.db.property_value_cf(cf_handle, name)?; Ok(val) } } - impl std::fmt::Debug for Database { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("Database") - .field("path", &self.db.path()) - .finish() + f.debug_struct("Database").field("path", &self.db.path()).finish() } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/mod.rs b/crates/storage/src/db/mod.rs index 3f85c3c2..bb4007cf 100644 --- a/crates/storage/src/db/mod.rs +++ b/crates/storage/src/db/mod.rs @@ -6,19 +6,13 @@ mod rocks; mod table_id; mod write; - -pub use data::{ - Chunk, - Dataset, - DatasetId, - DatasetKind, - DatasetLabel, - DatasetVersion -}; +pub use data::{Chunk, Dataset, DatasetId, DatasetKind, DatasetLabel, DatasetVersion}; pub use db::*; pub use ops::{CompactionStatus, MergedChunk}; pub use read::snapshot::*; pub use table_id::TableId; -pub use write::dataset_update::*; -pub use write::table_builder::*; -pub use write::tx::{get_global_tx_restarts, get_local_tx_restarts}; +pub use write::{ + dataset_update::*, + table_builder::*, + tx::{get_global_tx_restarts, get_local_tx_restarts} +}; diff --git a/crates/storage/src/db/ops/cast.rs b/crates/storage/src/db/ops/cast.rs index c2274b04..6897fac6 100644 --- a/crates/storage/src/db/ops/cast.rs +++ b/crates/storage/src/db/ops/cast.rs @@ -1,10 +1,11 @@ use arrow::datatypes::DataType; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::item_index_cast::cast_item_index; -use sqd_array::reader::ArrayReader; -use sqd_array::slice::AsSlice; -use sqd_array::writer::ArrayWriter; - +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + item_index_cast::cast_item_index, + reader::ArrayReader, + slice::AsSlice, + writer::ArrayWriter +}; pub struct IndexCastReader { src: S, @@ -12,7 +13,6 @@ pub struct IndexCastReader { target_type: DataType } - impl IndexCastReader { pub fn new(src: S, src_type: &DataType, target_type: DataType) -> Self { Self { @@ -21,21 +21,16 @@ impl IndexCastReader { target_type } } - + pub fn len(&self) -> usize { self.src.len() } - + pub fn read(&mut self, dst: &mut impl ArrayWriter) -> anyhow::Result<()> { self.read_slice(dst, 0, self.len()) } - pub fn read_slice( - &mut self, - dst: &mut impl ArrayWriter, - mut offset: usize, - mut len: usize - ) -> anyhow::Result<()> { + pub fn read_slice(&mut self, dst: &mut impl ArrayWriter, mut offset: usize, mut len: usize) -> anyhow::Result<()> { while len > 0 { let step = std::cmp::min(len, 1000); self.src_buf.clear(); @@ -48,35 +43,23 @@ impl IndexCastReader { } } - pub enum MaybeCastedReader { Plain(R), Cast(IndexCastReader) } - impl MaybeCastedReader { - pub fn read( - &mut self, - dst: &mut impl ArrayWriter - ) -> anyhow::Result<()> - { + pub fn read(&mut self, dst: &mut impl ArrayWriter) -> anyhow::Result<()> { match self { MaybeCastedReader::Plain(r) => r.read(dst), - MaybeCastedReader::Cast(r) => r.read(dst), + MaybeCastedReader::Cast(r) => r.read(dst) } } - - pub fn read_slice( - &mut self, - dst: &mut impl ArrayWriter, - offset: usize, - len: usize - ) -> anyhow::Result<()> - { + + pub fn read_slice(&mut self, dst: &mut impl ArrayWriter, offset: usize, len: usize) -> anyhow::Result<()> { match self { MaybeCastedReader::Plain(r) => r.read_slice(dst, offset, len), - MaybeCastedReader::Cast(r) => r.read_slice(dst, offset, len), - } + MaybeCastedReader::Cast(r) => r.read_slice(dst, offset, len) + } } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/ops/compaction.rs b/crates/storage/src/db/ops/compaction.rs index 25cad01b..4f081723 100644 --- a/crates/storage/src/db/ops/compaction.rs +++ b/crates/storage/src/db/ops/compaction.rs @@ -1,14 +1,18 @@ -use crate::db::db::RocksDB; -use crate::db::ops::schema_merge::can_merge_schemas; -use crate::db::ops::table_merge::TableMerge; -use crate::db::table_id::TableId; -use crate::db::write::tx::Tx; -use crate::db::{Chunk, ChunkReader, DatasetId, ReadSnapshot, TableBuilder}; +use std::{ + cmp::{max, min}, + collections::BTreeMap +}; + use arrow::datatypes::SchemaRef; use sqd_primitives::BlockNumber; -use std::cmp::{max, min}; -use std::collections::BTreeMap; +use crate::db::{ + db::RocksDB, + ops::{schema_merge::can_merge_schemas, table_merge::TableMerge}, + table_id::TableId, + write::tx::Tx, + Chunk, ChunkReader, DatasetId, ReadSnapshot, TableBuilder +}; pub const MAX_CHUNK_SIZE: usize = 200_000; pub const WA_LIMIT: f64 = 1.9; @@ -18,7 +22,7 @@ pub const COMPACTION_LEN_LIMIT: usize = 50; pub enum CompactionStatus { Ok(Vec), Canceled, - NotingToCompact, + NotingToCompact } #[derive(Debug)] @@ -33,9 +37,8 @@ pub fn perform_dataset_compaction( dataset_id: DatasetId, max_chunk_size: Option, write_amplification_limit: Option, - compaction_len_limit: Option, -) -> anyhow::Result -{ + compaction_len_limit: Option +) -> anyhow::Result { DatasetCompaction { db, snapshot: &ReadSnapshot::new(db), @@ -43,7 +46,7 @@ pub fn perform_dataset_compaction( merge: Vec::new(), max_chunk_size: max_chunk_size.unwrap_or(MAX_CHUNK_SIZE), write_amplification_limit: write_amplification_limit.unwrap_or(WA_LIMIT), - compaction_len_limit: compaction_len_limit.unwrap_or(COMPACTION_LEN_LIMIT), + compaction_len_limit: compaction_len_limit.unwrap_or(COMPACTION_LEN_LIMIT) } .execute() } @@ -55,7 +58,7 @@ struct DatasetCompaction<'a> { merge: Vec>, max_chunk_size: usize, write_amplification_limit: f64, - compaction_len_limit: usize, + compaction_len_limit: usize } impl<'a> DatasetCompaction<'a> { @@ -75,7 +78,7 @@ impl<'a> DatasetCompaction<'a> { Tx::new(self.db).run(|tx| { let mut label = match tx.find_label_for_update(self.dataset_id)? { Some(label) => label, - None => return Ok(CompactionStatus::Canceled), + None => return Ok(CompactionStatus::Canceled) }; if self.data_was_changed(tx)? { @@ -88,21 +91,26 @@ impl<'a> DatasetCompaction<'a> { label.bump_version(); tx.write_label(self.dataset_id, &label)?; - let merged_chunks = self.merge.iter().map(|c| { - let size = c.tables() - .keys() - .map(|name| c.get_table_reader(name).map(|r| r.num_rows())) - .fold(Ok::<_, anyhow::Error>(0), |acc, size| { - let acc = acc?; - let size = size?; - Ok(max(acc, size)) - })?; - Ok(MergedChunk { - first_block: c.first_block(), - last_block: c.last_block(), - size + let merged_chunks = self + .merge + .iter() + .map(|c| { + let size = c + .tables() + .keys() + .map(|name| c.get_table_reader(name).map(|r| r.num_rows())) + .fold(Ok::<_, anyhow::Error>(0), |acc, size| { + let acc = acc?; + let size = size?; + Ok(max(acc, size)) + })?; + Ok(MergedChunk { + first_block: c.first_block(), + last_block: c.last_block(), + size + }) }) - }).collect::>()?; + .collect::>()?; Ok(CompactionStatus::Ok(merged_chunks)) }) @@ -112,7 +120,7 @@ impl<'a> DatasetCompaction<'a> { let current_chunks = tx.list_chunks( self.dataset_id, self.merge[0].first_block(), - Some(self.merge.last().unwrap().last_block()), + Some(self.merge.last().unwrap().last_block()) ); let mut compared = 0; @@ -144,7 +152,7 @@ impl<'a> DatasetCompaction<'a> { parent_block_hash: first_chunk.base_block_hash().to_string(), first_block_time: first_chunk.chunk().first_block_time(), last_block_time: last_chunk.chunk().last_block_time(), - tables, + tables } } @@ -155,11 +163,7 @@ impl<'a> DatasetCompaction<'a> { Ok(()) } - fn merge_table( - &self, - name: &str, - tables: &mut BTreeMap, - ) -> anyhow::Result<()> { + fn merge_table(&self, name: &str, tables: &mut BTreeMap) -> anyhow::Result<()> { let chunks = self .merge .iter() @@ -224,8 +228,7 @@ impl<'a> DatasetCompaction<'a> { .iter() .position(|element| element == max_el) .unwrap(); - let left_range = - Self::find_range(chunk_sizes, start, start + max_idx, *max_el, wa_threshold, len_limit); + let left_range = Self::find_range(chunk_sizes, start, start + max_idx, *max_el, wa_threshold, len_limit); if left_range.is_some() { return left_range; } @@ -233,10 +236,7 @@ impl<'a> DatasetCompaction<'a> { } fn prepare_merge_plan(&mut self) -> anyhow::Result<()> { - let mut reversed_chunk_iterator = self - .snapshot - .list_chunks(self.dataset_id, 0, None) - .into_reversed(); + let mut reversed_chunk_iterator = self.snapshot.list_chunks(self.dataset_id, 0, None).into_reversed(); let mut first_applicable_block = u64::MAX; let mut chunk_data_sizes: Vec> = Default::default(); let mut last_schema_map: BTreeMap = Default::default(); diff --git a/crates/storage/src/db/ops/mod.rs b/crates/storage/src/db/ops/mod.rs index 2e867c1e..727edc1e 100644 --- a/crates/storage/src/db/ops/mod.rs +++ b/crates/storage/src/db/ops/mod.rs @@ -1,7 +1,7 @@ #![allow(unused)] mod cast; +mod compaction; pub mod schema_merge; mod table_merge; -mod compaction; -pub use compaction::*; \ No newline at end of file +pub use compaction::*; diff --git a/crates/storage/src/db/ops/schema_merge.rs b/crates/storage/src/db/ops/schema_merge.rs index 9edcd90c..1813db97 100644 --- a/crates/storage/src/db/ops/schema_merge.rs +++ b/crates/storage/src/db/ops/schema_merge.rs @@ -1,18 +1,16 @@ -use anyhow::anyhow; -use arrow::datatypes::{DataType, Field, FieldRef, Schema}; -use sqd_array::item_index_cast::common_item_index_type; -use sqd_array::schema_metadata::get_sort_key; -use sqd_array::schema_patch::SchemaPatch; use std::sync::Arc; +use anyhow::anyhow; +use arrow::datatypes::{DataType, Field, FieldRef, Schema}; +use sqd_array::{item_index_cast::common_item_index_type, schema_metadata::get_sort_key, schema_patch::SchemaPatch}; pub fn can_merge_schemas(a: &Schema, b: &Schema) -> bool { if a == b { - return true + return true; } if a.fields().len() != b.fields().len() { - return false + return false; } let (a_sort_key, b_sort_key) = match (get_sort_key(a), get_sort_key(b)) { @@ -21,59 +19,57 @@ pub fn can_merge_schemas(a: &Schema, b: &Schema) -> bool { }; if a_sort_key.len() != b_sort_key.len() { - return false + return false; } for (ai, bi) in a_sort_key.iter().zip(b_sort_key.iter()) { if a.field(*ai).name() != b.field(*bi).name() { - return false - } + return false; + } } - + b.fields().iter().all(|bf| { - a.fields().iter() + a.fields() + .iter() .find(|f| f.name() == bf.name()) - .map(|af| if af.data_type() == bf.data_type() { - true - } else { - common_item_index_type(af.data_type(), bf.data_type()).is_some() + .map(|af| { + if af.data_type() == bf.data_type() { + true + } else { + common_item_index_type(af.data_type(), bf.data_type()).is_some() + } }) .unwrap_or(false) }) } - pub fn merge_schema(base: &mut SchemaPatch, schema: &Schema) -> anyhow::Result<()> { if base.fields() == schema.fields().as_ref() { - return Ok(()) + return Ok(()); } for new_field in schema.fields().iter() { let (bi, bf) = base .find_by_name(new_field.name()) - .ok_or_else(|| { - anyhow!("field `{}` does not exist in the base schema", new_field.name()) - })?; - + .ok_or_else(|| anyhow!("field `{}` does not exist in the base schema", new_field.name()))?; + if &bf == new_field { - continue + continue; } - - let new_field = merge_fields(&bf, new_field) - .ok_or_else(|| { - anyhow!( - "failed to merge field `{}`: data types {} and {} are not compatible", - new_field.name(), - bf.data_type(), - new_field.data_type() - ) - })?; - + + let new_field = merge_fields(&bf, new_field).ok_or_else(|| { + anyhow!( + "failed to merge field `{}`: data types {} and {} are not compatible", + new_field.name(), + bf.data_type(), + new_field.data_type() + ) + })?; + base.set_field(bi, new_field) } Ok(()) } - fn merge_fields(a: &Field, b: &Field) -> Option { assert_eq!(a.name(), b.name()); let data_type = if a.data_type() == b.data_type() { @@ -82,20 +78,16 @@ fn merge_fields(a: &Field, b: &Field) -> Option { common_item_index_type(a.data_type(), b.data_type()) }; data_type.map(|data_type| { - let field = Field::new( - a.name(), - data_type, - a.is_nullable() || b.is_nullable() - ); + let field = Field::new(a.name(), data_type, a.is_nullable() || b.is_nullable()); Arc::new(field) }) } - /// DataType equality, that ignores list item names and nullability pub fn data_types_equal(a: &DataType, b: &DataType) -> bool { - a == b || match (a, b) { - (DataType::List(a), DataType::List(b)) => data_types_equal(a.data_type(), b.data_type()), - _ => false - } -} \ No newline at end of file + a == b + || match (a, b) { + (DataType::List(a), DataType::List(b)) => data_types_equal(a.data_type(), b.data_type()), + _ => false + } +} diff --git a/crates/storage/src/db/ops/table_merge.rs b/crates/storage/src/db/ops/table_merge.rs index 4e068c91..75ab35cd 100644 --- a/crates/storage/src/db/ops/table_merge.rs +++ b/crates/storage/src/db/ops/table_merge.rs @@ -1,19 +1,24 @@ -use super::cast::{IndexCastReader, MaybeCastedReader}; -use super::schema_merge::{data_types_equal, merge_schema}; -use crate::db::SnapshotTableReader; -use anyhow::ensure; -use arrow::datatypes::{DataType, SchemaRef}; -use sqd_array::builder::AnyTableBuilder; -use sqd_array::chunking::ChunkRange; -use sqd_array::reader::{AnyChunkedReader, ArrayReader, ChunkedArrayReader}; -use sqd_array::schema_metadata::{get_sort_key, SQD_SORT_KEY}; -use sqd_array::schema_patch::SchemaPatch; -use sqd_array::slice::{AsSlice, Slice}; -use sqd_array::sort::sort_table_to_indexes; -use sqd_array::util::{build_field_offsets, build_offsets}; -use sqd_array::writer::ArrayWriter; use std::sync::Arc; +use anyhow::ensure; +use arrow::datatypes::{DataType, SchemaRef}; +use sqd_array::{ + builder::AnyTableBuilder, + chunking::ChunkRange, + reader::{AnyChunkedReader, ArrayReader, ChunkedArrayReader}, + schema_metadata::{get_sort_key, SQD_SORT_KEY}, + schema_patch::SchemaPatch, + slice::{AsSlice, Slice}, + sort::sort_table_to_indexes, + util::{build_field_offsets, build_offsets}, + writer::ArrayWriter +}; + +use super::{ + cast::{IndexCastReader, MaybeCastedReader}, + schema_merge::{data_types_equal, merge_schema} +}; +use crate::db::SnapshotTableReader; pub struct TableMerge<'a> { chunks: &'a [Arc>], @@ -23,15 +28,12 @@ pub struct TableMerge<'a> { column_offsets: Vec } - impl<'a> TableMerge<'a> { pub fn prepare(chunks: &'a [Arc>]) -> anyhow::Result { ensure!(chunks.len() > 0, "nothing to merge"); let last_chunk = chunks.last().unwrap().clone(); - let mut schema = SchemaPatch::new( - strip_unknown_metadata(last_chunk.schema()) - ); + let mut schema = SchemaPatch::new(strip_unknown_metadata(last_chunk.schema())); for t in chunks.iter().rev().skip(1) { merge_schema(&mut schema, &t.schema())?; @@ -43,7 +45,8 @@ impl<'a> TableMerge<'a> { let columns_with_stats = (0..last_chunk.schema().fields().len()) .filter_map(|i| { - last_chunk.get_column_stats(i) + last_chunk + .get_column_stats(i) .map(|maybe_stats| maybe_stats.map(|_| i)) .transpose() }) @@ -82,10 +85,7 @@ impl<'a> TableMerge<'a> { let sort_table_builder = self.read_sort_key()?; let sort_table = sort_table_builder.as_slice(); - let order = sort_table_to_indexes( - &sort_table, - &(0..sort_table.num_columns()).collect::>() - ); + let order = sort_table_to_indexes(&sort_table, &(0..sort_table.num_columns()).collect::>()); for i in self.sort_key.iter().copied() { let mut dst = dst.shift(self.column_offsets[i]); @@ -95,17 +95,15 @@ impl<'a> TableMerge<'a> { drop(sort_table_builder); if self.sort_key.len() == self.num_columns() { - return Ok(()) + return Ok(()); } - let chunks = ChunkRange::build_rel_order_list( - &build_offsets(0, self.chunks.iter().map(|t| t.num_rows())), - &order - ); + let chunks = + ChunkRange::build_rel_order_list(&build_offsets(0, self.chunks.iter().map(|t| t.num_rows())), &order); for i in 0..self.num_columns() { if self.sort_key.contains(&i) { - continue + continue; } let mut dst = dst.shift(self.column_offsets[i]); self.read_sorted_column(i, &chunks, &mut dst)?; @@ -134,32 +132,18 @@ impl<'a> TableMerge<'a> { Ok(()) } - fn read_sorted_column( - &self, - index: usize, - order: &[ChunkRange], - dst: &mut impl ArrayWriter - ) -> anyhow::Result<()> - { + fn read_sorted_column(&self, index: usize, order: &[ChunkRange], dst: &mut impl ArrayWriter) -> anyhow::Result<()> { let field = self.schema.field(index); let chunk_columns = self.chunk_columns(index); - let needs_cast = self.chunks.iter() + let needs_cast = self + .chunks + .iter() .enumerate() - .any(|(i, t)| { - !data_types_equal( - field.data_type(), - t.schema().field(chunk_columns[i]).data_type() - ) - }); + .any(|(i, t)| !data_types_equal(field.data_type(), t.schema().field(chunk_columns[i]).data_type())); if needs_cast { - self.read_sorted_column_with_cast( - field.data_type(), - &chunk_columns, - order, - dst - ) + self.read_sorted_column_with_cast(field.data_type(), &chunk_columns, order, dst) } else { let mut reader = AnyChunkedReader::with_capacity(self.chunks.len(), field.data_type()); for (i, t) in self.chunks.iter().enumerate() { @@ -176,9 +160,10 @@ impl<'a> TableMerge<'a> { chunk_columns: &[usize], order: &[ChunkRange], dst: &mut impl ArrayWriter - ) -> anyhow::Result<()> - { - let mut readers = self.chunks.iter() + ) -> anyhow::Result<()> { + let mut readers = self + .chunks + .iter() .enumerate() .map(|(i, t)| { let ci = chunk_columns[i]; @@ -187,11 +172,7 @@ impl<'a> TableMerge<'a> { .collect::>>()?; for range in order.iter() { - readers[range.chunk_index()].read_slice( - dst, - range.offset_index(), - range.len_index() - )?; + readers[range.chunk_index()].read_slice(dst, range.offset_index(), range.len_index())?; } Ok(()) @@ -199,14 +180,17 @@ impl<'a> TableMerge<'a> { fn chunk_columns(&self, index: usize) -> Vec { let name = self.schema.field(index).name(); - self.chunks.iter().map(|c| { - let schema = c.schema(); - if schema.fields()[index].name() == name { - index - } else { - schema.index_of(name).expect("chunk and target schema mismatch") - } - }).collect() + self.chunks + .iter() + .map(|c| { + let schema = c.schema(); + if schema.fields()[index].name() == name { + index + } else { + schema.index_of(name).expect("chunk and target schema mismatch") + } + }) + .collect() } fn unsorted_write(&self, dst: &mut impl ArrayWriter) -> anyhow::Result<()> { @@ -218,30 +202,25 @@ impl<'a> TableMerge<'a> { } } - fn create_maybe_casted_reader<'a>( table: &SnapshotTableReader<'a>, column_index: usize, target_type: &DataType -) -> anyhow::Result> -{ +) -> anyhow::Result> { let schema = table.schema(); let field = schema.field(column_index); let reader = table.create_column_reader(column_index)?; Ok(if data_types_equal(field.data_type(), target_type) { MaybeCastedReader::Plain(reader) } else { - MaybeCastedReader::Cast( - IndexCastReader::new(reader, field.data_type(), target_type.clone()) - ) + MaybeCastedReader::Cast(IndexCastReader::new(reader, field.data_type(), target_type.clone())) }) } - fn strip_unknown_metadata(schema: SchemaRef) -> SchemaRef { let mut patch = SchemaPatch::new(schema); if patch.metadata().keys().any(|key| key != SQD_SORT_KEY) { patch.metadata_mut().retain(|k, _| k == SQD_SORT_KEY) } patch.finish() -} \ No newline at end of file +} diff --git a/crates/storage/src/db/read/blocks_table.rs b/crates/storage/src/db/read/blocks_table.rs index 5b134d9a..c9a947cd 100644 --- a/crates/storage/src/db/read/blocks_table.rs +++ b/crates/storage/src/db/read/blocks_table.rs @@ -1,56 +1,56 @@ -use crate::kv::KvRead; -use crate::table::read::TableReader; use anyhow::{anyhow, bail}; -use arrow::array::{Array, AsArray}; -use arrow::datatypes::{DataType, UInt32Type, UInt64Type}; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::reader::ArrayReader; +use arrow::{ + array::{Array, AsArray}, + datatypes::{DataType, UInt32Type, UInt64Type} +}; +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + reader::ArrayReader +}; use sqd_primitives::BlockNumber; +use crate::{kv::KvRead, table::read::TableReader}; pub fn get_parent_block_hash( blocks_table: &TableReader, block_number: BlockNumber -) -> anyhow::Result -{ +) -> anyhow::Result { let numbers = { let col_idx = blocks_table.schema().index_of("number")?; let mut builder = AnyBuilder::new(blocks_table.schema().field(col_idx).data_type()); blocks_table.create_column_reader(col_idx)?.read(&mut builder)?; builder.finish() }; - + let maybe_row_idx = match numbers.data_type() { DataType::UInt32 => find_block_row(numbers.as_primitive::().values(), block_number as u32), DataType::UInt64 => find_block_row(numbers.as_primitive::().values(), block_number), ty => bail!("'number' column has unexpected data type - {}", ty) }; - - let row_index = maybe_row_idx.ok_or_else(|| { - anyhow!("block {} was not found in the given table", block_number) - })?; - + + let row_index = maybe_row_idx.ok_or_else(|| anyhow!("block {} was not found in the given table", block_number))?; + let parent_hash = { let col_idx = blocks_table.schema().index_of("parent_hash")?; let mut builder = AnyBuilder::new(blocks_table.schema().field(col_idx).data_type()); - blocks_table.create_column_reader(col_idx)?.read_slice(&mut builder, row_index, 1)?; + blocks_table + .create_column_reader(col_idx)? + .read_slice(&mut builder, row_index, 1)?; builder.finish() }; - + Ok(match parent_hash.data_type() { - DataType::Utf8 => { - parent_hash.as_string::().value(0).to_string() - }, + DataType::Utf8 => parent_hash.as_string::().value(0).to_string(), ty => bail!("'parent_hash' column has unexpected data type - {}", ty) }) } - fn find_block_row(numbers: &[BN], block: BN) -> Option { - numbers.iter() + numbers + .iter() .copied() .enumerate() .filter(|e| e.1 >= block) .min_by_key(|e| e.1) .map(|e| e.0) -} \ No newline at end of file +} diff --git a/crates/storage/src/db/read/chunk.rs b/crates/storage/src/db/read/chunk.rs index f16d43ce..0e99a7e0 100644 --- a/crates/storage/src/db/read/chunk.rs +++ b/crates/storage/src/db/read/chunk.rs @@ -1,8 +1,10 @@ -use crate::db::data::{Chunk, ChunkId, DatasetId}; -use crate::kv::KvReadCursor; use anyhow::{ensure, Context}; use sqd_primitives::BlockNumber; +use crate::{ + db::data::{Chunk, ChunkId, DatasetId}, + kv::KvReadCursor +}; pub struct ChunkIterator { cursor: C, @@ -13,15 +15,8 @@ pub struct ChunkIterator { is_reversed: bool } - impl ChunkIterator { - pub fn new( - cursor: C, - dataset_id: DatasetId, - from_block: BlockNumber, - to_block: Option - ) -> Self - { + pub fn new(cursor: C, dataset_id: DatasetId, from_block: BlockNumber, to_block: Option) -> Self { Self { cursor, dataset_id, @@ -31,7 +26,7 @@ impl ChunkIterator { is_reversed: false } } - + pub fn into_reversed(self) -> Self { Self { cursor: self.cursor, @@ -42,7 +37,7 @@ impl ChunkIterator { is_reversed: !self.is_reversed } } - + fn chunk_id(&self, last_block: BlockNumber) -> ChunkId { ChunkId::new(self.dataset_id, last_block) } @@ -54,13 +49,9 @@ impl ChunkIterator { fn seek_last(&mut self) -> anyhow::Result<()> { if let Some(to_block) = self.to_block { - self.cursor.seek_prev( - self.chunk_id(to_block).as_ref() - ) + self.cursor.seek_prev(self.chunk_id(to_block).as_ref()) } else { - self.cursor.seek_prev( - self.chunk_id(BlockNumber::MAX).as_ref() - ) + self.cursor.seek_prev(self.chunk_id(BlockNumber::MAX).as_ref()) } } @@ -81,29 +72,27 @@ impl ChunkIterator { } if !self.cursor.is_valid() { - return Ok(None) + return Ok(None); } let current_id: ChunkId = borsh::from_slice(self.cursor.key())?; if current_id.dataset_id() != self.dataset_id { - return Ok(None) + return Ok(None); } - let chunk: Chunk = borsh::from_slice(self.cursor.value()).with_context(|| { - format!("failed to deserialize chunk {}", current_id) - })?; + let chunk: Chunk = borsh::from_slice(self.cursor.value()) + .with_context(|| format!("failed to deserialize chunk {}", current_id))?; validate_chunk(¤t_id, &chunk)?; - + if self.from_block <= chunk.last_block() && self.to_block.map_or(true, |end| chunk.first_block() <= end) { - return Ok(Some(chunk)) + return Ok(Some(chunk)); } Ok(None) } } - fn validate_chunk(chunk_id: &ChunkId, chunk: &Chunk) -> anyhow::Result<()> { ensure!( chunk_id.last_block() == chunk.last_block(), @@ -121,7 +110,6 @@ fn validate_chunk(chunk_id: &ChunkId, chunk: &Chunk) -> anyhow::Result<()> { Ok(()) } - impl Iterator for ChunkIterator { type Item = anyhow::Result; diff --git a/crates/storage/src/db/read/datasets.rs b/crates/storage/src/db/read/datasets.rs index dffa9e9d..ee8bf5f2 100644 --- a/crates/storage/src/db/read/datasets.rs +++ b/crates/storage/src/db/read/datasets.rs @@ -1,15 +1,13 @@ -use crate::db::data::Dataset; -use crate::db::DatasetLabel; -use crate::kv::KvReadCursor; use anyhow::Context; +use crate::{ + db::{data::Dataset, DatasetLabel}, + kv::KvReadCursor +}; -pub fn list_all_datasets( - mut cursor: C -) -> impl Iterator> -{ +pub fn list_all_datasets(mut cursor: C) -> impl Iterator> { let mut first_seek = true; - + let mut next = move || -> anyhow::Result> { if first_seek { cursor.seek_first()?; @@ -17,24 +15,18 @@ pub fn list_all_datasets( } else { cursor.next()?; } - + if !cursor.is_valid() { - return Ok(None) + return Ok(None); } - - let id = borsh::from_slice(cursor.key()) - .context("invalid key in datasets CF")?; - - let label: DatasetLabel = borsh::from_slice(cursor.value()) - .with_context(|| { - format!("invalid dataset label under id {}", id) - })?; - - Ok(Some(Dataset { - id, - label - })) + + let id = borsh::from_slice(cursor.key()).context("invalid key in datasets CF")?; + + let label: DatasetLabel = + borsh::from_slice(cursor.value()).with_context(|| format!("invalid dataset label under id {}", id))?; + + Ok(Some(Dataset { id, label })) }; - + std::iter::from_fn(move || next().transpose()) -} \ No newline at end of file +} diff --git a/crates/storage/src/db/read/mod.rs b/crates/storage/src/db/read/mod.rs index 5a886966..1b8346dd 100644 --- a/crates/storage/src/db/read/mod.rs +++ b/crates/storage/src/db/read/mod.rs @@ -1,5 +1,4 @@ - pub mod blocks_table; pub mod chunk; pub mod datasets; -pub mod snapshot; \ No newline at end of file +pub mod snapshot; diff --git a/crates/storage/src/db/read/snapshot.rs b/crates/storage/src/db/read/snapshot.rs index 9c25aa2d..31659392 100644 --- a/crates/storage/src/db/read/snapshot.rs +++ b/crates/storage/src/db/read/snapshot.rs @@ -1,26 +1,28 @@ -use crate::db::data::{Chunk, DatasetId}; -use crate::db::db::{RocksDB, RocksIterator, RocksSnapshot, CF_CHUNKS, CF_DATASETS, CF_TABLES}; -use crate::db::read::chunk::ChunkIterator; -use crate::db::table_id::TableId; -use crate::db::DatasetLabel; -use crate::kv::KvRead; -use crate::table::read::TableReader; +use std::{collections::BTreeMap, ops::Deref, sync::Arc}; + use anyhow::anyhow; use parking_lot::Mutex; use rocksdb::{ColumnFamily, ReadOptions}; use sqd_primitives::{BlockNumber, Name}; -use std::collections::BTreeMap; -use std::ops::Deref; -use std::sync::Arc; +use crate::{ + db::{ + data::{Chunk, DatasetId}, + db::{RocksDB, RocksIterator, RocksSnapshot, CF_CHUNKS, CF_DATASETS, CF_TABLES}, + read::chunk::ChunkIterator, + table_id::TableId, + DatasetLabel + }, + kv::KvRead, + table::read::TableReader +}; pub struct ReadSnapshot<'a> { db: &'a RocksDB, snapshot: RocksSnapshot<'a, RocksDB> } - -impl <'a> ReadSnapshot<'a> { +impl<'a> ReadSnapshot<'a> { pub fn new(db: &'a RocksDB) -> Self { Self { db, @@ -29,11 +31,9 @@ impl <'a> ReadSnapshot<'a> { } pub fn get_label(&self, dataset_id: DatasetId) -> anyhow::Result> { - let maybe_bytes = self.db.get_pinned_cf_opt( - self.cf_handle(CF_DATASETS), - dataset_id, - &self.new_options() - )?; + let maybe_bytes = self + .db + .get_pinned_cf_opt(self.cf_handle(CF_DATASETS), dataset_id, &self.new_options())?; Ok(if let Some(bytes) = maybe_bytes { let label = borsh::from_slice(bytes.as_ref())?; Some(label) @@ -60,20 +60,13 @@ impl <'a> ReadSnapshot<'a> { dataset_id: DatasetId, from_block: BlockNumber, to_block: Option - ) -> ReadSnapshotChunkIterator<'a> - { - let cursor = self.db.raw_iterator_cf_opt( - self.cf_handle(CF_CHUNKS), - self.new_options() - ); - ChunkIterator::new( - cursor, - dataset_id, - from_block, - to_block - ) - } - + ) -> ReadSnapshotChunkIterator<'a> { + let cursor = self + .db + .raw_iterator_cf_opt(self.cf_handle(CF_CHUNKS), self.new_options()); + ChunkIterator::new(cursor, dataset_id, from_block, to_block) + } + pub fn get_first_chunk(&self, dataset_id: DatasetId) -> anyhow::Result> { self.list_chunks(dataset_id, 0, None).next().transpose() } @@ -93,28 +86,23 @@ impl <'a> ReadSnapshot<'a> { } } - pub type ReadSnapshotChunkIterator<'a> = ChunkIterator>; - pub struct ChunkReader<'a> { snapshot: &'a ReadSnapshot<'a>, chunk: Chunk, cache: BTreeMap>>>> } - -impl <'a> ChunkReader<'a> { +impl<'a> ChunkReader<'a> { fn new(snapshot: &'a ReadSnapshot<'a>, chunk: Chunk) -> Self { - let cache = chunk.tables().keys() + let cache = chunk + .tables() + .keys() .map(|name| (name.to_string(), Mutex::new(None))) .collect(); - Self { - snapshot, - chunk, - cache - } + Self { snapshot, chunk, cache } } pub fn first_block(&self) -> BlockNumber { @@ -124,11 +112,11 @@ impl <'a> ChunkReader<'a> { pub fn last_block(&self) -> BlockNumber { self.chunk.last_block() } - + pub fn last_block_hash(&self) -> &str { &self.chunk.last_block_hash() } - + pub fn base_block_hash(&self) -> &str { &self.chunk.parent_block_hash() } @@ -136,62 +124,59 @@ impl <'a> ChunkReader<'a> { pub fn has_table(&self, name: &str) -> bool { self.chunk.tables().contains_key(name) } - + pub fn chunk(&self) -> &Chunk { &self.chunk } - + pub fn tables(&self) -> &BTreeMap { self.chunk.tables() } pub fn get_table_reader(&self, name: &str) -> anyhow::Result>> { - let mut reader_lock = self.cache.get(name).ok_or_else(|| { - anyhow!("table `{}` does not exist in this chunk", name) - })?.lock(); - + let mut reader_lock = self + .cache + .get(name) + .ok_or_else(|| anyhow!("table `{}` does not exist in this chunk", name))? + .lock(); + if let Some(reader) = reader_lock.as_ref() { - return Ok(reader.clone()) + return Ok(reader.clone()); } let table_id = self.chunk.tables().get(name).unwrap(); let reader = self.snapshot.create_table_reader(*table_id)?; let reader = Arc::new(reader); - + *reader_lock = Some(reader.clone()); Ok(reader) } - + pub fn into_chunk(self) -> Chunk { self.chunk } } - pub type SnapshotTableReader<'a> = TableReader>; - pub struct CFSnapshot<'a> { snapshot: &'a ReadSnapshot<'a>, cf: Name } - -impl <'a> KvRead for CFSnapshot<'a> { +impl<'a> KvRead for CFSnapshot<'a> { type Cursor = RocksIterator<'a, RocksDB>; - - fn get(&self, key: &[u8]) -> anyhow::Result>> { - Ok(self.snapshot.db.get_pinned_cf_opt( - self.snapshot.cf_handle(self.cf), - key, - &self.snapshot.new_options() - )?) + + fn get(&self, key: &[u8]) -> anyhow::Result>> { + Ok(self + .snapshot + .db + .get_pinned_cf_opt(self.snapshot.cf_handle(self.cf), key, &self.snapshot.new_options())?) } fn new_cursor(&self) -> Self::Cursor { - self.snapshot.db.raw_iterator_cf_opt( - self.snapshot.cf_handle(self.cf), - self.snapshot.new_options() - ) + self.snapshot + .db + .raw_iterator_cf_opt(self.snapshot.cf_handle(self.cf), self.snapshot.new_options()) } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/rocks.rs b/crates/storage/src/db/rocks.rs index 54cf7059..da3e7713 100644 --- a/crates/storage/src/db/rocks.rs +++ b/crates/storage/src/db/rocks.rs @@ -1,8 +1,6 @@ -use crate::db::db::RocksIterator; -use crate::kv::KvReadCursor; +use crate::{db::db::RocksIterator, kv::KvReadCursor}; - -impl <'a, DB: rocksdb::DBAccess> KvReadCursor for RocksIterator<'a, DB> { +impl<'a, DB: rocksdb::DBAccess> KvReadCursor for RocksIterator<'a, DB> { fn seek_first(&mut self) -> anyhow::Result<()> { self.seek_to_first(); self.status()?; @@ -44,4 +42,4 @@ impl <'a, DB: rocksdb::DBAccess> KvReadCursor for RocksIterator<'a, DB> { fn value(&self) -> &[u8] { self.value().expect("cursor position is not valid") } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/table_id.rs b/crates/storage/src/db/table_id.rs index 0313787c..dfca7357 100644 --- a/crates/storage/src/db/table_id.rs +++ b/crates/storage/src/db/table_id.rs @@ -1,26 +1,22 @@ -use borsh::{BorshDeserialize, BorshSerialize}; use std::fmt::{Display, Formatter}; -use uuid::Uuid; +use borsh::{BorshDeserialize, BorshSerialize}; +use uuid::Uuid; #[derive(Copy, Clone, Hash, Ord, PartialOrd, Eq, PartialEq, Debug, BorshSerialize, BorshDeserialize)] pub struct TableId { uuid: Uuid } - impl AsRef<[u8]> for TableId { - fn as_ref(&self) -> &[u8]{ + fn as_ref(&self) -> &[u8] { self.uuid.as_bytes() } } - impl TableId { pub fn new() -> Self { - Self { - uuid: Uuid::now_v7() - } + Self { uuid: Uuid::now_v7() } } pub fn from_slice(bytes: &[u8]) -> Self { @@ -30,9 +26,8 @@ impl TableId { } } - impl Display for TableId { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { self.uuid.fmt(f) } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/write/dataset_update.rs b/crates/storage/src/db/write/dataset_update.rs index a22579dd..34006f88 100644 --- a/crates/storage/src/db/write/dataset_update.rs +++ b/crates/storage/src/db/write/dataset_update.rs @@ -1,9 +1,11 @@ -use crate::db::db::{RocksIterator, RocksTransaction}; -use crate::db::read::chunk::ChunkIterator; -use crate::db::write::tx::Tx; -use crate::db::{Chunk, DatasetId, DatasetLabel}; use sqd_primitives::{BlockNumber, BlockRef}; +use crate::db::{ + db::{RocksIterator, RocksTransaction}, + read::chunk::ChunkIterator, + write::tx::Tx, + Chunk, DatasetId, DatasetLabel +}; pub struct DatasetUpdate<'a> { tx: &'a Tx<'a>, @@ -11,63 +13,57 @@ pub struct DatasetUpdate<'a> { label: DatasetLabel } - impl<'a> DatasetUpdate<'a> { pub(crate) fn new(tx: &'a Tx<'a>, dataset_id: DatasetId) -> anyhow::Result { let label = tx.get_label_for_update(dataset_id)?; - Ok(Self { - tx, - dataset_id, - label - }) + Ok(Self { tx, dataset_id, label }) } - + pub fn dataset_id(&self) -> DatasetId { self.dataset_id } - + pub fn label(&self) -> &DatasetLabel { &self.label } - + pub fn insert_chunk(&self, chunk: &Chunk) -> anyhow::Result<()> { self.tx.validate_chunk_insertion(self.dataset_id, chunk)?; self.tx.write_chunk(self.dataset_id, chunk) } - + pub fn insert_fork(&self, chunk: &Chunk) -> anyhow::Result<()> { self.tx.insert_fork(self.dataset_id, chunk) } - + pub fn validate_parent_block_hash( &self, chunk: &Chunk, block_number: BlockNumber, expected_parent_hash: &str - ) -> anyhow::Result> - { - self.tx.validate_parent_block_hash(chunk, block_number, expected_parent_hash) + ) -> anyhow::Result> { + self.tx + .validate_parent_block_hash(chunk, block_number, expected_parent_hash) } - + pub fn delete_chunk(&self, chunk: &Chunk) -> anyhow::Result<()> { self.tx.delete_chunk(self.dataset_id, chunk) } - + pub fn set_finalized_head(&mut self, block_ref: impl Into>) { self.label.set_finalized_head(block_ref.into()) } - + pub fn list_chunks( - &self, - from_block: BlockNumber, + &self, + from_block: BlockNumber, to_block: Option - ) -> ChunkIterator>> - { - self.tx.list_chunks(self.dataset_id, from_block, to_block) + ) -> ChunkIterator>> { + self.tx.list_chunks(self.dataset_id, from_block, to_block) } - + pub fn finish(mut self) -> anyhow::Result<()> { self.label.bump_version(); self.tx.write_label(self.dataset_id, &self.label) } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/write/mod.rs b/crates/storage/src/db/write/mod.rs index 2295f4fb..d5545b53 100644 --- a/crates/storage/src/db/write/mod.rs +++ b/crates/storage/src/db/write/mod.rs @@ -1,5 +1,5 @@ -mod storage; pub mod dataset_update; pub mod ops; +mod storage; pub mod table_builder; -pub mod tx; \ No newline at end of file +pub mod tx; diff --git a/crates/storage/src/db/write/ops.rs b/crates/storage/src/db/write/ops.rs index 12b29dde..3e0ce35e 100644 --- a/crates/storage/src/db/write/ops.rs +++ b/crates/storage/src/db/write/ops.rs @@ -1,7 +1,8 @@ -use crate::db::db::{RocksDB, RocksWriteBatch, CF_DELETED_TABLES, CF_DIRTY_TABLES, CF_TABLES}; -use crate::kv::KvReadCursor; -use crate::table::key::TableKeyFactory; - +use crate::{ + db::db::{RocksDB, RocksWriteBatch, CF_DELETED_TABLES, CF_DIRTY_TABLES, CF_TABLES}, + kv::KvReadCursor, + table::key::TableKeyFactory +}; pub fn deleted_deleted_tables(db: &RocksDB) -> anyhow::Result { let mut deleted = 0; @@ -14,21 +15,18 @@ pub fn deleted_deleted_tables(db: &RocksDB) -> anyhow::Result { Ok(deleted) } - fn delete_table(db: &RocksDB, table_id: &[u8]) -> anyhow::Result<()> { let mut key1 = TableKeyFactory::new(table_id); let mut key2 = TableKeyFactory::new(table_id); let start = key1.start(); let end = key2.end(); - + let cf_tables = db.cf_handle(CF_TABLES).unwrap(); let mut batch = RocksWriteBatch::default(); let mut cursor = db.raw_iterator_cf(cf_tables); - - list_keys(&mut cursor, start, end, |key| { - batch.delete_cf(cf_tables, key) - })?; - + + list_keys(&mut cursor, start, end, |key| batch.delete_cf(cf_tables, key))?; + let cf_dirty_tables = db.cf_handle(CF_DIRTY_TABLES).unwrap(); batch.delete_cf(cf_dirty_tables, table_id); @@ -39,14 +37,7 @@ fn delete_table(db: &RocksDB, table_id: &[u8]) -> anyhow::Result<()> { Ok(()) } - -fn list_keys( - cursor: &mut impl KvReadCursor, - from: &[u8], - to: &[u8], - mut cb: impl FnMut(&[u8]) -) -> anyhow::Result<()> -{ +fn list_keys(cursor: &mut impl KvReadCursor, from: &[u8], to: &[u8], mut cb: impl FnMut(&[u8])) -> anyhow::Result<()> { cursor.seek(from)?; while cursor.is_valid() && cursor.key() < to { cb(cursor.key()); @@ -55,16 +46,11 @@ fn list_keys( Ok(()) } - -fn for_each_key( - cursor: &mut impl KvReadCursor, - mut cb: impl FnMut(&[u8]) -> anyhow::Result<()> -) -> anyhow::Result<()> -{ +fn for_each_key(cursor: &mut impl KvReadCursor, mut cb: impl FnMut(&[u8]) -> anyhow::Result<()>) -> anyhow::Result<()> { cursor.seek_first()?; while cursor.is_valid() { cb(cursor.key())?; cursor.next()?; } Ok(()) -} \ No newline at end of file +} diff --git a/crates/storage/src/db/write/storage.rs b/crates/storage/src/db/write/storage.rs index 1c8446b8..f7cd89b6 100644 --- a/crates/storage/src/db/write/storage.rs +++ b/crates/storage/src/db/write/storage.rs @@ -1,17 +1,20 @@ -use crate::db::db::{RocksDB, RocksWriteBatch, CF_DIRTY_TABLES, CF_TABLES}; -use crate::db::table_id::TableId; -use crate::kv::KvWrite; use rocksdb::ColumnFamily; +use crate::{ + db::{ + db::{RocksDB, RocksWriteBatch, CF_DIRTY_TABLES, CF_TABLES}, + table_id::TableId + }, + kv::KvWrite +}; pub struct TableStorage<'a> { write_batch: RocksWriteBatch, db: &'a RocksDB, - cf: &'a ColumnFamily, + cf: &'a ColumnFamily } - -impl <'a> TableStorage<'a> { +impl<'a> TableStorage<'a> { pub fn new(db: &'a RocksDB) -> Self { Self { write_batch: RocksWriteBatch::default(), @@ -34,15 +37,14 @@ impl <'a> TableStorage<'a> { self.db.write(batch)?; Ok(()) } - + pub fn finish(self) -> anyhow::Result<()> { self.db.write(self.write_batch)?; Ok(()) } } - -impl <'a> KvWrite for TableStorage<'a> { +impl<'a> KvWrite for TableStorage<'a> { fn put(&mut self, key: &[u8], value: &[u8]) -> anyhow::Result<()> { self.write_batch.put_cf(self.cf, key, value); if self.byte_size() > 8 * 1024 * 1024 { diff --git a/crates/storage/src/db/write/table_builder.rs b/crates/storage/src/db/write/table_builder.rs index 657db60d..1185e78a 100644 --- a/crates/storage/src/db/write/table_builder.rs +++ b/crates/storage/src/db/write/table_builder.rs @@ -1,21 +1,28 @@ -use crate::db::db::{RocksDB, CF_TABLES}; -use crate::db::table_id::TableId; -use crate::db::write::storage::TableStorage; -use crate::db::ReadSnapshot; -use crate::table::key::TableKeyFactory; -use crate::table::stats::{can_have_stats, serialize_stats}; -use crate::table::write::StorageCell; -use anyhow::{ensure, Context}; -use arrow::array::RecordBatch; -use arrow::datatypes::SchemaRef; -use sqd_array::slice::{AsSlice, Slice}; -use sqd_array::writer::{ArrayWriter, Writer}; use std::collections::BTreeSet; +use anyhow::{ensure, Context}; +use arrow::{array::RecordBatch, datatypes::SchemaRef}; +use sqd_array::{ + slice::{AsSlice, Slice}, + writer::{ArrayWriter, Writer} +}; + +use crate::{ + db::{ + db::{RocksDB, CF_TABLES}, + table_id::TableId, + write::storage::TableStorage, + ReadSnapshot + }, + table::{ + key::TableKeyFactory, + stats::{can_have_stats, serialize_stats}, + write::StorageCell + } +}; type TableWriter<'a> = crate::table::write::TableWriter>>; - pub struct TableBuilder<'a> { table_id: TableId, schema: SchemaRef, @@ -24,7 +31,6 @@ pub struct TableBuilder<'a> { db: &'a RocksDB } - impl<'a> TableBuilder<'a> { pub fn new(db: &'a RocksDB, schema: SchemaRef) -> Self { let table_id = TableId::new(); @@ -32,12 +38,8 @@ impl<'a> TableBuilder<'a> { let mut storage = TableStorage::new(db); storage.mark_table_dirty(table_id); - let writer = TableWriter::new( - StorageCell::new(storage), - table_id.as_ref(), - schema.clone() - ); - + let writer = TableWriter::new(StorageCell::new(storage), table_id.as_ref(), schema.clone()); + Self { table_id, schema, @@ -46,7 +48,7 @@ impl<'a> TableBuilder<'a> { db } } - + pub fn write_record_batch(&mut self, record_batch: &RecordBatch) -> anyhow::Result<()> { record_batch.as_slice().write(&mut self.writer) } @@ -64,20 +66,23 @@ impl<'a> TableBuilder<'a> { Ok(()) } - pub fn set_stats(&mut self, columns: impl IntoIterator) -> anyhow::Result<()> { + pub fn set_stats(&mut self, columns: impl IntoIterator) -> anyhow::Result<()> { let num_columns = self.schema.fields().len(); - self.columns_with_stats = columns.into_iter().map(|index| { - ensure!(index < num_columns, "column {} does not exist", index); - let field = self.schema.field(index); - ensure!( - can_have_stats(field.data_type()), - "can't stat column {} ({}): columns of type {} can't have stats", - index, - field.name(), - field.data_type() - ); - Ok(index) - }).collect::>>()?; + self.columns_with_stats = columns + .into_iter() + .map(|index| { + ensure!(index < num_columns, "column {} does not exist", index); + let field = self.schema.field(index); + ensure!( + can_have_stats(field.data_type()), + "can't stat column {} ({}): columns of type {} can't have stats", + index, + field.name(), + field.data_type() + ); + Ok(index) + }) + .collect::>>()?; Ok(()) } @@ -88,15 +93,9 @@ impl<'a> TableBuilder<'a> { } } - -fn build_table_stats( - db: &RocksDB, - table_id: TableId, - columns_with_stats: &BTreeSet -) -> anyhow::Result<()> -{ +fn build_table_stats(db: &RocksDB, table_id: TableId, columns_with_stats: &BTreeSet) -> anyhow::Result<()> { if columns_with_stats.is_empty() { - return Ok(()) + return Ok(()); } let snapshot = ReadSnapshot::new(db); @@ -106,35 +105,27 @@ fn build_table_stats( let mut key = TableKeyFactory::new(table_id); for column_index in columns_with_stats.iter().copied() { - let stats = table_reader - .build_column_stats(4096, column_index) - .with_context(|| { - format!( - "failed to build stats for column '{}'", - table_reader.schema().field(column_index).name() - ) - })?; + let stats = table_reader.build_column_stats(4096, column_index).with_context(|| { + format!( + "failed to build stats for column '{}'", + table_reader.schema().field(column_index).name() + ) + })?; bytes.clear(); - serialize_stats(&mut bytes, &stats) - .with_context(|| { - format!( - "failed to serialize stats of column {}", - table_reader.schema().field(column_index).name() - ) - })?; - - db.put_cf( - table_cf, - key.statistic(column_index), - &bytes - )? + serialize_stats(&mut bytes, &stats).with_context(|| { + format!( + "failed to serialize stats of column {}", + table_reader.schema().field(column_index).name() + ) + })?; + + db.put_cf(table_cf, key.statistic(column_index), &bytes)? } Ok(()) } - impl<'a> ArrayWriter for TableBuilder<'a> { type Writer = as ArrayWriter>::Writer; @@ -157,4 +148,4 @@ impl<'a> ArrayWriter for TableBuilder<'a> { fn offset(&mut self, buf: usize) -> &mut ::Offset { self.writer.offset(buf) } -} \ No newline at end of file +} diff --git a/crates/storage/src/db/write/tx.rs b/crates/storage/src/db/write/tx.rs index ad5f7db0..b7ba820f 100644 --- a/crates/storage/src/db/write/tx.rs +++ b/crates/storage/src/db/write/tx.rs @@ -1,61 +1,56 @@ -use crate::db::data::ChunkId; -use crate::db::db::{RocksDB, RocksIterator, RocksTransaction, RocksTransactionOptions, CF_CHUNKS, CF_DATASETS, CF_DELETED_TABLES, CF_DIRTY_TABLES}; -use crate::db::read::blocks_table::get_parent_block_hash; -use crate::db::read::chunk::ChunkIterator; -use crate::db::table_id::TableId; -use crate::db::{Chunk, DatasetId, DatasetLabel, ReadSnapshot}; +use std::{ + cell::RefCell, + cmp::{max, min}, + sync::atomic::{AtomicU64, Ordering} +}; + use anyhow::{anyhow, bail, ensure, Context}; use rocksdb::ColumnFamily; use sqd_primitives::BlockNumber; -use std::cell::RefCell; -use std::cmp::{max, min}; -use std::sync::atomic::{AtomicU64, Ordering}; +use crate::db::{ + data::ChunkId, + db::{ + RocksDB, RocksIterator, RocksTransaction, RocksTransactionOptions, CF_CHUNKS, CF_DATASETS, CF_DELETED_TABLES, + CF_DIRTY_TABLES + }, + read::{blocks_table::get_parent_block_hash, chunk::ChunkIterator}, + table_id::TableId, + Chunk, DatasetId, DatasetLabel, ReadSnapshot +}; static GLOBAL_RESTARTS: AtomicU64 = AtomicU64::new(0); - thread_local! { static LOCAL_RESTARTS: RefCell = RefCell::new(0); } - pub fn get_global_tx_restarts() -> u64 { GLOBAL_RESTARTS.load(Ordering::Relaxed) } - pub fn get_local_tx_restarts() -> u64 { LOCAL_RESTARTS.with_borrow(|val| *val) } - fn record_restart() { GLOBAL_RESTARTS.fetch_add(1, Ordering::SeqCst); LOCAL_RESTARTS.with_borrow_mut(|val| *val = val.wrapping_add(1)) } - pub struct Tx<'a> { db: &'a RocksDB, transaction: RocksTransaction<'a> } - -impl <'a> Tx<'a> { +impl<'a> Tx<'a> { pub fn new(db: &'a RocksDB) -> Self { let mut tx_options = RocksTransactionOptions::default(); tx_options.set_snapshot(true); - let transaction = db.transaction_opt( - &rocksdb::WriteOptions::default(), - &tx_options - ); + let transaction = db.transaction_opt(&rocksdb::WriteOptions::default(), &tx_options); - Self { - db, - transaction, - } + Self { db, transaction } } pub fn run(self, mut cb: F) -> anyhow::Result @@ -71,7 +66,7 @@ impl <'a> Tx<'a> { Err(err) if err.kind() == rocksdb::ErrorKind::TryAgain || err.kind() == rocksdb::ErrorKind::Busy => { record_restart(); tx = Self::new(db) - }, + } Err(err) => return Err(err.into()) } } @@ -82,11 +77,9 @@ impl <'a> Tx<'a> { } pub fn find_label_for_update(&self, dataset_id: DatasetId) -> anyhow::Result> { - let maybe_bytes = self.transaction.get_pinned_for_update_cf( - self.cf_handle(CF_DATASETS), - dataset_id, - true - )?; + let maybe_bytes = self + .transaction + .get_pinned_for_update_cf(self.cf_handle(CF_DATASETS), dataset_id, true)?; Ok(if let Some(bytes) = maybe_bytes { let label = borsh::from_slice(bytes.as_ref())?; Some(label) @@ -96,17 +89,13 @@ impl <'a> Tx<'a> { } pub fn get_label_for_update(&self, dataset_id: DatasetId) -> anyhow::Result { - self.find_label_for_update(dataset_id).and_then(|maybe_chunk| { - maybe_chunk.ok_or_else(|| anyhow!("dataset {} not found", dataset_id)) - }) + self.find_label_for_update(dataset_id) + .and_then(|maybe_chunk| maybe_chunk.ok_or_else(|| anyhow!("dataset {} not found", dataset_id))) } pub fn write_label(&self, dataset_id: DatasetId, label: &DatasetLabel) -> anyhow::Result<()> { - self.transaction.put_cf( - self.cf_handle(CF_DATASETS), - dataset_id, - &borsh::to_vec(label).unwrap() - )?; + self.transaction + .put_cf(self.cf_handle(CF_DATASETS), dataset_id, &borsh::to_vec(label).unwrap())?; Ok(()) } @@ -128,37 +117,23 @@ impl <'a> Tx<'a> { } pub fn delete_chunk(&self, dataset_id: DatasetId, chunk: &Chunk) -> anyhow::Result<()> { - self.transaction.delete_cf( - self.cf_handle(CF_CHUNKS), - ChunkId::new_for_chunk(dataset_id, chunk) - )?; + self.transaction + .delete_cf(self.cf_handle(CF_CHUNKS), ChunkId::new_for_chunk(dataset_id, chunk))?; for table_id in chunk.tables().values() { self.delete_table(table_id)? } Ok(()) } - + pub fn delete_table(&self, table_id: &TableId) -> anyhow::Result<()> { - self.transaction.put_cf( - self.cf_handle(CF_DELETED_TABLES), - table_id, - [] - )?; + self.transaction + .put_cf(self.cf_handle(CF_DELETED_TABLES), table_id, [])?; Ok(()) } - pub fn insert_fork( - &self, - dataset_id: DatasetId, - chunk: &Chunk - ) -> anyhow::Result<()> - { - let existing = self.list_chunks( - dataset_id, - 0, - None - ).into_reversed(); - + pub fn insert_fork(&self, dataset_id: DatasetId, chunk: &Chunk) -> anyhow::Result<()> { + let existing = self.list_chunks(dataset_id, 0, None).into_reversed(); + for head_result in existing { let head = head_result?; if chunk.first_block() <= head.first_block() { @@ -171,7 +146,7 @@ impl <'a> Tx<'a> { head, chunk.parent_block_hash() ); - break + break; } else if head.last_block() < chunk.first_block() { bail!( "there is a gap between new chunk {} and existing {}, that is just below", @@ -180,37 +155,28 @@ impl <'a> Tx<'a> { ) } else { bail!("new chunk {} overlaps with existing {}", chunk, head) - } + } } - + self.write_chunk(dataset_id, chunk)?; - + Ok(()) } - pub fn validate_chunk_insertion( - &self, - dataset_id: DatasetId, - chunk: &Chunk - ) -> anyhow::Result<()> - { + pub fn validate_chunk_insertion(&self, dataset_id: DatasetId, chunk: &Chunk) -> anyhow::Result<()> { ensure!(chunk.first_block() <= chunk.last_block()); - let existing = self.list_chunks(dataset_id, 0, Some(chunk.last_block() + 1)) + let existing = self + .list_chunks(dataset_id, 0, Some(chunk.last_block() + 1)) .into_reversed() .take(2); - + for chunk_result in existing { let n = chunk_result.context("failed to get neighbors")?; - + let is_disjoint = min(n.last_block(), chunk.last_block()) < max(n.first_block(), chunk.first_block()); - ensure!( - is_disjoint, - "new chunk {} overlaps with existing {}", - chunk, - n - ); - + ensure!(is_disjoint, "new chunk {} overlaps with existing {}", chunk, n); + if chunk.last_block() + 1 == n.first_block() { ensure!( chunk.last_block_hash() == n.parent_block_hash(), @@ -219,7 +185,7 @@ impl <'a> Tx<'a> { n ); } - + if n.last_block() + 1 == chunk.first_block() { ensure!( n.last_block_hash() == chunk.parent_block_hash(), @@ -238,22 +204,21 @@ impl <'a> Tx<'a> { chunk: &Chunk, block_number: BlockNumber, expected_parent_hash: &str - ) -> anyhow::Result> - { + ) -> anyhow::Result> { if chunk.first_block() == block_number { return if chunk.parent_block_hash() == expected_parent_hash { Ok(Ok(())) } else { Ok(Err(chunk.parent_block_hash().to_string())) - } + }; } if chunk.last_block() + 1 == block_number { return if chunk.last_block_hash() == expected_parent_hash { - Ok(Ok(())) + Ok(Ok(())) } else { Ok(Err(chunk.last_block_hash().to_string())) - } + }; } ensure!( @@ -263,9 +228,11 @@ impl <'a> Tx<'a> { block_number ); - let blocks_table_id = chunk.tables().get("blocks").copied().ok_or_else(|| { - anyhow!("'blocks' table does not exist in chunk {}", chunk) - })?; + let blocks_table_id = chunk + .tables() + .get("blocks") + .copied() + .ok_or_else(|| anyhow!("'blocks' table does not exist in chunk {}", chunk))?; let parent_hash = get_parent_block_hash( &ReadSnapshot::new(self.db).create_table_reader(blocks_table_id)?, @@ -275,7 +242,7 @@ impl <'a> Tx<'a> { if parent_hash == expected_parent_hash { Ok(Ok(())) } else { - Ok(Err(parent_hash)) + Ok(Err(parent_hash)) } } @@ -284,25 +251,18 @@ impl <'a> Tx<'a> { dataset_id: DatasetId, from_block: BlockNumber, to_block: Option - ) -> ChunkIterator>> - { + ) -> ChunkIterator>> { let mut read_opts = rocksdb::ReadOptions::default(); read_opts.set_snapshot(&self.transaction.snapshot()); - - let cursor = self.transaction.raw_iterator_cf_opt( - self.cf_handle(CF_CHUNKS), - read_opts - ); - - ChunkIterator::new( - cursor, - dataset_id, - from_block, - to_block - ) + + let cursor = self + .transaction + .raw_iterator_cf_opt(self.cf_handle(CF_CHUNKS), read_opts); + + ChunkIterator::new(cursor, dataset_id, from_block, to_block) } fn cf_handle(&self, name: &str) -> &ColumnFamily { self.db.cf_handle(name).unwrap() } -} \ No newline at end of file +} diff --git a/crates/storage/src/kv.rs b/crates/storage/src/kv.rs index 7cf3a589..4603dbb6 100644 --- a/crates/storage/src/kv.rs +++ b/crates/storage/src/kv.rs @@ -1,29 +1,26 @@ use std::ops::Deref; - pub trait KvWrite { fn put(&mut self, key: &[u8], value: &[u8]) -> anyhow::Result<()>; } - pub trait KvRead { type Cursor: KvReadCursor; - - fn get(&self, key: &[u8]) -> anyhow::Result>>; + + fn get(&self, key: &[u8]) -> anyhow::Result>>; fn new_cursor(&self) -> Self::Cursor; } - pub trait KvReadCursor { fn seek_first(&mut self) -> anyhow::Result<()>; - + fn seek(&mut self, key: &[u8]) -> anyhow::Result<()>; - + fn seek_prev(&mut self, key: &[u8]) -> anyhow::Result<()>; fn next(&mut self) -> anyhow::Result<()>; - + fn prev(&mut self) -> anyhow::Result<()>; fn is_valid(&self) -> bool; @@ -31,4 +28,4 @@ pub trait KvReadCursor { fn key(&self) -> &[u8]; fn value(&self) -> &[u8]; -} \ No newline at end of file +} diff --git a/crates/storage/src/lib.rs b/crates/storage/src/lib.rs index 5dd08c9f..88907c9e 100644 --- a/crates/storage/src/lib.rs +++ b/crates/storage/src/lib.rs @@ -1,3 +1,3 @@ -pub mod table; -pub mod kv; pub mod db; +pub mod kv; +pub mod table; diff --git a/crates/storage/src/table/key.rs b/crates/storage/src/table/key.rs index f18d13f7..7300f3bc 100644 --- a/crates/storage/src/table/key.rs +++ b/crates/storage/src/table/key.rs @@ -1,33 +1,24 @@ - enum TableKey { Schema, - Statistic { - column: u16 - }, - Offsets { - buffer: u16 - }, - Page { - buffer: u16, - index: u32 - } + Statistic { column: u16 }, + Offsets { buffer: u16 }, + Page { buffer: u16, index: u32 } } - impl TableKey { fn serialize(&self, out: &mut Vec) { match self { TableKey::Schema => { out.push(0); - }, + } TableKey::Statistic { column } => { out.push(1); out.extend_from_slice(&column.to_be_bytes()); - }, + } TableKey::Offsets { buffer } => { out.push(2); out.extend_from_slice(&buffer.to_be_bytes()) - }, + } TableKey::Page { buffer, index } => { out.push(3); out.extend_from_slice(&buffer.to_be_bytes()); @@ -37,14 +28,12 @@ impl TableKey { } } - #[derive(Clone)] pub struct TableKeyFactory { buf: Vec, name_len: usize } - impl TableKeyFactory { pub fn new>(table_name: N) -> Self { let name = table_name.as_ref(); @@ -55,19 +44,19 @@ impl TableKeyFactory { name_len: name.len() } } - + fn clear(&mut self) { unsafe { self.buf.set_len(self.name_len); } } - + pub fn start(&mut self) -> &[u8] { self.clear(); self.buf.push(0); self.buf.as_slice() } - + pub fn end(&mut self) -> &[u8] { self.clear(); self.buf.push(255); @@ -91,9 +80,7 @@ impl TableKeyFactory { } pub fn offsets(&mut self, buffer: usize) -> &[u8] { - self.make(TableKey::Offsets { - buffer: buffer as u16 - }) + self.make(TableKey::Offsets { buffer: buffer as u16 }) } pub fn page(&mut self, buffer: usize, page: usize) -> &[u8] { @@ -102,4 +89,4 @@ impl TableKeyFactory { index: page as u32 }) } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/mod.rs b/crates/storage/src/table/mod.rs index b5417421..b3bb1370 100644 --- a/crates/storage/src/table/mod.rs +++ b/crates/storage/src/table/mod.rs @@ -1,4 +1,4 @@ +pub(crate) mod key; pub mod read; pub mod stats; pub mod write; -pub(crate) mod key; \ No newline at end of file diff --git a/crates/storage/src/table/read/array.rs b/crates/storage/src/table/read/array.rs index a33556cd..7de104fc 100644 --- a/crates/storage/src/table/read/array.rs +++ b/crates/storage/src/table/read/array.rs @@ -1,21 +1,28 @@ +use std::sync::Arc; + use anyhow::{anyhow, ensure, Context}; -use arrow::array::{ArrayDataBuilder, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, FixedSizeBinaryArray, ListArray, PrimitiveArray, StringArray, StructArray}; -use arrow::buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::datatypes::{ArrowNativeType, DataType, FieldRef, Fields, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type}; +use arrow::{ + array::{ + ArrayDataBuilder, ArrayRef, ArrowPrimitiveType, BinaryArray, BooleanArray, FixedSizeBinaryArray, ListArray, + PrimitiveArray, StringArray, StructArray + }, + buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}, + datatypes::{ + ArrowNativeType, DataType, FieldRef, Fields, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, + Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type + } +}; use arrow_buffer::MutableBuffer; use rayon::prelude::*; use sqd_array::util::build_field_offsets; use sqd_primitives::range::RangeList; -use std::sync::Arc; - pub trait Storage: Sync { fn read_native( &self, buffer: usize, ranges: Option<&RangeList> - ) -> anyhow::Result> - { + ) -> anyhow::Result> { let buf = self.read_native_bytes(buffer, T::get_byte_width(), ranges)?; Ok(ScalarBuffer::from(buf)) } @@ -27,17 +34,9 @@ pub trait Storage: Sync { ranges: Option<&RangeList> ) -> anyhow::Result; - fn read_boolean( - &self, - buffer: usize, - ranges: Option<&RangeList> - ) -> anyhow::Result; - - fn read_null_mask( - &self, - buffer: usize, - ranges: Option<&RangeList> - ) -> anyhow::Result>; + fn read_boolean(&self, buffer: usize, ranges: Option<&RangeList>) -> anyhow::Result; + + fn read_null_mask(&self, buffer: usize, ranges: Option<&RangeList>) -> anyhow::Result>; fn read_offsets( &self, @@ -46,7 +45,6 @@ pub trait Storage: Sync { ) -> anyhow::Result<(OffsetBuffer, Option>)>; } - pub fn read_array( storage: &impl Storage, pos: usize, @@ -66,12 +64,9 @@ pub fn read_array( DataType::Float16 => read_primitive_array::(storage, pos, ranges), DataType::Float32 => read_primitive_array::(storage, pos, ranges), DataType::Float64 => read_primitive_array::(storage, pos, ranges), - DataType::Timestamp(unit, tz) => read_primitive_data( - storage, - pos, - ranges, - DataType::Timestamp(*unit, tz.clone()) - ), + DataType::Timestamp(unit, tz) => { + read_primitive_data(storage, pos, ranges, DataType::Timestamp(*unit, tz.clone())) + } DataType::Binary => read_binary(storage, pos, ranges), DataType::FixedSizeBinary(size) => read_fixed_size_binary(storage, pos, ranges, *size), DataType::Utf8 => read_string(storage, pos, ranges), @@ -81,17 +76,13 @@ pub fn read_array( } } - -fn read_boolean_array( - storage: &impl Storage, - pos: usize, - ranges: Option<&RangeList> -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +fn read_boolean_array(storage: &impl Storage, pos: usize, ranges: Option<&RangeList>) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - let values = storage.read_boolean(pos + 1, ranges) + let values = storage + .read_boolean(pos + 1, ranges) .context("failed to read boolean values")?; if let Some(mask) = nulls.as_ref() { @@ -106,17 +97,17 @@ fn read_boolean_array( Ok(Arc::new(array)) } - fn read_primitive_array( storage: &impl Storage, pos: usize, ranges: Option<&RangeList> -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - let values = storage.read_native::(pos + 1, ranges) + let values = storage + .read_native::(pos + 1, ranges) .context("failed to read values buffer")?; let array = PrimitiveArray::::try_new(values, nulls)?; @@ -124,20 +115,20 @@ fn read_primitive_array( Ok(Arc::new(array)) } - fn read_primitive_data( storage: &impl Storage, pos: usize, ranges: Option<&RangeList>, data_type: DataType -) -> anyhow::Result -{ +) -> anyhow::Result { let value_size = data_type.primitive_width().expect("not a primitive data type"); - let nulls = storage.read_null_mask(pos, ranges) + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - let values = storage.read_native_bytes(pos + 1, value_size, ranges) + let values = storage + .read_native_bytes(pos + 1, value_size, ranges) .context("failed to read values buffer")?; let array_data = ArrayDataBuilder::new(data_type) @@ -149,20 +140,17 @@ fn read_primitive_data( Ok(arrow::array::make_array(array_data)) } - -fn read_binary( - storage: &impl Storage, - pos: usize, - ranges: Option<&RangeList> -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +fn read_binary(storage: &impl Storage, pos: usize, ranges: Option<&RangeList>) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - let (offsets, value_ranges) = storage.read_offsets(pos + 1, ranges) + let (offsets, value_ranges) = storage + .read_offsets(pos + 1, ranges) .context("failed to read offsets")?; - let values = storage.read_native::(pos + 2, value_ranges.as_ref()) + let values = storage + .read_native::(pos + 2, value_ranges.as_ref()) .context("failed to read values array")?; let array = BinaryArray::try_new(offsets, values.into_inner(), nulls)?; @@ -170,29 +158,29 @@ fn read_binary( Ok(Arc::new(array)) } - fn read_fixed_size_binary( storage: &impl Storage, pos: usize, ranges: Option<&RangeList>, size: i32 -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; let value_ranges = ranges.map(|list| { unsafe { // SAFETY: monotonicity, non-emptiness and non-overlapping are guaranteed by construction RangeList::new_unchecked( - list.iter() - .map(|r| (r.start * size as u32)..(r.end * size as u32)) - .collect::>(), + list.iter() + .map(|r| (r.start * size as u32)..(r.end * size as u32)) + .collect::>() ) } }); - let values = storage.read_native::(pos + 1, value_ranges.as_ref()) + let values = storage + .read_native::(pos + 1, value_ranges.as_ref()) .context("failed to read values array")?; let array = FixedSizeBinaryArray::try_new(size, values.into_inner(), nulls)?; @@ -200,20 +188,17 @@ fn read_fixed_size_binary( Ok(Arc::new(array)) } - -fn read_string( - storage: &impl Storage, - pos: usize, - ranges: Option<&RangeList> -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +fn read_string(storage: &impl Storage, pos: usize, ranges: Option<&RangeList>) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - let (offsets, value_ranges) = storage.read_offsets(pos + 1, ranges) + let (offsets, value_ranges) = storage + .read_offsets(pos + 1, ranges) .context("failed to read offsets")?; - let values = storage.read_native::(pos + 2, value_ranges.as_ref()) + let values = storage + .read_native::(pos + 2, value_ranges.as_ref()) .context("failed to read values array")?; let array = StringArray::try_new(offsets, values.into_inner(), nulls)?; @@ -221,60 +206,49 @@ fn read_string( Ok(Arc::new(array)) } - fn read_list( storage: &impl Storage, pos: usize, ranges: Option<&RangeList>, field: FieldRef -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - let (offsets, value_ranges) = storage.read_offsets(pos + 1, ranges) + let (offsets, value_ranges) = storage + .read_offsets(pos + 1, ranges) .context("failed to read offsets")?; - let values = read_array( - storage, - pos + 2, - value_ranges.as_ref(), - field.data_type() - ).context("failed to read list values array")?; + let values = read_array(storage, pos + 2, value_ranges.as_ref(), field.data_type()) + .context("failed to read list values array")?; let array = ListArray::try_new(field, offsets, values, nulls)?; Ok(Arc::new(array)) } - fn read_struct( storage: &impl Storage, pos: usize, ranges: Option<&RangeList>, fields: Fields -) -> anyhow::Result -{ - let nulls = storage.read_null_mask(pos, ranges) +) -> anyhow::Result { + let nulls = storage + .read_null_mask(pos, ranges) .context("failed to read null mask")?; - + let field_positions = build_field_offsets(pos + 1, &fields); let arrays = (0..fields.len()) .into_par_iter() .map(|i| { - read_array( - storage, - field_positions[i], - ranges, - fields[i].data_type() - ).with_context(|| { - anyhow!("failed to read field `{}`", fields[i].name()) - }) + read_array(storage, field_positions[i], ranges, fields[i].data_type()) + .with_context(|| anyhow!("failed to read field `{}`", fields[i].name())) }) .collect::>>()?; - + let struct_array = StructArray::try_new(fields, arrays, nulls)?; - + Ok(Arc::new(struct_array)) -} \ No newline at end of file +} diff --git a/crates/storage/src/table/read/cursor_byte_reader.rs b/crates/storage/src/table/read/cursor_byte_reader.rs index 8a3831ab..334ce02f 100644 --- a/crates/storage/src/table/read/cursor_byte_reader.rs +++ b/crates/storage/src/table/read/cursor_byte_reader.rs @@ -1,9 +1,7 @@ -use crate::kv::KvReadCursor; -use crate::table::key::TableKeyFactory; use anyhow::{ensure, Context}; -use sqd_array::io::reader::ByteReader; -use sqd_array::util::bisect_offsets; +use sqd_array::{io::reader::ByteReader, util::bisect_offsets}; +use crate::{kv::KvReadCursor, table::key::TableKeyFactory}; pub struct CursorByteReader { cursor: C, @@ -13,36 +11,29 @@ pub struct CursorByteReader { current_page: Option } - impl CursorByteReader { - pub fn new( - cursor: C, - key: TableKeyFactory, - buffer: usize, - page_offsets: Vec - ) -> Self - { + pub fn new(cursor: C, key: TableKeyFactory, buffer: usize, page_offsets: Vec) -> Self { Self { - cursor, - key, - buffer, + cursor, + key, + buffer, page_offsets, current_page: None } } - + fn find_page(&self, offset: usize) -> usize { let offset = offset as u32; if let Some(page) = self.current_page { let beg = self.page_offsets[page]; let end = self.page_offsets[page + 1]; if beg <= offset && offset < end { - return page + return page; } } bisect_offsets(&self.page_offsets, offset).expect("out of bounds access") } - + fn go_to_page(&mut self, page: usize) -> anyhow::Result<()> { match self.current_page { Some(current) if current == page => return Ok(()), @@ -50,17 +41,17 @@ impl CursorByteReader { self.current_page = None; self.cursor.next()?; ensure!( - self.cursor.is_valid() && self.cursor.key() == self.key.page(self.buffer, page), + self.cursor.is_valid() && self.cursor.key() == self.key.page(self.buffer, page), "page was not found at expected place" ); - }, + } _ => { self.current_page = None; self.cursor.seek(self.key.page(self.buffer, page))?; ensure!(self.cursor.is_valid(), "page was not found") } } - + let expected_len = self.page_offsets[page + 1] - self.page_offsets[page]; ensure!( self.cursor.value().len() == expected_len as usize, @@ -68,13 +59,12 @@ impl CursorByteReader { expected_len, self.cursor.value().len() ); - + self.current_page = Some(page); Ok(()) } } - impl ByteReader for CursorByteReader { fn len(&self) -> usize { self.page_offsets.last().copied().unwrap() as usize @@ -83,17 +73,16 @@ impl ByteReader for CursorByteReader { fn read(&mut self, offset: usize, len: usize) -> anyhow::Result<&[u8]> { assert!(offset + len <= self.len()); let page = self.find_page(offset); - - self.go_to_page(page).with_context(|| { - format!("failed to navigate to page {} of buffer {}", page, self.buffer) - })?; - + + self.go_to_page(page) + .with_context(|| format!("failed to navigate to page {} of buffer {}", page, self.buffer))?; + let beg = offset - self.page_offsets[page] as usize; let end = std::cmp::min( - beg + len, + beg + len, self.page_offsets[page + 1] as usize - self.page_offsets[page] as usize ); - + Ok(&self.cursor.value()[beg..end]) } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/read/mod.rs b/crates/storage/src/table/read/mod.rs index 5868fbee..f0dc95ac 100644 --- a/crates/storage/src/table/read/mod.rs +++ b/crates/storage/src/table/read/mod.rs @@ -3,5 +3,4 @@ mod cursor_byte_reader; mod pagination; mod reader; - -pub use reader::*; \ No newline at end of file +pub use reader::*; diff --git a/crates/storage/src/table/read/pagination.rs b/crates/storage/src/table/read/pagination.rs index a0024bb7..07156318 100644 --- a/crates/storage/src/table/read/pagination.rs +++ b/crates/storage/src/table/read/pagination.rs @@ -1,7 +1,7 @@ -use anyhow::ensure; -use sqd_primitives::range::RangeList; use std::ops::Range; +use anyhow::ensure; +use sqd_primitives::range::RangeList; pub struct PageRead { page_index: usize, @@ -9,7 +9,6 @@ pub struct PageRead { ranges_slice: Range } - pub enum Pagination<'a> { Ranged { pages: Vec, @@ -22,8 +21,7 @@ pub enum Pagination<'a> { } } - -impl <'a> Pagination<'a> { +impl<'a> Pagination<'a> { pub fn new(page_offsets: &'a [u32], ranges: Option<&'a RangeList>) -> anyhow::Result { Ok(if let Some(ranges) = ranges { Pagination::Ranged { @@ -48,12 +46,8 @@ impl <'a> Pagination<'a> { pub fn num_items(&self) -> usize { match self { - Pagination::Ranged { ranges, .. } => { - ranges.iter().map(|r| r.len()).sum() - }, - Pagination::All { ranges, .. } => { - ranges[0].end as usize - } + Pagination::Ranged { ranges, .. } => ranges.iter().map(|r| r.len()).sum(), + Pagination::All { ranges, .. } => ranges[0].end as usize } } @@ -69,42 +63,35 @@ impl <'a> Pagination<'a> { pub fn page_range(&self, seq: usize) -> Range { let (offsets, idx) = match self { - Pagination::Ranged { page_offsets, pages, .. } => { - (page_offsets, pages[seq].page_index) - }, - Pagination::All { page_offsets, .. } => { - (page_offsets, seq) - } + Pagination::Ranged { + page_offsets, pages, .. + } => (page_offsets, pages[seq].page_index), + Pagination::All { page_offsets, .. } => (page_offsets, seq) }; offsets[idx] as usize..offsets[idx + 1] as usize } pub fn page_write_offset(&self, seq: usize) -> usize { match self { - Pagination::Ranged { pages, .. } => { - pages[seq].write_offset - }, - Pagination::All { page_offsets, .. } => { - page_offsets[seq] as usize - } + Pagination::Ranged { pages, .. } => pages[seq].write_offset, + Pagination::All { page_offsets, .. } => page_offsets[seq] as usize } } pub fn iter_ranges(&self, seq: usize) -> impl Iterator> + '_ { let (ranges, page_range) = match self { - Pagination::Ranged { pages, ranges, page_offsets } => { + Pagination::Ranged { + pages, + ranges, + page_offsets + } => { let page = &pages[seq]; ( &ranges.as_slice()[page.ranges_slice.clone()], page_offsets[page.page_index]..page_offsets[page.page_index + 1] ) - }, - Pagination::All { ranges, page_offsets } => { - ( - ranges.as_slice(), - page_offsets[seq]..page_offsets[seq + 1] - ) } + Pagination::All { ranges, page_offsets } => (ranges.as_slice(), page_offsets[seq]..page_offsets[seq + 1]) }; ranges.iter().map(move |r| { let beg = r.start.saturating_sub(page_range.start); @@ -114,7 +101,6 @@ impl <'a> Pagination<'a> { } } - fn build_page_reads(page_offsets: &[u32], ranges: &RangeList) -> anyhow::Result> { ensure!( page_offsets.last().cloned().unwrap() >= ranges.end(), @@ -141,7 +127,7 @@ fn build_page_reads(page_offsets: &[u32], ranges: &RangeList) -> anyhow::Re while let Some((idx, r)) = ranges.peek().cloned() { if r.start >= page_range.end { - break + break; } let beg = std::cmp::max(page_range.start, r.start); @@ -156,12 +142,12 @@ fn build_page_reads(page_offsets: &[u32], ranges: &RangeList) -> anyhow::Re } if end == page_range.end { - break + break; } } reads.push(read); - }, + } _ => {} } } @@ -169,4 +155,4 @@ fn build_page_reads(page_offsets: &[u32], ranges: &RangeList) -> anyhow::Re assert!(ranges.peek().is_none()); Ok(reads) -} \ No newline at end of file +} diff --git a/crates/storage/src/table/read/reader.rs b/crates/storage/src/table/read/reader.rs index 766fbe27..90df3a27 100644 --- a/crates/storage/src/table/read/reader.rs +++ b/crates/storage/src/table/read/reader.rs @@ -1,27 +1,36 @@ -use super::array::{read_array, Storage}; -use super::cursor_byte_reader::CursorByteReader; -use super::pagination::Pagination; -use crate::kv::{KvRead, KvReadCursor}; -use crate::table::key::TableKeyFactory; -use crate::table::stats::{can_have_stats, deserialize_stats, Stats, StatsBuilder}; +use std::{collections::HashSet, ops::Range, sync::Arc}; + use anyhow::{anyhow, ensure, Context}; -use arrow::array::{ArrayRef, BooleanBufferBuilder, RecordBatch, RecordBatchOptions}; -use arrow::buffer::{BooleanBuffer, MutableBuffer, OffsetBuffer, ScalarBuffer}; -use arrow::datatypes::{ArrowNativeType, Schema, SchemaRef}; -use arrow::util::bit_util; +use arrow::{ + array::{ArrayRef, BooleanBufferBuilder, RecordBatch, RecordBatchOptions}, + buffer::{BooleanBuffer, MutableBuffer, OffsetBuffer, ScalarBuffer}, + datatypes::{ArrowNativeType, Schema, SchemaRef}, + util::bit_util +}; use arrow_buffer::NullBuffer; use parking_lot::Mutex; use rayon::prelude::*; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::io::reader::{BitmaskIOReader, IOReader, NativeIOReader, NullmaskIOReader, OffsetsIOReader}; -use sqd_array::reader::{AnyReader, ArrayReader, Reader, ReaderFactory}; -use sqd_array::slice::AsSlice; -use sqd_array::util::{build_field_offsets, validate_offsets}; +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + io::reader::{BitmaskIOReader, IOReader, NativeIOReader, NullmaskIOReader, OffsetsIOReader}, + reader::{AnyReader, ArrayReader, Reader, ReaderFactory}, + slice::AsSlice, + util::{build_field_offsets, validate_offsets} +}; use sqd_primitives::range::RangeList; -use std::collections::HashSet; -use std::ops::Range; -use std::sync::Arc; +use super::{ + array::{read_array, Storage}, + cursor_byte_reader::CursorByteReader, + pagination::Pagination +}; +use crate::{ + kv::{KvRead, KvReadCursor}, + table::{ + key::TableKeyFactory, + stats::{can_have_stats, deserialize_stats, Stats, StatsBuilder} + } +}; pub struct TableReader { storage: S, @@ -33,25 +42,25 @@ pub struct TableReader { num_rows: usize } - impl TableReader { pub fn new(storage: S, table_name: &[u8]) -> anyhow::Result { let mut key = TableKeyFactory::new(table_name); let schema = { - let bytes = storage.get(key.schema())?.ok_or_else(|| { - anyhow!("schema key not found") - })?; - arrow::ipc::root_as_schema(&bytes).map(arrow::ipc::convert::fb_to_schema) + let bytes = storage + .get(key.schema())? + .ok_or_else(|| anyhow!("schema key not found"))?; + arrow::ipc::root_as_schema(&bytes) + .map(arrow::ipc::convert::fb_to_schema) .map_err(|_| anyhow!("failed to deserialize table schema"))? }; let column_positions = build_field_offsets(0, schema.fields()); - + let offsets = std::iter::repeat_with(Mutex::default) .take(column_positions.last().copied().unwrap()) .collect(); - + let stats = std::iter::repeat_with(Mutex::default) .take(schema.fields().len()) .collect(); @@ -65,7 +74,7 @@ impl TableReader { stats, num_rows: 0 }; - + if table.column_positions.len() > 1 { // Let's set number of rows // First buffer is always a null mask @@ -79,7 +88,7 @@ impl TableReader { pub fn schema(&self) -> SchemaRef { self.schema.clone() } - + pub fn num_rows(&self) -> usize { self.num_rows } @@ -96,20 +105,20 @@ impl TableReader { } fn read_column_stats(&self, column_index: usize) -> anyhow::Result> { - self.storage.get( - self.key.clone().statistic(column_index) - )?.map(|data| { - let data_type = self.schema.field(column_index).data_type(); - deserialize_stats(&data, data_type) - }).transpose() + self.storage + .get(self.key.clone().statistic(column_index))? + .map(|data| { + let data_type = self.schema.field(column_index).data_type(); + deserialize_stats(&data, data_type) + }) + .transpose() } pub fn read_table( &self, projection: Option<&HashSet<&str>>, - row_ranges: Option<&RangeList>, - ) -> anyhow::Result - { + row_ranges: Option<&RangeList> + ) -> anyhow::Result { let column_indexes = if let Some(projection) = projection { let mut columns = Vec::with_capacity(projection.len()); for (i, f) in self.schema.fields().iter().enumerate() { @@ -132,14 +141,15 @@ impl TableReader { ); } RecordBatch::try_new_with_options( - Schema::empty().into(), + Schema::empty().into(), vec![], &RecordBatchOptions::new().with_row_count(self.num_rows.into()) )? } else { - let columns = column_indexes.par_iter().map(|i| { - self.read_column(*i, row_ranges) - }).collect::>>()?; + let columns = column_indexes + .par_iter() + .map(|i| self.read_column(*i, row_ranges)) + .collect::>>()?; let schema = if column_indexes.len() == self.schema.fields().len() { self.schema.clone() @@ -156,27 +166,22 @@ impl TableReader { self, self.column_positions[index], ranges, - self.schema.field(index).data_type(), - ).with_context(|| { - format!("failed to read column '{}'", self.schema.field(index).name()) - }) + self.schema.field(index).data_type() + ) + .with_context(|| format!("failed to read column '{}'", self.schema.field(index).name())) } pub fn create_column_reader( &self, column_index: usize - ) -> anyhow::Result::Cursor>>>> - { + ) -> anyhow::Result::Cursor>>>> { let mut factory = CursorReaderFactory { table: self, buffer: self.column_positions[column_index] }; - - let reader = AnyReader::from_factory( - &mut factory, - self.schema.field(column_index).data_type() - )?; - + + let reader = AnyReader::from_factory(&mut factory, self.schema.field(column_index).data_type())?; + ensure!( reader.len() == self.num_rows, "column {} ({}) has length {}, but {} was expected", @@ -185,21 +190,21 @@ impl TableReader { reader.len(), self.num_rows ); - + Ok(reader) } pub fn build_column_stats(&self, window: usize, column_index: usize) -> anyhow::Result { ensure!(window > 0); - + let data_type = self.schema.field(column_index).data_type(); - + ensure!( can_have_stats(data_type), "stats are not supported for columns of type {}", data_type ); - + let mut reader = self.create_column_reader(column_index)?; let mut array_builder = AnyBuilder::new(data_type); let mut stats_builder = StatsBuilder::new(data_type.clone()); @@ -212,7 +217,7 @@ impl TableReader { array_builder.clear(); pos += window; } - + if end - pos > window { let window = (end - pos) / 2; reader.read_slice(&mut array_builder, pos, window)?; @@ -225,26 +230,21 @@ impl TableReader { reader.read_slice(&mut array_builder, pos, end - pos)?; stats_builder.push_entry(&array_builder.as_slice()); } - + Ok(stats_builder.finish()) } - fn get_buffer_pages( - &self, - buffer: usize, - ) -> anyhow::Result> - { + fn get_buffer_pages(&self, buffer: usize) -> anyhow::Result> { let mut offsets_lock = self.offsets[buffer].lock(); if let Some(buf) = offsets_lock.as_ref() { return Ok(buf.clone()); } let mut key = self.key.clone(); - let page = self.storage.get( - key.offsets(buffer) - )?.ok_or_else(|| { - anyhow!("offsets page was not found") - })?; + let page = self + .storage + .get(key.offsets(buffer))? + .ok_or_else(|| anyhow!("offsets page was not found"))?; let offsets = { let item_size = u32::get_byte_width(); @@ -261,9 +261,7 @@ impl TableReader { validate_offsets(&offsets, 0).map_err(|msg| anyhow!(msg))?; ensure!(offsets[0] == 0); - let offsets = unsafe { - OffsetBuffer::new_unchecked(offsets) - }; + let offsets = unsafe { OffsetBuffer::new_unchecked(offsets) }; *offsets_lock = Some(offsets.clone()); @@ -274,23 +272,21 @@ impl TableReader { &self, buffer: usize, pagination: &Pagination<'_>, - mut cb: F, - ) -> anyhow::Result<()> - { + mut cb: F + ) -> anyhow::Result<()> { match pagination.num_pages() { - 0 => {}, + 0 => {} 1 => { let page_idx = pagination.page_index(0); let mut key = self.key.clone(); - let value = self.storage.get( - key.page(buffer, page_idx) - )?.ok_or_else(|| { - anyhow!("page {} was not found", page_idx) - })?; + let value = self + .storage + .get(key.page(buffer, page_idx))? + .ok_or_else(|| anyhow!("page {} was not found", page_idx))?; cb(0, &value)?; - }, + } n => { let mut key = self.key.clone(); let mut prev_page_idx = 0; @@ -319,9 +315,8 @@ impl TableReader { fn read_offsets( &self, buffer: usize, - ranges: Option<&RangeList>, - ) -> anyhow::Result<(OffsetBuffer, Option>)> - { + ranges: Option<&RangeList> + ) -> anyhow::Result<(OffsetBuffer, Option>)> { Ok(if let Some(ranges) = ranges { let (offsets, value_ranges) = self.read_ranged_offsets(buffer, ranges)?; (offsets, Some(value_ranges)) @@ -332,35 +327,27 @@ impl TableReader { } fn read_all_offsets(&self, buffer: usize) -> anyhow::Result> { - let offsets = self.read_native_bytes(buffer, i32::get_byte_width(), None) + let offsets = self + .read_native_bytes(buffer, i32::get_byte_width(), None) .map(ScalarBuffer::::from)?; validate_offsets(&offsets, 0).map_err(|msg| anyhow!(msg))?; - Ok(unsafe { - OffsetBuffer::new_unchecked(offsets) - }) + Ok(unsafe { OffsetBuffer::new_unchecked(offsets) }) } fn read_ranged_offsets( &self, buffer: usize, - ranges: &RangeList, - ) -> anyhow::Result<(OffsetBuffer, RangeList)> - { + ranges: &RangeList + ) -> anyhow::Result<(OffsetBuffer, RangeList)> { if ranges.len() == 0 { return Ok((OffsetBuffer::new_empty(), RangeList::new(vec![]))); } - let offset_ranges = RangeList::seal(ranges.iter().map(|r| { - r.start..r.end + 1 - })); + let offset_ranges = RangeList::seal(ranges.iter().map(|r| r.start..r.end + 1)); - let mut buf = self.read_native_bytes( - buffer, - i32::get_byte_width(), - Some(&offset_ranges), - )?; + let mut buf = self.read_native_bytes(buffer, i32::get_byte_width(), Some(&offset_ranges))?; let offsets = buf.typed_data_mut::(); validate_offsets(offsets, 0).map_err(|msg| anyhow!(msg))?; @@ -374,12 +361,8 @@ impl TableReader { let vr = offsets[beg] as u32..offsets[end] as u32; if vr.start < vr.end { match value_ranges.last_mut() { - Some(prev) if prev.end == vr.start => { - prev.end = vr.end - } - _ => { - value_ranges.push(vr) - } + Some(prev) if prev.end == vr.start => prev.end = vr.end, + _ => value_ranges.push(vr) } } pos = end + 1; @@ -416,9 +399,8 @@ impl TableReader { &self, buffer: usize, item_size: usize, - ranges: Option<&RangeList>, - ) -> anyhow::Result - { + ranges: Option<&RangeList> + ) -> anyhow::Result { let page_offsets = self.get_buffer_pages(buffer)?; let pagination = Pagination::new(&page_offsets, ranges)?; let mut buf = MutableBuffer::from_len_zeroed(pagination.num_items() * item_size); @@ -428,7 +410,7 @@ impl TableReader { buffer, &pagination, 0..pagination.num_pages(), - buf.as_slice_mut(), + buf.as_slice_mut() )?; Ok(buf) @@ -440,9 +422,8 @@ impl TableReader { buffer: usize, pagination: &Pagination, pages: Range, - dest: &mut [u8], - ) -> anyhow::Result<()> - { + dest: &mut [u8] + ) -> anyhow::Result<()> { match pages.len() { 0 => Ok(()), 1 => { @@ -460,7 +441,7 @@ impl TableReader { let (lower_res, upper_res) = rayon::join( || self.read_native_par(item_size, buffer, pagination, lower, lower_buf), - || self.read_native_par(item_size, buffer, pagination, upper, upper_buf), + || self.read_native_par(item_size, buffer, pagination, upper, upper_buf) ); lower_res?; @@ -477,17 +458,15 @@ impl TableReader { buffer: usize, pagination: &Pagination, page_seq: usize, - dest: &mut [u8], - ) -> anyhow::Result<()> - { + dest: &mut [u8] + ) -> anyhow::Result<()> { let page_idx = pagination.page_index(page_seq); let mut key = self.key.clone(); - let data = self.storage.get( - key.page(buffer, page_idx) - )?.with_context(|| { - anyhow!("page {} was not found", page_idx) - })?; + let data = self + .storage + .get(key.page(buffer, page_idx))? + .with_context(|| anyhow!("page {} was not found", page_idx))?; ensure!( data.len() % item_size == 0, @@ -518,31 +497,17 @@ impl TableReader { Ok(()) } - fn read_boolean( - &self, - buffer: usize, - ranges: Option<&RangeList>, - ) -> anyhow::Result - { + fn read_boolean(&self, buffer: usize, ranges: Option<&RangeList>) -> anyhow::Result { let page_offsets = self.get_buffer_pages(buffer)?; self.read_bitmask(buffer, page_offsets, ranges) } - fn read_null_mask( - &self, - buffer: usize, - ranges: Option<&RangeList>, - ) -> anyhow::Result> - { + fn read_null_mask(&self, buffer: usize, ranges: Option<&RangeList>) -> anyhow::Result> { let page_offsets = self.get_nullmask_pages(buffer)?; if page_offsets.len() == 2 { Ok(None) } else { - let values = self.read_bitmask( - buffer, - page_offsets.slice(0, page_offsets.len() - 1), - ranges - )?; + let values = self.read_bitmask(buffer, page_offsets.slice(0, page_offsets.len() - 1), ranges)?; let nulls = NullBuffer::new(values); Ok(Some(nulls)) } @@ -550,12 +515,15 @@ impl TableReader { fn get_nullmask_pages(&self, buffer: usize) -> anyhow::Result> { let pages = self.get_buffer_pages(buffer)?; - + ensure!(pages.len() >= 2, "nullmask offsets should contain at least 2 pages"); - + if pages.len() > 2 { let bit_len = pages[pages.len() - 1]; - ensure!(pages[pages.len() - 2] == bit_len, "bitmask and nullmask lengths are different"); + ensure!( + pages[pages.len() - 2] == bit_len, + "bitmask and nullmask lengths are different" + ); } Ok(pages) @@ -566,8 +534,7 @@ impl TableReader { buffer: usize, page_offsets: OffsetBuffer, ranges: Option<&RangeList> - ) -> anyhow::Result - { + ) -> anyhow::Result { let pagination = Pagination::new(&page_offsets, ranges)?; let mut buf = BooleanBufferBuilder::new(pagination.num_items()); self.for_each_page(buffer, &pagination, |i, data| { @@ -589,9 +556,13 @@ impl TableReader { } } - impl Storage for TableReader { - fn read_native_bytes(&self, buffer: usize, item_size: usize, ranges: Option<&RangeList>) -> anyhow::Result { + fn read_native_bytes( + &self, + buffer: usize, + item_size: usize, + ranges: Option<&RangeList> + ) -> anyhow::Result { self.read_native_bytes(buffer, item_size, ranges) } @@ -603,39 +574,44 @@ impl Storage for TableReader { self.read_null_mask(buffer, ranges) } - fn read_offsets(&self, buffer: usize, ranges: Option<&RangeList>) -> anyhow::Result<(OffsetBuffer, Option>)> { + fn read_offsets( + &self, + buffer: usize, + ranges: Option<&RangeList> + ) -> anyhow::Result<(OffsetBuffer, Option>)> { self.read_offsets(buffer, ranges) } } - struct CursorReaderFactory<'a, S> { table: &'a TableReader, buffer: usize } - impl<'a, S: KvRead + Sync> CursorReaderFactory<'a, S> { fn next_bitmask( &mut self, pages: OffsetBuffer - ) -> anyhow::Result>> - { + ) -> anyhow::Result>> { let bit_len = pages.last().copied().unwrap() as usize; - let byte_offsets = pages.iter().enumerate().map(|(i, &o)| { - Ok(if i == pages.len() - 1 { - bit_util::ceil(o as usize, 8) as u32 - } else { - ensure!( - o % 8 == 0, - "unaligned intermediate bitmask page: buffer {}, page {}", - self.buffer, - i - ); - o / 8 + let byte_offsets = pages + .iter() + .enumerate() + .map(|(i, &o)| { + Ok(if i == pages.len() - 1 { + bit_util::ceil(o as usize, 8) as u32 + } else { + ensure!( + o % 8 == 0, + "unaligned intermediate bitmask page: buffer {}, page {}", + self.buffer, + i + ); + o / 8 + }) }) - }).collect::>>()?; + .collect::>>()?; let byte_reader = CursorByteReader::new( self.table.storage.new_cursor(), @@ -651,9 +627,7 @@ impl<'a, S: KvRead + Sync> CursorReaderFactory<'a, S> { fn next_native_cursor(&mut self, item_size: usize) -> anyhow::Result> { let pages = self.table.get_buffer_pages(self.buffer)?; - let byte_offsets: Vec = pages.iter() - .map(|o| *o * item_size as u32) - .collect(); + let byte_offsets: Vec = pages.iter().map(|o| *o * item_size as u32).collect(); let byte_reader = CursorByteReader::new( self.table.storage.new_cursor(), @@ -667,7 +641,6 @@ impl<'a, S: KvRead + Sync> CursorReaderFactory<'a, S> { } } - impl<'a, S: KvRead + Sync> ReaderFactory for CursorReaderFactory<'a, S> { type Reader = IOReader>; @@ -698,4 +671,4 @@ impl<'a, S: KvRead + Sync> ReaderFactory for CursorReaderFactory<'a, S> { let byte_reader = self.next_native_cursor(i32::get_byte_width())?; OffsetsIOReader::new(byte_reader) } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/stats/builder.rs b/crates/storage/src/table/stats/builder.rs index b0174a1a..8dc2947e 100644 --- a/crates/storage/src/table/stats/builder.rs +++ b/crates/storage/src/table/stats/builder.rs @@ -1,10 +1,12 @@ -use super::{can_have_stats, Stats}; use arrow::datatypes::DataType; use arrow_buffer::{ArrowNativeType, OffsetBuffer}; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::slice::{AnySlice, FixedSizeListSlice, ListSlice, PrimitiveSlice, Slice}; -use sqd_array::writer::ArrayWriter; +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + slice::{AnySlice, FixedSizeListSlice, ListSlice, PrimitiveSlice, Slice}, + writer::ArrayWriter +}; +use super::{can_have_stats, Stats}; pub struct StatsBuilder { data_type: DataType, @@ -14,7 +16,6 @@ pub struct StatsBuilder { max: AnyBuilder } - impl StatsBuilder { pub fn new(data_type: DataType) -> Self { assert!(can_have_stats(&data_type), "data type {} can't have stats", data_type); @@ -28,9 +29,7 @@ impl StatsBuilder { } pub fn finish(self) -> Stats { - let offsets = unsafe { - OffsetBuffer::new_unchecked(self.offsets.into()) - }; + let offsets = unsafe { OffsetBuffer::new_unchecked(self.offsets.into()) }; Stats { offsets, min: self.min.finish(), @@ -45,26 +44,22 @@ impl StatsBuilder { DataType::Int8 => self.push_primitive(values.as_i8()), DataType::Int16 => self.push_primitive(values.as_i16()), - DataType::Int32 | - DataType::Date32 | - DataType::Time32(_) => self.push_primitive(values.as_i32()), + DataType::Int32 | DataType::Date32 | DataType::Time32(_) => self.push_primitive(values.as_i32()), - DataType::Int64 | - DataType::Timestamp(_, _) | - DataType::Date64 | - DataType::Time64(_) | - DataType::Duration(_) | - DataType::Interval(_) => self.push_primitive(values.as_i64()), + DataType::Int64 + | DataType::Timestamp(_, _) + | DataType::Date64 + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Interval(_) => self.push_primitive(values.as_i64()), DataType::UInt8 => self.push_primitive(values.as_u8()), DataType::UInt16 => self.push_primitive(values.as_u16()), DataType::UInt32 => self.push_primitive(values.as_u32()), DataType::UInt64 => self.push_primitive(values.as_u64()), - DataType::Binary | - DataType::Utf8 => self.push_binary(values.as_binary()), - DataType::FixedSizeBinary(_) => - self.push_fixed_size_binary(values.as_fixed_size_binary()), + DataType::Binary | DataType::Utf8 => self.push_binary(values.as_binary()), + DataType::FixedSizeBinary(_) => self.push_fixed_size_binary(values.as_fixed_size_binary()), ty => unreachable!("unexpected arrow type - {}", ty) }; } @@ -99,7 +94,7 @@ impl StatsBuilder { min_builder.append_null(); max_builder.append_null() } - }, + } (AnyBuilder::String(min_builder), AnyBuilder::String(max_builder)) => { if let Some((min, max)) = min_max { // FIXME: we should not crush here @@ -109,7 +104,7 @@ impl StatsBuilder { min_builder.append_null(); max_builder.append_null() } - }, + } _ => unreachable!() }; } @@ -121,10 +116,7 @@ impl StatsBuilder { }); match (&mut self.min, &mut self.max) { - ( - AnyBuilder::FixedSizeBinary(min_builder), - AnyBuilder::FixedSizeBinary(max_builder), - ) => { + (AnyBuilder::FixedSizeBinary(min_builder), AnyBuilder::FixedSizeBinary(max_builder)) => { if let Some((min, max)) = min_max { min_builder.append(min); max_builder.append(max) @@ -133,7 +125,7 @@ impl StatsBuilder { max_builder.append_null() } } - _ => unreachable!(), + _ => unreachable!() }; } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/stats/mod.rs b/crates/storage/src/table/stats/mod.rs index 2df8e85d..e09bc953 100644 --- a/crates/storage/src/table/stats/mod.rs +++ b/crates/storage/src/table/stats/mod.rs @@ -1,16 +1,14 @@ mod builder; mod serde; - -use arrow::array::ArrayRef; -use arrow::datatypes::{DataType, TimeUnit}; +use arrow::{ + array::ArrayRef, + datatypes::{DataType, TimeUnit} +}; use arrow_buffer::OffsetBuffer; - - pub use builder::*; pub use serde::*; - #[derive(Clone)] pub struct Stats { pub offsets: OffsetBuffer, @@ -18,7 +16,6 @@ pub struct Stats { pub max: ArrayRef } - pub fn can_have_stats(data_type: &DataType) -> bool { match data_type { DataType::Int8 => true, @@ -36,4 +33,4 @@ pub fn can_have_stats(data_type: &DataType) -> bool { DataType::Utf8 => true, _ => false } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/stats/serde.rs b/crates/storage/src/table/stats/serde.rs index 769b8e74..e0d8350b 100644 --- a/crates/storage/src/table/stats/serde.rs +++ b/crates/storage/src/table/stats/serde.rs @@ -1,32 +1,38 @@ -use super::Stats; +use std::io::Write; + use anyhow::{anyhow, ensure}; -use arrow::array::{Array, ArrayRef}; -use arrow::datatypes::DataType; +use arrow::{ + array::{Array, ArrayRef}, + datatypes::DataType +}; use arrow_buffer::{MutableBuffer, OffsetBuffer, ScalarBuffer}; -use sqd_array::builder::{AnyBuilder, ArrayBuilder}; -use sqd_array::io::dense::{DenseReader, DenseWriter}; -use sqd_array::io::writer::IOWriter; -use sqd_array::reader::{AnyReader, ArrayReader, NativeReader, ReaderFactory}; -use sqd_array::slice::{AsSlice, Slice}; -use sqd_array::util::validate_offsets; -use sqd_array::writer::{AnyArrayWriter, NativeWriter, WriterFactory}; -use std::io::Write; +use sqd_array::{ + builder::{AnyBuilder, ArrayBuilder}, + io::{ + dense::{DenseReader, DenseWriter}, + writer::IOWriter + }, + reader::{AnyReader, ArrayReader, NativeReader, ReaderFactory}, + slice::{AsSlice, Slice}, + util::validate_offsets, + writer::{AnyArrayWriter, NativeWriter, WriterFactory} +}; +use super::Stats; pub fn serialize_stats(out: &mut W, stats: &Stats) -> anyhow::Result<()> { let mut file = DenseWriter::new(out); - + let mut offsets_writer = file.native::()?; offsets_writer.write_slice(&stats.offsets)?; offsets_writer.into_write().finish(); - + ser_array(&mut file, &stats.min)?; ser_array(&mut file, &stats.max)?; file.finish()?; Ok(()) } - fn ser_array(file: &mut DenseWriter, array: &dyn Array) -> anyhow::Result<()> { let mut writer = AnyArrayWriter::from_factory(file, array.data_type())?; array.as_slice().write(&mut writer)?; @@ -36,7 +42,6 @@ fn ser_array(file: &mut DenseWriter, array: &dyn Array) -> anyhow:: Ok(()) } - pub fn deserialize_stats(input: &[u8], data_type: &DataType) -> anyhow::Result { let mut reader = DenseReader::new(input)?; @@ -46,9 +51,7 @@ pub fn deserialize_stats(input: &[u8], data_type: &DataType) -> anyhow::Result::from(builder); validate_offsets(&offsets, 0).map_err(|msg| anyhow!(msg))?; ensure!(offsets[0] == 0, "offsets array does not start with 0"); - unsafe { - OffsetBuffer::new_unchecked(offsets) - } + unsafe { OffsetBuffer::new_unchecked(offsets) } }; let min = de_array(&mut reader, data_type)?; @@ -69,14 +72,9 @@ pub fn deserialize_stats(input: &[u8], data_type: &DataType) -> anyhow::Result, data_type: &DataType) -> anyhow::Result { let mut builder = AnyBuilder::new(data_type); AnyReader::from_factory(reader, data_type)?.read(&mut builder)?; diff --git a/crates/storage/src/table/write/bitmask.rs b/crates/storage/src/table/write/bitmask.rs index f31bb998..d0be5b10 100644 --- a/crates/storage/src/table/write/bitmask.rs +++ b/crates/storage/src/table/write/bitmask.rs @@ -1,15 +1,11 @@ use std::sync::LazyLock; -use parking_lot::RwLock; -use super::page::PageWriter; -use sqd_array::builder::bitmask::BitmaskBuilder; -use sqd_array::index::RangeList; -use sqd_array::writer::BitmaskWriter; +use parking_lot::RwLock; +use sqd_array::{builder::bitmask::BitmaskBuilder, index::RangeList, writer::BitmaskWriter}; -pub static PAGE_SIZE: LazyLock> = LazyLock::new(|| { - RwLock::new(16 * 1024) -}); +use super::page::PageWriter; +pub static PAGE_SIZE: LazyLock> = LazyLock::new(|| RwLock::new(16 * 1024)); pub struct BitmaskPageWriter

{ page_writer: P, @@ -17,7 +13,6 @@ pub struct BitmaskPageWriter

{ page_size: usize } - impl BitmaskPageWriter

{ pub fn new(page_writer: P) -> Self { let page_size = PAGE_SIZE.read().clone(); @@ -36,7 +31,7 @@ impl BitmaskPageWriter

{ Ok(()) } } - + #[inline(never)] fn do_flush(&mut self) -> anyhow::Result<()> { let mut byte_offset = 0; @@ -52,7 +47,7 @@ impl BitmaskPageWriter

{ pub fn finish(mut self) -> anyhow::Result

{ let mut byte_offset = 0; let byte_end = self.builder.bytes_size(); - + if byte_end > self.page_size { let split_byte = byte_end / 2; let bytes = &self.builder.data()[0..split_byte]; @@ -66,19 +61,18 @@ impl BitmaskPageWriter

{ &self.builder.data()[byte_offset..byte_end] )?; } - + Ok(self.page_writer) } } - impl BitmaskWriter for BitmaskPageWriter

{ fn write_slice(&mut self, data: &[u8], offset: usize, len: usize) -> anyhow::Result<()> { self.builder.append_slice(data, offset, len); self.flush() } - fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()> { + fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()> { self.builder.append_slice_indexes(data, indexes); self.flush() } @@ -96,4 +90,4 @@ impl BitmaskWriter for BitmaskPageWriter

{ self.builder.append_many(val, count); self.flush() } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/mod.rs b/crates/storage/src/table/write/mod.rs index b40903aa..a213de1d 100644 --- a/crates/storage/src/table/write/mod.rs +++ b/crates/storage/src/table/write/mod.rs @@ -7,11 +7,9 @@ mod storage_cell; mod storage_writer; mod table; - pub use storage_cell::*; pub use table::*; - pub fn use_small_buffers() -> RestoreBufferSizesGuard { RestoreBufferSizesGuard { bitmask_page_size: set_buf_size(&bitmask::PAGE_SIZE, 4), @@ -20,14 +18,12 @@ pub fn use_small_buffers() -> RestoreBufferSizesGuard { } } - pub struct RestoreBufferSizesGuard { bitmask_page_size: usize, native_page_size: usize, offset_page_len: usize } - impl Drop for RestoreBufferSizesGuard { fn drop(&mut self) { set_buf_size(&bitmask::PAGE_SIZE, self.bitmask_page_size); @@ -36,10 +32,9 @@ impl Drop for RestoreBufferSizesGuard { } } - fn set_buf_size(cell: &parking_lot::RwLock, new_val: usize) -> usize { let mut lock = cell.write(); let current = lock.clone(); *lock = new_val; current -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/native.rs b/crates/storage/src/table/write/native.rs index 252bc379..8d7bca81 100644 --- a/crates/storage/src/table/write/native.rs +++ b/crates/storage/src/table/write/native.rs @@ -1,16 +1,13 @@ use std::sync::LazyLock; -use crate::table::write::page::PageWriter; + use anyhow::ensure; use arrow_buffer::{bit_util, ArrowNativeType, MutableBuffer, ToByteSlice}; use parking_lot::RwLock; -use sqd_array::index::RangeList; -use sqd_array::writer::NativeWriter; - +use sqd_array::{index::RangeList, writer::NativeWriter}; -pub static PAGE_SIZE: LazyLock> = LazyLock::new(|| { - RwLock::new(64 * 1024) -}); +use crate::table::write::page::PageWriter; +pub static PAGE_SIZE: LazyLock> = LazyLock::new(|| RwLock::new(64 * 1024)); pub struct NativePageWriter

{ page_writer: P, @@ -19,7 +16,6 @@ pub struct NativePageWriter

{ page_size: usize } - impl NativePageWriter

{ pub fn new(page_writer: P, item_size: usize) -> Self { assert!(item_size > 0); @@ -44,9 +40,7 @@ impl NativePageWriter

{ #[inline(never)] fn do_flush(&mut self) -> anyhow::Result<()> { let mut byte_offset = 0; - while self.buffer.len() - byte_offset > self.page_size * 3 / 2 - && self.buffer.len() >= self.item_size - { + while self.buffer.len() - byte_offset > self.page_size * 3 / 2 && self.buffer.len() >= self.item_size { let len = bit_util::ceil(self.page_size, self.item_size); let byte_end = byte_offset + len * self.item_size; let bytes = &self.buffer[byte_offset..byte_end]; @@ -60,16 +54,13 @@ impl NativePageWriter

{ self.buffer.truncate(bytes_left); Ok(()) } - + pub fn finish(mut self) -> anyhow::Result

{ - ensure!( - self.buffer.len() % self.item_size == 0, - "got partially written item" - ); - + ensure!(self.buffer.len() % self.item_size == 0, "got partially written item"); + let mut byte_offset = 0; let byte_end = self.buffer.len(); - + if byte_end > self.page_size { let len = bit_util::ceil(self.page_size, self.item_size); let split_byte = len * self.item_size; @@ -77,19 +68,16 @@ impl NativePageWriter

{ self.page_writer.write_page(len, bytes)?; byte_offset = split_byte; } - + if byte_end > byte_offset { - self.page_writer.write_page( - (byte_end - byte_offset) / self.item_size, - &self.buffer[byte_offset..] - )?; + self.page_writer + .write_page((byte_end - byte_offset) / self.item_size, &self.buffer[byte_offset..])?; } - + Ok(self.page_writer) } } - impl NativeWriter for NativePageWriter

{ #[inline] fn write(&mut self, value: T) -> anyhow::Result<()> { @@ -98,7 +86,7 @@ impl NativeWriter for NativePageWriter

{ } #[inline] - fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()> { + fn write_iter(&mut self, values: impl Iterator) -> anyhow::Result<()> { for v in values { self.write(v)?; } @@ -112,14 +100,22 @@ impl NativeWriter for NativePageWriter

{ } #[inline] - fn write_slice_indexes(&mut self, values: &[T], indexes: impl Iterator) -> anyhow::Result<()> { + fn write_slice_indexes( + &mut self, + values: &[T], + indexes: impl Iterator + ) -> anyhow::Result<()> { self.buffer.write_slice_indexes(values, indexes)?; self.flush() } #[inline] - fn write_slice_ranges(&mut self, values: &[T], ranges: &mut impl RangeList) -> anyhow::Result<()> { + fn write_slice_ranges( + &mut self, + values: &[T], + ranges: &mut impl RangeList + ) -> anyhow::Result<()> { self.buffer.write_slice_ranges(values, ranges)?; self.flush() } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/nullmask.rs b/crates/storage/src/table/write/nullmask.rs index 028c1755..6145527b 100644 --- a/crates/storage/src/table/write/nullmask.rs +++ b/crates/storage/src/table/write/nullmask.rs @@ -1,9 +1,6 @@ -use super::bitmask::BitmaskPageWriter; -use super::page::PageWriter; -use sqd_array::index::RangeList; -use sqd_array::util::bit_tools; -use sqd_array::writer::BitmaskWriter; +use sqd_array::{index::RangeList, util::bit_tools, writer::BitmaskWriter}; +use super::{bitmask::BitmaskPageWriter, page::PageWriter}; pub struct NullmaskPageWriter

{ nulls: BitmaskPageWriter

, @@ -11,7 +8,6 @@ pub struct NullmaskPageWriter

{ len: usize } - impl NullmaskPageWriter

{ pub fn new(page_writer: P) -> Self { Self { @@ -24,17 +20,17 @@ impl NullmaskPageWriter

{ #[inline] fn check_bitmask_presence(&mut self, all_valid: impl FnOnce() -> Option) -> anyhow::Result { if self.has_nulls { - return Ok(true) + return Ok(true); } if let Some(len) = all_valid() { self.len += len; - return Ok(false) + return Ok(false); } self.has_nulls = true; self.nulls.write_many(true, self.len)?; Ok(true) } - + pub fn finish(self) -> anyhow::Result

{ let mut page_writer = self.nulls.finish()?; page_writer.write_page(if self.has_nulls { 0 } else { self.len }, &[])?; @@ -42,7 +38,6 @@ impl NullmaskPageWriter

{ } } - impl BitmaskWriter for NullmaskPageWriter

{ fn write_slice(&mut self, data: &[u8], offset: usize, len: usize) -> anyhow::Result<()> { if self.check_bitmask_presence(|| bit_tools::all_valid(data, offset, len).then_some(len))? { @@ -51,7 +46,7 @@ impl BitmaskWriter for NullmaskPageWriter

{ Ok(()) } - fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()> { + fn write_slice_indexes(&mut self, data: &[u8], indexes: impl Iterator + Clone) -> anyhow::Result<()> { if self.check_bitmask_presence(|| bit_tools::all_indexes_valid(data, indexes.clone()))? { self.nulls.write_slice_indexes(data, indexes)?; } @@ -67,14 +62,14 @@ impl BitmaskWriter for NullmaskPageWriter

{ fn write_many(&mut self, val: bool, count: usize) -> anyhow::Result<()> { if count == 0 { - return Ok(()) + return Ok(()); } match (self.has_nulls, val) { (true, val) => self.nulls.write_many(val, count), (false, true) => { self.len += count; Ok(()) - }, + } (false, false) => { self.has_nulls = true; self.nulls.write_many(true, self.len)?; @@ -82,4 +77,4 @@ impl BitmaskWriter for NullmaskPageWriter

{ } } } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/offsets.rs b/crates/storage/src/table/write/offsets.rs index 0b44e98c..444ea8c6 100644 --- a/crates/storage/src/table/write/offsets.rs +++ b/crates/storage/src/table/write/offsets.rs @@ -1,17 +1,12 @@ -use crate::table::write::page::PageWriter; -use arrow_buffer::ToByteSlice; -use parking_lot::RwLock; -use sqd_array::builder::offsets::OffsetsBuilder; -use sqd_array::index::RangeList; -use sqd_array::offsets::Offsets; -use sqd_array::writer::OffsetsWriter; use std::sync::LazyLock; +use arrow_buffer::ToByteSlice; +use parking_lot::RwLock; +use sqd_array::{builder::offsets::OffsetsBuilder, index::RangeList, offsets::Offsets, writer::OffsetsWriter}; -pub static PAGE_LEN: LazyLock> = LazyLock::new(|| { - RwLock::new(16 * 1024) -}); +use crate::table::write::page::PageWriter; +pub static PAGE_LEN: LazyLock> = LazyLock::new(|| RwLock::new(16 * 1024)); pub struct OffsetPageWriter

{ page_writer: P, @@ -19,7 +14,6 @@ pub struct OffsetPageWriter

{ page_len: usize } - impl OffsetPageWriter

{ pub fn new(page_writer: P) -> Self { let page_len = PAGE_LEN.read().clone(); @@ -61,10 +55,7 @@ impl OffsetPageWriter

{ if data.len() > self.page_len { let split = data.len() / 2; - self.page_writer.write_page( - split, - data[0..split].to_byte_slice() - )?; + self.page_writer.write_page(split, data[0..split].to_byte_slice())?; offset = split; } @@ -75,14 +66,17 @@ impl OffsetPageWriter

{ } } - impl OffsetsWriter for OffsetPageWriter

{ fn write_slice(&mut self, offsets: Offsets<'_>) -> anyhow::Result<()> { self.builder.append_slice(offsets); self.flush() } - fn write_slice_indexes(&mut self, offsets: Offsets<'_>, indexes: impl Iterator) -> anyhow::Result<()> { + fn write_slice_indexes( + &mut self, + offsets: Offsets<'_>, + indexes: impl Iterator + ) -> anyhow::Result<()> { self.builder.append_slice_indexes(offsets, indexes); self.flush() } @@ -96,4 +90,4 @@ impl OffsetsWriter for OffsetPageWriter

{ self.builder.append_len(len); self.flush() } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/page.rs b/crates/storage/src/table/write/page.rs index e17b93b9..941b238b 100644 --- a/crates/storage/src/table/write/page.rs +++ b/crates/storage/src/table/write/page.rs @@ -1,13 +1,11 @@ -use crate::kv::KvWrite; -use crate::table::key::TableKeyFactory; use arrow_buffer::ToByteSlice; +use crate::{kv::KvWrite, table::key::TableKeyFactory}; pub trait PageWriter { fn write_page(&mut self, item_count: usize, bytes: &[u8]) -> anyhow::Result<()>; } - pub struct BufferPageWriter { storage: S, key: TableKeyFactory, @@ -15,7 +13,6 @@ pub struct BufferPageWriter { page_offsets: Vec } - impl BufferPageWriter { pub fn new(storage: S, key: TableKeyFactory, buffer_index: usize) -> Self { Self { @@ -29,11 +26,11 @@ impl BufferPageWriter { pub fn num_pages(&self) -> usize { self.page_offsets.len() - 1 } - + pub fn num_items(&self) -> usize { self.page_offsets.last().copied().unwrap() as usize } - + pub fn finish(mut self) -> anyhow::Result { let key = self.key.offsets(self.buffer_index); self.storage.put(key, self.page_offsets.to_byte_slice())?; @@ -41,7 +38,6 @@ impl BufferPageWriter { } } - impl PageWriter for BufferPageWriter { fn write_page(&mut self, item_count: usize, bytes: &[u8]) -> anyhow::Result<()> { let key = self.key.page(self.buffer_index, self.num_pages()); @@ -49,4 +45,4 @@ impl PageWriter for BufferPageWriter { self.page_offsets.push((self.num_items() + item_count) as u32); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/storage_cell.rs b/crates/storage/src/table/write/storage_cell.rs index b998b297..194165cf 100644 --- a/crates/storage/src/table/write/storage_cell.rs +++ b/crates/storage/src/table/write/storage_cell.rs @@ -1,13 +1,11 @@ -use crate::kv::KvWrite; -use std::cell::RefCell; -use std::rc::Rc; +use std::{cell::RefCell, rc::Rc}; +use crate::kv::KvWrite; pub struct StorageCell { inner: Rc> } - impl Clone for StorageCell { fn clone(&self) -> Self { Self { @@ -16,7 +14,6 @@ impl Clone for StorageCell { } } - impl StorageCell { pub fn new(inner: S) -> Self { Self { @@ -27,7 +24,7 @@ impl StorageCell { pub fn put(&self, key: &[u8], value: &[u8]) -> anyhow::Result<()> { (&self.inner).borrow_mut().put(key, value) } - + pub fn into_inner(self) -> S { Rc::into_inner(self.inner) .expect("storage is still in use") @@ -35,9 +32,8 @@ impl StorageCell { } } - impl KvWrite for StorageCell { fn put(&mut self, key: &[u8], value: &[u8]) -> anyhow::Result<()> { (&*self).put(key, value) } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/storage_writer.rs b/crates/storage/src/table/write/storage_writer.rs index 03dd62f0..5bca85b4 100644 --- a/crates/storage/src/table/write/storage_writer.rs +++ b/crates/storage/src/table/write/storage_writer.rs @@ -1,20 +1,18 @@ -use super::bitmask::BitmaskPageWriter; -use super::native::NativePageWriter; -use super::nullmask::NullmaskPageWriter; -use super::offsets::OffsetPageWriter; -use super::page::BufferPageWriter; -use crate::kv::KvWrite; -use crate::table::key::TableKeyFactory; +use std::marker::PhantomData; + use arrow_buffer::ArrowNativeType; use sqd_array::writer::{Writer, WriterFactory}; -use std::marker::PhantomData; +use super::{ + bitmask::BitmaskPageWriter, native::NativePageWriter, nullmask::NullmaskPageWriter, offsets::OffsetPageWriter, + page::BufferPageWriter +}; +use crate::{kv::KvWrite, table::key::TableKeyFactory}; pub struct StorageWriter { phantom_data: PhantomData } - impl Writer for StorageWriter { type Bitmask = BitmaskPageWriter>; type Nullmask = NullmaskPageWriter>; @@ -22,21 +20,15 @@ impl Writer for StorageWriter { type Offset = OffsetPageWriter>; } - pub(super) struct StorageWriterFactory { storage: S, key: TableKeyFactory, pos: usize } - impl StorageWriterFactory { pub fn new(storage: S, key: TableKeyFactory) -> Self { - Self { - storage, - key, - pos: 0 - } + Self { storage, key, pos: 0 } } fn next_buffer(&mut self) -> BufferPageWriter { @@ -46,8 +38,7 @@ impl StorageWriterFactory { } } - -impl WriterFactory for StorageWriterFactory{ +impl WriterFactory for StorageWriterFactory { type Writer = StorageWriter; fn nullmask(&mut self) -> anyhow::Result<::Nullmask> { @@ -69,4 +60,4 @@ impl WriterFactory for StorageWriterFactory{ let buf = self.next_buffer(); Ok(OffsetPageWriter::new(buf)) } -} \ No newline at end of file +} diff --git a/crates/storage/src/table/write/table.rs b/crates/storage/src/table/write/table.rs index 8f81dd7e..1746d57d 100644 --- a/crates/storage/src/table/write/table.rs +++ b/crates/storage/src/table/write/table.rs @@ -1,10 +1,8 @@ -use super::storage_writer::{StorageWriter, StorageWriterFactory}; -use crate::kv::KvWrite; -use crate::table::key::TableKeyFactory; -use arrow::datatypes::SchemaRef; -use arrow::ipc::convert::IpcSchemaEncoder; +use arrow::{datatypes::SchemaRef, ipc::convert::IpcSchemaEncoder}; use sqd_array::writer::{AnyArrayWriter, AnyWriter, ArrayWriter, Writer}; +use super::storage_writer::{StorageWriter, StorageWriterFactory}; +use crate::{kv::KvWrite, table::key::TableKeyFactory}; pub struct TableWriter { storage: S, @@ -13,7 +11,6 @@ pub struct TableWriter { writer: AnyArrayWriter> } - impl TableWriter { pub fn new(storage: S, table_name: &[u8], schema: SchemaRef) -> Self { let key = TableKeyFactory::new(table_name); @@ -28,28 +25,27 @@ impl TableWriter { } } - -impl TableWriter { +impl TableWriter { pub fn finish(mut self) -> anyhow::Result { for buf in self.writer.into_inner() { match buf { AnyWriter::Bitmask(writer) => writer.finish(), AnyWriter::Nullmask(writer) => writer.finish(), AnyWriter::Native(writer) => writer.finish(), - AnyWriter::Offsets(writer) => writer.finish(), - }?.finish()?; + AnyWriter::Offsets(writer) => writer.finish() + }? + .finish()?; } self.storage.put( self.key.schema(), IpcSchemaEncoder::new().schema_to_fb(&self.schema).finished_data() )?; - + Ok(self.storage) } } - impl ArrayWriter for TableWriter { type Writer = StorageWriter; @@ -72,4 +68,4 @@ impl ArrayWriter for TableWriter { fn offset(&mut self, buf: usize) -> &mut ::Offset { self.writer.offset(buf) } -} \ No newline at end of file +} diff --git a/crates/storage/tests/arb_array.rs b/crates/storage/tests/arb_array.rs index 25af02c5..4acfe1af 100644 --- a/crates/storage/tests/arb_array.rs +++ b/crates/storage/tests/arb_array.rs @@ -1,25 +1,28 @@ -use arrow::array::{Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, StringArray, StructArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array}; -use arrow_buffer::BooleanBuffer; -use arrow::datatypes::{DataType, Field}; -use proptest::collection::SizeRange; -use proptest::prelude::*; -use proptest::string::string_regex; use std::sync::Arc; -use arrow::datatypes::Int32Type; +use arrow::{ + array::{ + Array, ArrayRef, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int16Array, Int32Array, Int64Array, + Int8Array, ListArray, StringArray, StructArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array + }, + datatypes::{DataType, Field, Int32Type} +}; +use arrow_buffer::BooleanBuffer; +use proptest::{collection::SizeRange, prelude::*, string::string_regex}; -pub fn bitmask(len: impl Into) -> impl Strategy { +pub fn bitmask(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(BooleanBuffer::from) } -pub fn boolean(len: impl Into) -> impl Strategy { +pub fn boolean(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = BooleanArray::from(vec); Arc::new(array) as ArrayRef }) } -pub fn binary(len: impl Into + Clone) -> impl Strategy { +pub fn binary(len: impl Into + Clone) -> impl Strategy { prop::collection::vec(prop::collection::vec(any::(), len.clone()), len).prop_map(|vecs| { let res = vecs.iter().map(Vec::as_slice).collect(); let array = BinaryArray::from_vec(res); @@ -27,16 +30,15 @@ pub fn binary(len: impl Into + Clone) -> impl Strategy + Clone, size: usize) -> impl Strategy { - prop::collection::vec(prop::collection::vec(any::(), size), len.clone()) - .prop_map(move |vecs| { - let iter = vecs.into_iter().map(Some); - let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(iter, size as i32).unwrap(); - Arc::new(array) as ArrayRef - }) +pub fn fixed_size_binary(len: impl Into + Clone, size: usize) -> impl Strategy { + prop::collection::vec(prop::collection::vec(any::(), size), len.clone()).prop_map(move |vecs| { + let iter = vecs.into_iter().map(Some); + let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(iter, size as i32).unwrap(); + Arc::new(array) as ArrayRef + }) } -pub fn list(len: impl Into + Clone) -> impl Strategy { +pub fn list(len: impl Into + Clone) -> impl Strategy { prop::collection::vec(prop::collection::vec(any::>(), len.clone()), len).prop_map(|vecs| { let res = vecs.iter().map(|x| Some(x.clone())).collect::>(); let array = ListArray::from_iter_primitive::(res); @@ -51,85 +53,81 @@ pub fn string(len: impl Into) -> impl Strategy { }) } -pub fn uint8(len: impl Into) -> impl Strategy { +pub fn uint8(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = UInt8Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn uint16(len: impl Into) -> impl Strategy { +pub fn uint16(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = UInt16Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn uint32(len: impl Into) -> impl Strategy { +pub fn uint32(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = UInt32Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn uint64(len: impl Into) -> impl Strategy { +pub fn uint64(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = UInt64Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn int8(len: impl Into) -> impl Strategy { +pub fn int8(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = Int8Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn int16(len: impl Into) -> impl Strategy { +pub fn int16(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = Int16Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn int32(len: impl Into) -> impl Strategy { +pub fn int32(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = Int32Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn int64(len: impl Into) -> impl Strategy { +pub fn int64(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = Int64Array::from(vec); Arc::new(array) as ArrayRef }) } -pub fn timestamp(len: impl Into) -> impl Strategy { +pub fn timestamp(len: impl Into) -> impl Strategy { prop::collection::vec(any::(), len).prop_map(|vec| { let array = TimestampSecondArray::from(vec); Arc::new(array) as ArrayRef }) } - pub fn structs(len: impl Into) -> impl Strategy { prop::collection::vec(string_regex("\\PC*").unwrap(), len).prop_map(|strings| { let arrow_strings = Arc::new(StringArray::from(strings)); - let array = StructArray::from(vec![ - ( - Arc::new(Field::new("s", DataType::Utf8, false)), - arrow_strings as ArrayRef, - ), - ]); + let array = StructArray::from(vec![( + Arc::new(Field::new("s", DataType::Utf8, false)), + arrow_strings as ArrayRef + )]); Arc::new(array) as ArrayRef }) } - -pub fn with_nullmask(array: impl Strategy) -> impl Strategy { +pub fn with_nullmask(array: impl Strategy) -> impl Strategy { array.prop_flat_map(|arr| { bitmask(arr.len()).prop_map(move |nulls| { arr.to_data() @@ -140,4 +138,4 @@ pub fn with_nullmask(array: impl Strategy) -> impl Strategy anyhow::Result { db.perform_dataset_compaction(dataset_id, Some(100), Some(1.25), None) } - #[test] fn small_chunks_test() { let (db, dataset_id) = setup_db(); @@ -32,21 +32,21 @@ fn small_chunks_test() { parent_block_hash: "base".to_owned(), first_block_time: Some(5), last_block_time: None, - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 1, last_block: 1, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 2, last_block: 2, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk4 = Chunk::V1 { first_block: 3, @@ -55,7 +55,7 @@ fn small_chunks_test() { parent_block_hash: "last_3".to_owned(), first_block_time: None, last_block_time: Some(10), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk4).is_ok()); @@ -63,11 +63,7 @@ fn small_chunks_test() { assert!(db.insert_chunk(dataset_id, &chunk2).is_ok()); assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); - validate_chunks( - &db, - dataset_id, - [&chunk1, &chunk2, &chunk3, &chunk4].to_vec(), - ); + validate_chunks(&db, dataset_id, [&chunk1, &chunk2, &chunk3, &chunk4].to_vec()); compact(&db, dataset_id).unwrap(); @@ -78,7 +74,7 @@ fn small_chunks_test() { parent_block_hash: "base".to_owned(), first_block_time: Some(5), last_block_time: Some(10), - tables: Default::default(), + tables: Default::default() }; validate_chunks(&db, dataset_id, [&compacted].to_vec()); } @@ -96,7 +92,7 @@ fn compaction_wo_tables_test() { last_block: i, last_block_hash: last_hash, parent_block_hash: base_hash, - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk).is_ok()); chunks.push(chunk); @@ -115,7 +111,7 @@ fn compaction_wo_tables_test() { parent_block_hash: "last_0".to_owned(), first_block_time: None, last_block_time: None, - tables: Default::default(), + tables: Default::default() }; let chungus2 = Chunk::V1 { first_block: 100, @@ -124,7 +120,7 @@ fn compaction_wo_tables_test() { parent_block_hash: "last_100".to_owned(), first_block_time: None, last_block_time: None, - tables: Default::default(), + tables: Default::default() }; validate_chunks(&db, dataset_id, [&chungus1, &chungus2].to_vec()); } @@ -135,7 +131,7 @@ fn universal_compaction_test( type_b: DataType, do_sort: bool, n_blocks: usize, - n_compactions: usize, + n_compactions: usize ) { let (db, dataset_id) = setup_db(); let _sb = use_small_buffers(); @@ -159,10 +155,7 @@ fn universal_compaction_test( let chunk_data = chunkify_data(global_data, do_sort); validate_chunks(&db, dataset_id, chunks.iter().collect()); for _ in 0..n_compactions { - assert!(matches!( - compact(&db, dataset_id).unwrap(), - CompactionStatus::Ok(_) - )); + assert!(matches!(compact(&db, dataset_id).unwrap(), CompactionStatus::Ok(_))); } assert!(matches!( compact(&db, dataset_id).unwrap(), @@ -240,10 +233,7 @@ fn compaction_plan_test_execution( let type_b = DataType::Int32; let do_sort = false; - let total_blocks = block_sizes - .iter() - .map(|v| v.iter().sum::()) - .sum::(); + let total_blocks = block_sizes.iter().map(|v| v.iter().sum::()).sum::(); let static_data = vec![ (0..total_blocks as u16).collect::>(), (0..total_blocks as u16).collect::>(), @@ -264,16 +254,12 @@ fn compaction_plan_test_execution( } else { Arc::clone(&schema_b) }; - let (chunk, data) = make_irregular_block( - &static_data, - total_offset, - total_offset + *size, - local_schema, - &db, - ); + let (chunk, data) = + make_irregular_block(&static_data, total_offset, total_offset + *size, local_schema, &db); assert!(db.insert_chunk(dataset_id, &chunk).is_ok()); if compact_on_each_insert { - db.perform_dataset_compaction(dataset_id, Some(max_chunk_size), Some(wa_limit), None).unwrap(); + db.perform_dataset_compaction(dataset_id, Some(max_chunk_size), Some(wa_limit), None) + .unwrap(); } chunks.push(chunk); global_data.extend(data); diff --git a/crates/storage/tests/database_ops.rs b/crates/storage/tests/database_ops.rs index febff1d3..357c0472 100644 --- a/crates/storage/tests/database_ops.rs +++ b/crates/storage/tests/database_ops.rs @@ -1,13 +1,18 @@ use core::{assert, assert_eq}; -use std::collections::{BTreeMap, HashSet}; -use std::sync::Arc; - -use arrow::array::{RecordBatch, UInt32Array}; -use arrow::datatypes::{DataType, Field, Schema}; -use sqd_primitives::sid::SID; -use sqd_primitives::BlockRef; -use sqd_storage::db::{Chunk, DatabaseSettings, DatasetId, DatasetKind}; -use sqd_storage::table::write::use_small_buffers; +use std::{ + collections::{BTreeMap, HashSet}, + sync::Arc +}; + +use arrow::{ + array::{RecordBatch, UInt32Array}, + datatypes::{DataType, Field, Schema} +}; +use sqd_primitives::{sid::SID, BlockRef}; +use sqd_storage::{ + db::{Chunk, DatabaseSettings, DatasetId, DatasetKind}, + table::write::use_small_buffers +}; mod utils; use utils::{setup_db, validate_chunks}; @@ -30,9 +35,7 @@ fn create_dataset() { assert!(res); let res = db.create_dataset(dataset_id, dataset_kind).is_err(); assert!(res); - let res = db - .create_dataset_if_not_exists(dataset_id, dataset_kind) - .is_ok(); + let res = db.create_dataset_if_not_exists(dataset_id, dataset_kind).is_ok(); assert!(res); let datasets = db.get_all_datasets().unwrap(); assert_eq!(datasets.len(), 1); @@ -49,21 +52,21 @@ fn basic_chunks_test() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); @@ -82,21 +85,21 @@ fn basic_chunks_test_rev() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk3).is_ok()); @@ -115,21 +118,21 @@ fn basic_chunks_bad_hash() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "BAD".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); @@ -148,21 +151,21 @@ fn basic_chunks_bad_range() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 99, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); @@ -181,28 +184,28 @@ fn basic_fork_test() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; let fork = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "fork_hash".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); @@ -217,11 +220,7 @@ fn basic_fork_test() { fn delete_chunks() { let (db, dataset_id) = setup_db(); - let schema = Arc::new(Schema::new(vec![Field::new( - "data", - DataType::UInt32, - true, - )])); + let schema = Arc::new(Schema::new(vec![Field::new("data", DataType::UInt32, true)])); let mut builder = db.new_table_builder(schema.clone()); @@ -237,14 +236,14 @@ fn delete_chunks() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: tables.clone(), + tables: tables.clone() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables, + tables }; // let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), tables: Default::default() }; @@ -252,15 +251,11 @@ fn delete_chunks() { assert!(db.insert_chunk(dataset_id, &chunk2).is_ok()); validate_chunks(&db, dataset_id, [&chunk1, &chunk2].to_vec()); - assert!(db - .update_dataset(dataset_id, |tx| { tx.delete_chunk(&chunk2) }) - .is_ok()); + assert!(db.update_dataset(dataset_id, |tx| { tx.delete_chunk(&chunk2) }).is_ok()); validate_chunks(&db, dataset_id, [&chunk1].to_vec()); - assert!(db - .update_dataset(dataset_id, |tx| { tx.delete_chunk(&chunk1) }) - .is_ok()); + assert!(db.update_dataset(dataset_id, |tx| { tx.delete_chunk(&chunk1) }).is_ok()); validate_chunks(&db, dataset_id, [].to_vec()); @@ -291,14 +286,14 @@ fn chunk_reader() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: tables.clone(), + tables: tables.clone() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables, + tables }; // let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), tables: Default::default() }; @@ -358,21 +353,21 @@ fn labels() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "BAD".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk3 = Chunk::V0 { first_block: 201, last_block: 300, last_block_hash: "last_3".to_owned(), parent_block_hash: "last_2".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); @@ -381,7 +376,7 @@ fn labels() { let finalized_head = BlockRef { number: 300, - hash: "last_3".to_owned(), + hash: "last_3".to_owned() }; assert!(db @@ -407,14 +402,14 @@ fn delete_dataset() { last_block: 100, last_block_hash: "last_1".to_owned(), parent_block_hash: "base".to_owned(), - tables: Default::default(), + tables: Default::default() }; let chunk2 = Chunk::V0 { first_block: 101, last_block: 200, last_block_hash: "last_2".to_owned(), parent_block_hash: "last_1".to_owned(), - tables: Default::default(), + tables: Default::default() }; assert!(db.insert_chunk(dataset_id, &chunk1).is_ok()); diff --git a/crates/storage/tests/utils.rs b/crates/storage/tests/utils.rs index c36cf4b6..adc72aa7 100644 --- a/crates/storage/tests/utils.rs +++ b/crates/storage/tests/utils.rs @@ -1,12 +1,11 @@ -use arrow::array::{ - ArrayRef, AsArray, Float32Array, Int32Array, RecordBatch, UInt16Array, UInt32Array, +use std::{collections::BTreeMap, sync::Arc}; + +use arrow::{ + array::{ArrayRef, AsArray, Float32Array, Int32Array, RecordBatch, UInt16Array, UInt32Array}, + datatypes::{DataType, Field, Schema, UInt32Type} }; -use arrow::datatypes::{DataType, Field, Schema, UInt32Type}; use sqd_array::schema_metadata::set_sort_key; use sqd_storage::db::{Chunk, Database, DatabaseSettings, DatasetId, DatasetKind, ReadSnapshot}; -use std::collections::BTreeMap; -use std::sync::Arc; - pub fn setup_db() -> (Database, DatasetId) { let db_dir = tempfile::tempdir().unwrap(); @@ -55,17 +54,14 @@ pub fn read_chunk(snapshot: &ReadSnapshot, chunk: Chunk) -> Vec<(u32, u32)> { let keys = tmp_k.as_primitive::().values().to_vec(); let tmp_v = table_reader.read_column(1, None).unwrap(); let vals = tmp_v.as_primitive::().values().to_vec(); - keys.iter() - .zip(vals) - .map(|(a, b)| (*a, b)) - .collect::>() + keys.iter().zip(vals).map(|(a, b)| (*a, b)).collect::>() } pub fn make_block( static_data: &Vec>, idx: usize, local_schema: Arc, - db: &Database, + db: &Database ) -> (Chunk, Vec<(u32, u32)>) { make_irregular_block(static_data, idx * 10, (idx + 1) * 10, local_schema, db) } @@ -75,7 +71,7 @@ pub fn make_irregular_block( start: usize, end: usize, local_schema: Arc, - db: &Database, + db: &Database ) -> (Chunk, Vec<(u32, u32)>) { let base_hash = format!("last_{}", start); let last_hash = format!("last_{}", end); @@ -95,53 +91,29 @@ pub fn make_irregular_block( let array_1 = match type_1 { DataType::UInt32 => Arc::new(UInt32Array::from( - vec_1 - .clone() - .into_iter() - .map(Into::into) - .collect::>(), + vec_1.clone().into_iter().map(Into::into).collect::>() )) as ArrayRef, DataType::Int32 => Arc::new(Int32Array::from( - vec_1 - .clone() - .into_iter() - .map(Into::into) - .collect::>(), + vec_1.clone().into_iter().map(Into::into).collect::>() )) as ArrayRef, DataType::UInt16 => Arc::new(UInt16Array::from(vec_1.clone())) as ArrayRef, DataType::Float32 => Arc::new(Float32Array::from( - vec_1 - .clone() - .into_iter() - .map(Into::into) - .collect::>(), + vec_1.clone().into_iter().map(Into::into).collect::>() )) as ArrayRef, - _ => todo!(), + _ => todo!() }; let array_2 = match type_2 { DataType::UInt32 => Arc::new(UInt32Array::from( - vec_2 - .clone() - .into_iter() - .map(Into::into) - .collect::>(), + vec_2.clone().into_iter().map(Into::into).collect::>() )) as ArrayRef, DataType::Int32 => Arc::new(Int32Array::from( - vec_2 - .clone() - .into_iter() - .map(Into::into) - .collect::>(), + vec_2.clone().into_iter().map(Into::into).collect::>() )) as ArrayRef, DataType::UInt16 => Arc::new(UInt16Array::from(vec_2.clone())) as ArrayRef, DataType::Float32 => Arc::new(Float32Array::from( - vec_2 - .clone() - .into_iter() - .map(Into::into) - .collect::>(), + vec_2.clone().into_iter().map(Into::into).collect::>() )) as ArrayRef, - _ => todo!(), + _ => todo!() }; let batch = RecordBatch::try_new(local_schema, vec![array_1, array_2]).unwrap(); assert!(builder.write_record_batch(&batch).is_ok()); @@ -154,13 +126,9 @@ pub fn make_irregular_block( last_block: (end - 1) as u64, last_block_hash: last_hash, parent_block_hash: base_hash, - tables, + tables }; - let data = vec_1 - .iter() - .zip(vec_2) - .map(|(a, b)| (*a as u32, b as u32)) - .collect(); + let data = vec_1.iter().zip(vec_2).map(|(a, b)| (*a as u32, b as u32)).collect(); (chunk, data) } @@ -171,7 +139,7 @@ pub fn make_schema(type_1: DataType, type_2: DataType, is_sorted: bool) -> Arc set_sort_key(schema, &[0]), - false => schema, + false => schema } } @@ -185,4 +153,4 @@ pub fn chunkify_data(data: Vec<(u32, u32)>, do_sort: bool) -> Vec RecordBatch { - let schema = Schema::new(vec![ - Field::new("c0", array.data_type().clone(), true) - ]); + let schema = Schema::new(vec![Field::new("c0", array.data_type().clone(), true)]); RecordBatch::try_new(Arc::new(schema), vec![array]).unwrap() } @@ -71,7 +77,7 @@ fn check_write_read(db: &Database, batches: Vec, stats_type: bool) Ok(()) } -fn test_write_read(array: impl Strategy) -> anyhow::Result<()> { +fn test_write_read(array: impl Strategy) -> anyhow::Result<()> { let db_dir = tempfile::tempdir()?; let db = DatabaseSettings::default().open(db_dir.path())?; let _sg = use_small_buffers(); @@ -140,8 +146,16 @@ fn binary_write_read() { fn fixed_size_binary_write_read() { // Strategy returned from `fixed_size_binary` within a single call to `test_write_read` has to generate // arrays with the same sizes so that they can be concatenated into a single RecordBatch. - test_write_read(arb_array::with_nullmask(arb_array::fixed_size_binary(0..WRITE_READ_ARRAY_SIZE, 12))).unwrap(); - test_write_read(arb_array::with_nullmask(arb_array::fixed_size_binary(0..WRITE_READ_ARRAY_SIZE, 128))).unwrap(); + test_write_read(arb_array::with_nullmask(arb_array::fixed_size_binary( + 0..WRITE_READ_ARRAY_SIZE, + 12 + ))) + .unwrap(); + test_write_read(arb_array::with_nullmask(arb_array::fixed_size_binary( + 0..WRITE_READ_ARRAY_SIZE, + 128 + ))) + .unwrap(); } #[test] diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 00000000..0eb83a6e --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,5 @@ +max_width = 120 +group_imports = "StdExternalCrate" +trailing_comma = "Never" +imports_granularity = "Crate" +