diff --git a/Cargo.lock b/Cargo.lock index ae5064bd95d23..defd15ff2fc26 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -144,12 +144,6 @@ version = "0.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9d4ee0d472d1cd2e28c97dfa124b3d8d992e10eb0a035f33f5d12e3a177ba3b" -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -220,12 +214,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.98" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" -dependencies = [ - "backtrace", -] +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "apache-avro" @@ -356,24 +347,45 @@ dependencies = [ [[package]] name = "arrow" -version = "56.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" dependencies = [ - "arrow-arith 56.2.0", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-csv 56.2.0", - "arrow-data 56.2.0", - "arrow-ipc 56.2.0", - "arrow-json 56.2.0", - "arrow-ord 56.2.0", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-csv 57.3.0", + "arrow-data 57.3.0", + "arrow-ipc 57.3.0", + "arrow-json 57.3.0", + "arrow-ord 57.3.0", + "arrow-row 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "arrow-string 57.3.0", +] + +[[package]] +name = "arrow" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" +dependencies = [ + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-csv 58.1.0", + "arrow-data 58.1.0", + "arrow-ipc 58.1.0", + "arrow-json 58.1.0", + "arrow-ord 58.1.0", "arrow-pyarrow", - "arrow-row 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", - "arrow-string 56.2.0", + "arrow-row 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "arrow-string 58.1.0", ] [[package]] @@ -404,6 +416,34 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-arith" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-arith" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "chrono", + "num-traits", +] + [[package]] name = "arrow-array" version = "55.1.0" @@ -432,12 +472,49 @@ dependencies = [ "arrow-data 56.2.0", "arrow-schema 56.2.0", "chrono", - "chrono-tz 0.10.3", "half", - "hashbrown 0.16.0", + "hashbrown 0.16.1", "num", ] +[[package]] +name = "arrow-array" +version = "57.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "chrono-tz 0.10.3", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "chrono", + "chrono-tz 0.10.3", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-buffer" version = "55.1.0" @@ -460,6 +537,30 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + [[package]] name = "arrow-cast" version = "55.1.0" @@ -495,13 +596,56 @@ dependencies = [ "atoi", "base64 0.22.1", "chrono", - "comfy-table", "half", "lexical-core", "num", "ryu", ] +[[package]] +name = "arrow-cast" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + [[package]] name = "arrow-csv" version = "55.1.0" @@ -520,13 +664,28 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "56.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" dependencies = [ - "arrow-array 56.2.0", - "arrow-cast 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 57.3.0", + "arrow-cast 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-csv" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe" +dependencies = [ + "arrow-array 58.1.0", + "arrow-cast 58.1.0", + "arrow-schema 58.1.0", "chrono", "csv", "csv-core", @@ -557,6 +716,32 @@ dependencies = [ "num", ] 
+[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-data" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" +dependencies = [ + "arrow-buffer 58.1.0", + "arrow-schema 58.1.0", + "half", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-flight" version = "56.2.0" @@ -579,11 +764,39 @@ dependencies = [ "futures", "once_cell", "paste", - "prost", - "prost-types", + "prost 0.13.5", + "prost-types 0.13.5", "tonic 0.13.1", ] +[[package]] +name = "arrow-flight" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "302b2e036335f3f04d65dad3f74ff1f2aae6dc671d6aa04dc6b61193761e16fb" +dependencies = [ + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-ipc 58.1.0", + "arrow-ord 58.1.0", + "arrow-row 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "arrow-string 58.1.0", + "base64 0.22.1", + "bytes", + "futures", + "once_cell", + "paste", + "prost 0.14.3", + "prost-types 0.14.3", + "tonic 0.14.5", + "tonic-prost", +] + [[package]] name = "arrow-ipc" version = "55.1.0" @@ -609,7 +822,36 @@ dependencies = [ "arrow-schema 56.2.0", "arrow-select 56.2.0", "flatbuffers", - "lz4_flex", +] + +[[package]] +name = "arrow-ipc" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "flatbuffers", + "lz4_flex 0.12.1", +] + +[[package]] +name = "arrow-ipc" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "flatbuffers", + "lz4_flex 0.13.0", "zstd 0.13.3", ] @@ -626,7 +868,7 @@ dependencies = [ "arrow-schema 55.1.0", "chrono", "half", - "indexmap 2.12.0", + "indexmap 2.14.0", "lexical-core", "memchr", "num", @@ -637,22 +879,48 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" dependencies = [ - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", "chrono", "half", - "indexmap 2.12.0", + "indexmap 2.14.0", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-json" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8" 
+dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "chrono", + "half", + "indexmap 2.14.0", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] @@ -683,15 +951,41 @@ dependencies = [ "arrow-select 56.2.0", ] +[[package]] +name = "arrow-ord" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", +] + +[[package]] +name = "arrow-ord" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", +] + [[package]] name = "arrow-pyarrow" -version = "56.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" +checksum = "e63351dc11981a316c828a6032a5021345bba882f68bc4a36c36825a50725089" dependencies = [ - "arrow-array 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "pyo3", ] @@ -721,6 +1015,32 @@ dependencies = [ "half", ] +[[package]] +name = "arrow-row" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "half", +] + +[[package]] +name = "arrow-row" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "half", +] + [[package]] name = "arrow-schema" version = "55.1.0" @@ -736,9 +1056,22 @@ name = "arrow-schema" version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" + +[[package]] +name = "arrow-schema" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" + +[[package]] +name = "arrow-schema" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" dependencies = [ "bitflags 2.9.0", "serde", + "serde_core", ] [[package]] @@ -769,6 +1102,34 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-select" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash 0.8.12", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "num-traits", +] + +[[package]] +name = "arrow-select" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" +dependencies = [ + "ahash 0.8.12", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "num-traits", +] + [[package]] name = "arrow-string" version = "55.1.0" @@ -803,17 +1164,51 @@ dependencies = [ "regex-syntax 0.8.5", ] +[[package]] +name = "arrow-string" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "memchr", + "num-traits", + "regex", + "regex-syntax 0.8.5", +] + +[[package]] +name = "arrow-string" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "memchr", + "num-traits", + "regex", + "regex-syntax 0.8.5", +] + [[package]] name = "arrow-udf-runtime" version = "0.8.0" -source = "git+https://github.com/datafuse-extras/arrow-udf.git?rev=2480dccf1#2480dccf1bad1a88d39a7c084ed7d54685e93735" +source = "git+https://github.com/SkyFan2002/arrow-udf.git?rev=ec0c74d1bb53d68243c2bb23ebaa9392223e094c#ec0c74d1bb53d68243c2bb23ebaa9392223e094c" dependencies = [ "anyhow", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "async-trait", "atomic-time", "base64 0.22.1", @@ -1654,8 +2049,8 @@ checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1" name = "bendpy" version = "0.1.0" dependencies = [ - "arrow 56.2.0", - "arrow-schema 56.2.0", + "arrow 58.1.0", + "arrow-schema 58.1.0", "ctor", "databend-common-base", "databend-common-catalog", @@ -2014,7 +2409,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" dependencies = [ "memchr", - "regex-automata 0.4.9", + "regex-automata", "serde", ] @@ -2103,9 +2498,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" dependencies = [ "serde", ] @@ -2349,17 +2744,16 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.41" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ - "android-tzdata", "iana-time-zone", "js-sys", "num-traits", "serde", "wasm-bindgen", - "windows-link 0.1.1", + "windows-link 0.2.1", ] [[package]] @@ -3338,7 +3732,7 @@ dependencies = [ "http 1.3.1", "log", "logforth", - "opendal", + "opendal 0.54.1", "tokio", "toml 0.8.22", ] @@ -3362,7 +3756,7 @@ dependencies = [ "databend-storages-common-table-meta", "limits-rs", "log", - 
"opendal", + "opendal 0.54.1", "serde", "serde_json", "serfig", @@ -3496,7 +3890,7 @@ name = "databend-common-catalog" version = "0.1.0" dependencies = [ "anyhow", - "arrow-schema 56.2.0", + "arrow-schema 58.1.0", "async-backtrace", "async-trait", "chrono", @@ -3526,13 +3920,12 @@ dependencies = [ "log", "maplit", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "rand 0.8.5", "roaring 0.10.12", "serde", "serde_json", "sha2", - "thrift", "tokio", "typetag", "uuid", @@ -3549,13 +3942,14 @@ dependencies = [ "databend-common-exception", "hyper-util", "lenient_semver", - "prost", - "prost-build", + "prost 0.14.3", + "prost-build 0.14.3", "semver", "serde", "tokio", - "tonic 0.13.1", - "tonic-build", + "tonic 0.14.5", + "tonic-prost", + "tonic-prost-build", "tower 0.5.2", ] @@ -3563,9 +3957,9 @@ dependencies = [ name = "databend-common-column" version = "0.1.0" dependencies = [ - "arrow-buffer 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "borsh", "bytemuck", "databend-common-base", @@ -3641,8 +4035,8 @@ name = "databend-common-exception" version = "0.1.0" dependencies = [ "anyhow", - "arrow-flight", - "arrow-schema 56.2.0", + "arrow-flight 58.1.0", + "arrow-schema 58.1.0", "backtrace", "bincode 2.0.1", "cidr", @@ -3655,10 +4049,10 @@ dependencies = [ "libc", "object", "once_cell", - "opendal", - "parquet 56.2.0", + "opendal 0.54.1", + "parquet 58.1.0", "paste", - "prost", + "prost 0.14.3", "redis", "reqwest", "rustc-demangle", @@ -3668,7 +4062,7 @@ dependencies = [ "tantivy", "thiserror 1.0.69", "tokio", - "tonic 0.13.1", + "tonic 0.14.5", ] [[package]] @@ -3676,14 +4070,14 @@ name = "databend-common-expression" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-flight", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-flight 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "async-backtrace", "base64 0.22.1", "borsh", @@ -3735,7 +4129,7 @@ dependencies = [ "serde_json", "strength_reduce", "terminal_size", - "tonic 0.13.1", + "tonic 0.14.5", "typetag", "unicode-segmentation", ] @@ -3861,7 +4255,7 @@ dependencies = [ "semver", "thiserror 1.0.69", "tokio", - "tonic 0.13.1", + "tonic 0.14.5", "tower-service", ] @@ -3968,7 +4362,7 @@ dependencies = [ "fastrace", "futures", "log", - "prost", + "prost 0.14.3", "seq-marked", "serde", "serde_json", @@ -3997,7 +4391,7 @@ dependencies = [ "log", "logcall", "maplit", - "prost", + "prost 0.14.3", "rand 0.8.5", "seq-marked", "serde", @@ -4005,6 +4399,7 @@ dependencies = [ "thiserror 1.0.69", "tokio", "tonic 0.13.1", + "tonic 0.14.5", "uuid", "zstd 0.12.4", ] @@ -4033,7 +4428,7 @@ dependencies = [ "num-derive", "num-traits", "paste", - "prost", + "prost 0.14.3", "serde", "serde_json", "sha1", @@ -4046,7 +4441,7 @@ name = "databend-common-meta-app-storage" version = "0.1.0" dependencies = [ "databend-common-exception", - "opendal", + "opendal 0.54.1", "serde", "tokio", ] @@ -4071,7 +4466,7 @@ dependencies = [ "futures", "mlua", "openraft", - "prost", + "prost 0.14.3", "reqwest", "serde", "serde_json", @@ -4094,7 +4489,7 @@ dependencies = [ "databend-meta", "databend-meta-client 260205.5.0", "openraft", - "prost", + "prost 0.14.3", "regex", "serde", "serde_json", @@ -4153,7 +4548,7 @@ dependencies = [ "tempfile", 
"tokio", "tokio-stream", - "tonic 0.13.1", + "tonic 0.14.5", ] [[package]] @@ -4184,7 +4579,7 @@ dependencies = [ "lz4", "match-template", "num", - "opendal", + "opendal 0.54.1", "rand 0.8.5", "ringbuffer", "roaring 0.10.12", @@ -4208,7 +4603,7 @@ dependencies = [ "futures", "log", "parking_lot 0.12.3", - "petgraph", + "petgraph 0.6.5", "serde", "tokio", "typetag", @@ -4219,9 +4614,9 @@ name = "databend-common-pipeline-transforms" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 56.2.0", - "arrow-ord 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-channel", "async-trait", @@ -4273,7 +4668,7 @@ dependencies = [ "maplit", "num", "pretty_assertions", - "prost", + "prost 0.14.3", "thiserror 1.0.69", ] @@ -4284,11 +4679,12 @@ dependencies = [ "lenient_semver", "num-derive", "num-traits", - "prost", - "prost-build", + "prost 0.14.3", + "prost-build 0.14.3", "semver", - "tonic 0.13.1", - "tonic-build", + "tonic 0.14.5", + "tonic-prost", + "tonic-prost-build", ] [[package]] @@ -4384,13 +4780,13 @@ dependencies = [ "fastrace", "globiter", "goldenfile", - "indexmap 2.12.0", + "indexmap 2.14.0", "itertools 0.13.0", "jsonb", "log", "num-derive", "num-traits", - "opendal", + "opendal 0.54.1", "parking_lot 0.12.3", "prqlc", "recursive", @@ -4444,7 +4840,7 @@ version = "0.1.0" dependencies = [ "ahash 0.8.12", "anyhow", - "arrow-schema 56.2.0", + "arrow-schema 58.1.0", "async-backtrace", "base64 0.22.1", "borsh", @@ -4463,8 +4859,8 @@ dependencies = [ "iceberg", "log", "lru", - "opendal", - "parquet 56.2.0", + "opendal 0.54.1", + "parquet 58.1.0", "prometheus-client 0.22.3", "regex", "reqwest", @@ -4479,7 +4875,7 @@ dependencies = [ name = "databend-common-storages-basic" version = "0.1.0" dependencies = [ - "arrow 56.2.0", + "arrow 58.1.0", "async-backtrace", "async-trait", "databend-common-base", @@ -4495,9 +4891,9 @@ dependencies = [ "databend-storages-common-blocks", "databend-storages-common-table-meta", "log", - "opendal", + "opendal 0.54.1", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "serde", "serde_json", "sha2", @@ -4510,7 +4906,7 @@ dependencies = [ name = "databend-common-storages-delta" version = "0.1.0" dependencies = [ - "arrow-schema 56.2.0", + "arrow-schema 58.1.0", "async-backtrace", "async-trait", "databend-common-base", @@ -4528,7 +4924,7 @@ dependencies = [ "fastrace", "itertools 0.13.0", "object_store_opendal", - "parquet 56.2.0", + "parquet 58.1.0", "serde", "serde_json", "tokio", @@ -4557,10 +4953,10 @@ name = "databend-common-storages-fuse" version = "0.1.0" dependencies = [ "ahash 0.8.12", - "arrow 56.2.0", - "arrow-array 56.2.0", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", + "arrow 58.1.0", + "arrow-array 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-channel", "async-trait", @@ -4605,14 +5001,14 @@ dependencies = [ "futures-util", "geo", "geo-index", - "indexmap 2.12.0", + "indexmap 2.14.0", "itertools 0.13.0", "jsonb", "log", "match-template", - "opendal", + "opendal 0.54.1", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "paste", "rand 0.8.5", "roaring 0.10.12", @@ -4657,8 +5053,8 @@ dependencies = [ "futures", "hive_metastore", "log", - "opendal", - "parquet 56.2.0", + "opendal 0.54.1", + "parquet 58.1.0", "recursive", "serde", "tokio", @@ -4670,8 +5066,8 @@ dependencies = [ name = "databend-common-storages-iceberg" version = "0.1.0" dependencies = [ - "arrow-array 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 
58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-trait", "chrono", @@ -4700,7 +5096,7 @@ dependencies = [ "iceberg-catalog-rest", "iceberg-catalog-s3tables", "log", - "parquet 56.2.0", + "parquet 58.1.0", "serde", "serde_json", "typetag", @@ -4722,8 +5118,8 @@ dependencies = [ name = "databend-common-storages-orc" version = "0.1.0" dependencies = [ - "arrow-array 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-trait", "bytes", @@ -4743,7 +5139,7 @@ dependencies = [ "futures-util", "jiff", "log", - "opendal", + "opendal 0.54.1", "orc-rust", "serde", "serde_json", @@ -4754,10 +5150,10 @@ dependencies = [ name = "databend-common-storages-parquet" version = "0.1.0" dependencies = [ - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-trait", "bytes", @@ -4780,11 +5176,10 @@ dependencies = [ "futures", "jiff", "log", - "opendal", - "parquet 56.2.0", + "opendal 0.54.1", + "parquet 58.1.0", "rand 0.8.5", "serde", - "thrift", "tokio", "typetag", ] @@ -4794,8 +5189,8 @@ name = "databend-common-storages-stage" version = "0.1.0" dependencies = [ "apache-avro 0.17.0", - "arrow-array 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-trait", "bstr", @@ -4835,9 +5230,9 @@ dependencies = [ "num-traits", "object_store", "object_store_opendal", - "opendal", + "opendal 0.54.1", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "serde", "serde_json", "simdutf8", @@ -4901,7 +5296,7 @@ dependencies = [ "jsonb", "log", "once_cell", - "opendal", + "opendal 0.54.1", "parking_lot 0.12.3", "regex", "serde", @@ -4934,8 +5329,8 @@ name = "databend-common-tracing" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-schema 58.1.0", "async-channel", "backtrace", "concurrent-queue", @@ -4951,16 +5346,16 @@ dependencies = [ "libc", "log", "logforth", - "opendal", + "opendal 0.54.1", "opentelemetry 0.29.1", "opentelemetry-otlp 0.29.0", "opentelemetry_sdk 0.29.0", - "parquet 56.2.0", + "parquet 58.1.0", "serde", "serde_json", "tokio", "toml 0.8.22", - "tonic 0.13.1", + "tonic 0.14.5", "uuid", ] @@ -5130,7 +5525,7 @@ dependencies = [ "jsonb", "jwt-simple", "log", - "opendal", + "opendal 0.54.1", "serde", "tempfile", "tokio", @@ -5315,7 +5710,7 @@ source = "git+https://github.com/databendlabs/databend-meta?tag=v260312.7.0#d2a0 dependencies = [ "anyerror", "anyhow", - "arrow-flight", + "arrow-flight 56.2.0", "async-trait", "backon", "base2histogram", @@ -5343,7 +5738,7 @@ dependencies = [ "openraft", "peel-off", "prometheus-client 0.22.3", - "prost", + "prost 0.13.5", "raft-log", "rustls 0.23.36", "seq-marked", @@ -5465,7 +5860,7 @@ version = "260205.5.0" source = "git+https://github.com/databendlabs/databend-meta?tag=v260205.5.0#b0c74ad6d4e3c2e2f8801fdb193704dc5c682d56" dependencies = [ "anyerror", - "arrow-flight", + "arrow-flight 56.2.0", "async-backtrace", "chrono", "databend-base", @@ -5483,7 +5878,7 @@ dependencies = [ "logcall", "once_cell", "parking_lot 0.12.3", - "prost", + "prost 0.13.5", "serde", "serde_json", "thiserror 1.0.69", @@ -5497,7 +5892,7 @@ version = "260312.7.0" source = "git+https://github.com/databendlabs/databend-meta?tag=v260312.7.0#d2a0e8316f0537be1ba6aa59f87f3ac052ade57d" dependencies = [ "anyerror", - "arrow-flight", + 
"arrow-flight 56.2.0", "async-backtrace", "async-trait", "chrono", @@ -5517,7 +5912,7 @@ dependencies = [ "logcall", "once_cell", "parking_lot 0.12.3", - "prost", + "prost 0.13.5", "serde", "serde_json", "thiserror 1.0.69", @@ -5674,13 +6069,13 @@ dependencies = [ "num-derive", "num-traits", "openraft", - "prost", - "prost-build", + "prost 0.13.5", + "prost-build 0.13.5", "serde", "serde_json", "state-machine-api", "tonic 0.13.1", - "tonic-build", + "tonic-build 0.13.1", ] [[package]] @@ -5817,15 +6212,15 @@ dependencies = [ "num_cpus", "openraft", "pretty_assertions", - "prost", - "prost-build", + "prost 0.13.5", + "prost-build 0.13.5", "rotbl", "serde", "serde_json", "state-machine-api", "thiserror 1.0.69", "tonic 0.13.1", - "tonic-build", + "tonic-build 0.13.1", ] [[package]] @@ -5847,7 +6242,7 @@ dependencies = [ "num_cpus", "openraft", "pretty_assertions", - "prost", + "prost 0.13.5", "rotbl", "serde", "serde_json", @@ -5884,13 +6279,14 @@ name = "databend-query" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-flight", - "arrow-ipc 56.2.0", - "arrow-json 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-csv 58.1.0", + "arrow-flight 58.1.0", + "arrow-ipc 58.1.0", + "arrow-json 58.1.0", + "arrow-schema 58.1.0", "async-backtrace", "async-channel", "async-compat", @@ -6007,20 +6403,20 @@ dependencies = [ "md-5", "mysql_async", "num_cpus", - "opendal", + "opendal 0.54.1", "opensrv-mysql", "opentelemetry 0.29.1", "opentelemetry_sdk 0.29.0", "p256", "parking_lot 0.12.3", - "parquet 56.2.0", - "petgraph", + "parquet 58.1.0", + "petgraph 0.6.5", "pin-project-lite", "poem", "pretty_assertions", "prometheus-client 0.22.3", "proptest", - "prost", + "prost 0.14.3", "rand 0.8.5", "recursive", "redis", @@ -6043,7 +6439,7 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic 0.13.1", + "tonic 0.14.5", "tower 0.5.2", "typetag", "unicase", @@ -6058,10 +6454,10 @@ name = "databend-query-script-udf-support" version = "0.1.0" dependencies = [ "anyhow", - "arrow-array 56.2.0", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", + "arrow-array 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "arrow-udf-runtime", "databend-common-base", "databend-common-cache", @@ -6181,13 +6577,13 @@ dependencies = [ name = "databend-storages-common-blocks" version = "0.1.0" dependencies = [ - "arrow-array 56.2.0", + "arrow-array 58.1.0", "bytes", "databend-common-exception", "databend-common-expression", "databend-storages-common-table-meta", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "tokio", ] @@ -6195,7 +6591,7 @@ dependencies = [ name = "databend-storages-common-cache" version = "0.1.0" dependencies = [ - "arrow 56.2.0", + "arrow 58.1.0", "async-backtrace", "async-trait", "bytes", @@ -6216,7 +6612,7 @@ dependencies = [ "log", "mockall", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "rayon", "rustix 0.38.44", "siphasher 0.3.11", @@ -6229,8 +6625,8 @@ version = "0.1.0" dependencies = [ "anyerror", "anyhow", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", "base64 0.22.1", "bincode 2.0.1", "bitvec", @@ -6262,7 +6658,7 @@ dependencies = [ "num_cpus", "ordered-float 5.1.0", "parking_lot 0.12.3", - "parquet 56.2.0", + "parquet 58.1.0", "rand 0.8.5", "rayon", "roaring 0.10.12", @@ -6293,8 +6689,8 @@ dependencies = [ "fastrace", "futures", 
"log", - "opendal", - "parquet 56.2.0", + "opendal 0.54.1", + "parquet 58.1.0", ] [[package]] @@ -6323,7 +6719,7 @@ dependencies = [ "databend-storages-common-blocks", "databend-storages-common-table-meta", "log", - "opendal", + "opendal 0.54.1", "parking_lot 0.12.3", "serde", "uuid", @@ -6333,7 +6729,7 @@ dependencies = [ name = "databend-storages-common-stage" version = "0.1.0" dependencies = [ - "arrow-array 56.2.0", + "arrow-array 58.1.0", "databend-common-ast", "databend-common-catalog", "databend-common-exception", @@ -6349,7 +6745,7 @@ dependencies = [ name = "databend-storages-common-table-meta" version = "0.1.0" dependencies = [ - "arrow 56.2.0", + "arrow 58.1.0", "bincode 1.3.3", "bytes", "chrono", @@ -6366,7 +6762,7 @@ dependencies = [ "databend-common-storage", "enum-as-inner", "log", - "parquet 56.2.0", + "parquet 58.1.0", "rmp-serde", "serde", "serde_json", @@ -6390,22 +6786,20 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "50.3.0" +version = "52.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a83760d9a13122d025fbdb1d5d5aaf93dd9ada5e90ea229add92aa30898b2d1" +checksum = "d551054acec0398ca604512310b77ce05c46f66e54b54d48200a686e385cca4e" dependencies = [ "ahash 0.8.12", - "arrow 56.2.0", - "arrow-ipc 56.2.0", - "base64 0.22.1", + "arrow 57.3.0", + "arrow-ipc 57.3.0", "chrono", "half", - "hashbrown 0.14.5", - "indexmap 2.12.0", + "hashbrown 0.16.1", + "indexmap 2.14.0", "libc", "log", "paste", - "sqlparser 0.58.0", "tokio", "web-time", ] @@ -6480,7 +6874,7 @@ dependencies = [ "chrono", "delta_kernel_derive", "futures", - "indexmap 2.12.0", + "indexmap 2.14.0", "itertools 0.14.0", "object_store", "parquet 55.1.0", @@ -6543,7 +6937,7 @@ dependencies = [ "either", "futures", "humantime", - "indexmap 2.12.0", + "indexmap 2.14.0", "itertools 0.14.0", "maplit", "num-bigint", @@ -7441,6 +7835,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "flatbuffers" version = "25.2.10" @@ -7507,6 +7907,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -7656,11 +8062,10 @@ dependencies = [ [[package]] name = "fsst" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a7cb31412f3f09e2e0487281b2920fd4c19aba352c3db6e93e7d63d8f2081eb" +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" dependencies = [ - "arrow-array 56.2.0", + "arrow-array 58.1.0", "rand 0.9.2", ] @@ -8065,7 +8470,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" dependencies = [ "fallible-iterator", - "indexmap 2.12.0", + "indexmap 2.14.0", "stable_deref_trait", ] @@ -9068,7 +9473,7 @@ dependencies = [ "futures-sink", 
"futures-util", "http 0.2.12", - "indexmap 2.12.0", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -9087,7 +9492,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.3.1", - "indexmap 2.12.0", + "indexmap 2.14.0", "slab", "tokio", "tokio-util", @@ -9115,13 +9520,14 @@ checksum = "3b42eb4efef1f96510ae1a33b2682562a677d504641e9903a77bf5c666b9013e" [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "num-traits", + "zerocopy", ] [[package]] @@ -9179,14 +9585,25 @@ checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] name = "hashbrown" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "hashlink" @@ -9788,19 +10205,19 @@ dependencies = [ [[package]] name = "iceberg" version = "0.8.0" -source = "git+https://github.com/databendlabs/iceberg-rust?rev=6ccaa60e#6ccaa60e3da324b570a1288cd9b5977c70981930" +source = "git+https://github.com/SkyFan2002/iceberg-rust.git?rev=edb4e4f8158821d274850d19d2c7d0030e16e142#edb4e4f8158821d274850d19d2c7d0030e16e142" dependencies = [ "anyhow", "apache-avro 0.21.0", "array-init", - "arrow-arith 56.2.0", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-ord 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", - "arrow-string 56.2.0", + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "arrow-string 58.1.0", "as-any", "async-trait", "backon", @@ -9818,9 +10235,9 @@ dependencies = [ "murmur3", "num-bigint", "once_cell", - "opendal", + "opendal 0.55.0", "ordered-float 4.6.0", - "parquet 56.2.0", + "parquet 58.1.0", "rand 0.8.5", "reqsign", "reqwest", @@ -9843,7 +10260,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-glue" version = "0.8.0" -source = "git+https://github.com/databendlabs/iceberg-rust?rev=6ccaa60e#6ccaa60e3da324b570a1288cd9b5977c70981930" +source = "git+https://github.com/SkyFan2002/iceberg-rust.git?rev=edb4e4f8158821d274850d19d2c7d0030e16e142#edb4e4f8158821d274850d19d2c7d0030e16e142" dependencies = [ "anyhow", "async-trait", @@ -9858,7 +10275,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-hms" version = "0.8.0" -source = "git+https://github.com/databendlabs/iceberg-rust?rev=6ccaa60e#6ccaa60e3da324b570a1288cd9b5977c70981930" +source = "git+https://github.com/SkyFan2002/iceberg-rust.git?rev=edb4e4f8158821d274850d19d2c7d0030e16e142#edb4e4f8158821d274850d19d2c7d0030e16e142" dependencies = [ "anyhow", "async-trait", @@ -9880,7 +10297,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-rest" version = "0.8.0" -source = 
"git+https://github.com/databendlabs/iceberg-rust?rev=6ccaa60e#6ccaa60e3da324b570a1288cd9b5977c70981930" +source = "git+https://github.com/SkyFan2002/iceberg-rust.git?rev=edb4e4f8158821d274850d19d2c7d0030e16e142#edb4e4f8158821d274850d19d2c7d0030e16e142" dependencies = [ "async-trait", "chrono", @@ -9900,7 +10317,7 @@ dependencies = [ [[package]] name = "iceberg-catalog-s3tables" version = "0.8.0" -source = "git+https://github.com/databendlabs/iceberg-rust?rev=6ccaa60e#6ccaa60e3da324b570a1288cd9b5977c70981930" +source = "git+https://github.com/SkyFan2002/iceberg-rust.git?rev=edb4e4f8158821d274850d19d2c7d0030e16e142#edb4e4f8158821d274850d19d2c7d0030e16e142" dependencies = [ "anyhow", "async-trait", @@ -10113,22 +10530,16 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.12.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.0", + "hashbrown 0.17.0", "serde", "serde_core", ] -[[package]] -name = "indoc" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" - [[package]] name = "inferno" version = "0.11.21" @@ -10136,7 +10547,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" dependencies = [ "ahash 0.8.12", - "indexmap 2.12.0", + "indexmap 2.14.0", "is-terminal", "itoa", "log", @@ -10237,6 +10648,17 @@ version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06432fb54d3be7964ecd3649233cddf80db2832f47fec34c01f65b3d9d774983" +[[package]] +name = "io-uring" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd7bddefd0a8833b88a4b68f90dae22c7450d11b354198baee3874fd811b344" +dependencies = [ + "bitflags 2.9.0", + "cfg-if", + "libc", +] + [[package]] name = "ipconfig" version = "0.3.2" @@ -10370,9 +10792,9 @@ version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01dbdbd07b076e8403abac68ce7744d93e2ecd953bbc44bf77bf00e1e81172bc" dependencies = [ - "foldhash", + "foldhash 0.1.5", "hifijson", - "indexmap 2.12.0", + "indexmap 2.14.0", "jaq-core", "jaq-std", "serde_json", @@ -10569,7 +10991,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ee7893dab2e44ae5f9d0173f26ff4aa327c10b01b06a72b52dd9405b628640d" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", ] [[package]] @@ -10603,17 +11025,19 @@ dependencies = [ [[package]] name = "lance-arrow" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b53669b967e3800e3ef80b6d32451ca1f4a98adab5a0ea180b57451b45ef4f7e" -dependencies = [ - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-ipc 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "bytes", + "futures", "getrandom 
0.2.16", "half", "jsonb", @@ -10623,9 +11047,8 @@ dependencies = [ [[package]] name = "lance-bitpacking" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ab4c7e3d69d4318e4078c38f3bba4e24d70eed8dad01ef2646a3639da59cdcf" +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" dependencies = [ "arrayref", "paste", @@ -10634,19 +11057,19 @@ dependencies = [ [[package]] name = "lance-core" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bc4d37a02c3212343e87008e9140ce8d28c370b45c31ca9cadcb3528b588162" +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" dependencies = [ - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-schema 56.2.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-schema 58.1.0", "async-trait", "byteorder", "bytes", "chrono", "deepsize", "futures", + "itertools 0.13.0", "lance-arrow", "libc", "log", @@ -10655,11 +11078,11 @@ dependencies = [ "num_cpus", "object_store", "pin-project", - "prost", + "prost 0.14.3", "rand 0.9.2", - "roaring 0.10.12", + "roaring 0.11.3", "serde_json", - "snafu", + "snafu 0.9.0", "tempfile", "tokio", "tokio-stream", @@ -10670,17 +11093,16 @@ dependencies = [ [[package]] name = "lance-encoding" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "226f5cc09ff8f52f7648ac3aeb0193c9a041f881108de6f7a928cf23a802165d" -dependencies = [ - "arrow-arith 56.2.0", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" +dependencies = [ + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "bytemuck", "byteorder", "bytes", @@ -10695,12 +11117,12 @@ dependencies = [ "log", "lz4", "num-traits", - "prost", - "prost-build", - "prost-types", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", "protobuf-src", "rand 0.9.2", - "snafu", + "snafu 0.9.0", "strum 0.26.3", "tokio", "tracing", @@ -10710,16 +11132,15 @@ dependencies = [ [[package]] name = "lance-file" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcc5f0bfc5fb352cd1f127968dc3118320fa5c339bc1d41667a75db0dc44e185" -dependencies = [ - "arrow-arith 56.2.0", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" +dependencies = [ + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "async-recursion", "async-trait", "byteorder", @@ -10734,29 +11155,28 @@ dependencies = [ "log", "num-traits", "object_store", - "prost", - "prost-build", - "prost-types", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", "protobuf-src", - "snafu", + "snafu 0.9.0", "tokio", "tracing", ] 
[[package]] name = "lance-io" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09dda9825d75111d83a3bd63fcbd6dbaa1f0ae9aa8b791dfb32fb5d521055c6b" -dependencies = [ - "arrow 56.2.0", - "arrow-arith 56.2.0", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" +dependencies = [ + "arrow 58.1.0", + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "async-recursion", "async-trait", "byteorder", @@ -10764,18 +11184,22 @@ dependencies = [ "chrono", "deepsize", "futures", + "http 1.3.1", + "io-uring", "lance-arrow", "lance-core", "lance-namespace", + "libc", "log", + "moka", "object_store", "path_abs", "pin-project", - "prost", + "prost 0.14.3", "rand 0.9.2", "serde", - "shellexpand 3.1.2", - "snafu", + "snafu 0.9.0", + "tempfile", "tokio", "tracing", "url", @@ -10783,23 +11207,23 @@ dependencies = [ [[package]] name = "lance-namespace" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f89added88947b232179e39c400dd6091498be7cfe29cefd464c0bf31d6f3ad" +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" dependencies = [ - "arrow 56.2.0", + "arrow 58.1.0", "async-trait", "bytes", "lance-core", "lance-namespace-reqwest-client", - "snafu", + "serde", + "snafu 0.9.0", ] [[package]] name = "lance-namespace-reqwest-client" -version = "0.4.5" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" +checksum = "ee2e48de899e2931afb67fcddd0a08e439bf5d8b6ea2a2ed9cb8f4df669bd5cc" dependencies = [ "reqwest", "serde", @@ -10810,15 +11234,14 @@ dependencies = [ [[package]] name = "lance-table" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee04ca678959cd94f11fe4c4229eebcce636f0ea1094d2c81a0ee5e0458c1a2" -dependencies = [ - "arrow 56.2.0", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", +version = "5.1.0-beta.3" +source = "git+https://github.com/SkyFan2002/lance.git?rev=983238285f482cc6d229483d15d6dde8f3bc3e96#983238285f482cc6d229483d15d6dde8f3bc3e96" +dependencies = [ + "arrow 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", "async-trait", "byteorder", "bytes", @@ -10831,17 +11254,17 @@ dependencies = [ "lance-io", "log", "object_store", - "prost", - "prost-build", - "prost-types", + "prost 0.14.3", + "prost-build 0.14.3", + "prost-types 0.14.3", "protobuf-src", "rand 0.9.2", "rangemap", - "roaring 0.10.12", + "roaring 0.11.3", "semver", "serde", "serde_json", - "snafu", + "snafu 0.9.0", "tokio", "tracing", "url", @@ -11271,6 +11694,24 @@ dependencies = [ "twox-hash 1.6.3", ] +[[package]] +name = "lz4_flex" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +dependencies = [ + "twox-hash 2.1.0", +] + +[[package]] +name = "lz4_flex" +version = "0.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" +dependencies = [ + "twox-hash 2.1.0", +] + [[package]] name = "lzma-rs" version = "0.3.0" @@ -11354,11 +11795,11 @@ checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" [[package]] name = "matchers" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" dependencies = [ - "regex-automata 0.1.10", + "regex-automata", ] [[package]] @@ -11950,12 +12391,11 @@ dependencies = [ [[package]] name = "nu-ansi-term" -version = "0.46.0" +version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "overload", - "winapi", + "windows-sys 0.61.2", ] [[package]] @@ -12136,7 +12576,7 @@ dependencies = [ "crc32fast", "flate2", "hashbrown 0.15.3", - "indexmap 2.12.0", + "indexmap 2.14.0", "memchr", "ruzstd", ] @@ -12189,7 +12629,7 @@ dependencies = [ "bytes", "futures", "object_store", - "opendal", + "opendal 0.54.1", "pin-project", "tokio", ] @@ -12240,7 +12680,7 @@ dependencies = [ "moka", "percent-encoding", "prometheus-client 0.23.1", - "prost", + "prost 0.13.5", "quick-xml 0.38.4", "reqsign", "reqwest", @@ -12251,6 +12691,35 @@ dependencies = [ "uuid", ] +[[package]] +name = "opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a" +dependencies = [ + "anyhow", + "backon", + "base64 0.22.1", + "bytes", + "crc32c", + "futures", + "getrandom 0.2.16", + "http 1.3.1", + "http-body 1.0.1", + "jiff", + "log", + "md-5", + "percent-encoding", + "quick-xml 0.38.4", + "reqsign", + "reqwest", + "serde", + "serde_json", + "tokio", + "url", + "uuid", +] + [[package]] name = "openraft" version = "0.10.0-alpha.17" @@ -12447,7 +12916,7 @@ dependencies = [ "opentelemetry-http 0.28.0", "opentelemetry-proto 0.28.0", "opentelemetry_sdk 0.28.0", - "prost", + "prost 0.13.5", "reqwest", "serde_json", "thiserror 2.0.18", @@ -12468,7 +12937,7 @@ dependencies = [ "opentelemetry-http 0.29.0", "opentelemetry-proto 0.29.0", "opentelemetry_sdk 0.29.0", - "prost", + "prost 0.13.5", "reqwest", "thiserror 2.0.18", "tokio", @@ -12486,7 +12955,7 @@ dependencies = [ "hex", "opentelemetry 0.28.0", "opentelemetry_sdk 0.28.0", - "prost", + "prost 0.13.5", "serde", "tonic 0.12.3", ] @@ -12499,7 +12968,7 @@ checksum = "8c40da242381435e18570d5b9d50aca2a4f4f4d8e146231adb4e7768023309b3" dependencies = [ "opentelemetry 0.29.1", "opentelemetry_sdk 0.29.0", - "prost", + "prost 0.13.5", "tonic 0.12.3", ] @@ -12547,9 +13016,9 @@ dependencies = [ [[package]] name = "orc-rust" version = "0.6.0" -source = "git+https://github.com/datafuse-extras/orc-rust?rev=fc812ad7010#fc812ad7010c5ab9753a0d93a31dbeed0bcbf3b7" +source = "git+https://github.com/SkyFan2002/orc-rust.git?rev=3fba34ccde628b4063d7f892d15866f89236064c#3fba34ccde628b4063d7f892d15866f89236064c" dependencies = [ - "arrow 56.2.0", + "arrow 58.1.0", "async-trait", "bytemuck", "bytes", @@ -12559,11 +13028,11 @@ dependencies = [ "flate2", "futures", "futures-util", - "lz4_flex", + "lz4_flex 0.11.3", "lzokay-native", "num", - 
"prost", - "snafu", + "prost 0.13.5", + "snafu 0.8.5", "snap", "tokio", "zstd 0.13.3", @@ -12614,12 +13083,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "ownedbytes" version = "0.9.0" @@ -12734,7 +13197,7 @@ dependencies = [ "futures", "half", "hashbrown 0.15.3", - "lz4_flex", + "lz4_flex 0.11.3", "num", "num-bigint", "object_store", @@ -12750,18 +13213,17 @@ dependencies = [ [[package]] name = "parquet" -version = "56.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" dependencies = [ "ahash 0.8.12", - "arrow-array 56.2.0", - "arrow-buffer 56.2.0", - "arrow-cast 56.2.0", - "arrow-data 56.2.0", - "arrow-ipc 56.2.0", - "arrow-schema 56.2.0", - "arrow-select 56.2.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "base64 0.22.1", "brotli 8.0.1", "bytes", @@ -12769,10 +13231,11 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.16.0", - "lz4_flex", - "num", + "hashbrown 0.16.1", + "lz4_flex 0.13.0", "num-bigint", + "num-integer", + "num-traits", "paste", "seq-macro", "simdutf8", @@ -12932,12 +13395,23 @@ version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ - "fixedbitset", - "indexmap 2.12.0", + "fixedbitset 0.4.2", + "indexmap 2.14.0", "serde", "serde_derive", ] +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset 0.5.7", + "hashbrown 0.15.3", + "indexmap 2.14.0", +] + [[package]] name = "phf" version = "0.11.3" @@ -13561,7 +14035,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.13.5", +] + +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive 0.14.3", ] [[package]] @@ -13575,10 +14059,31 @@ dependencies = [ "log", "multimap", "once_cell", - "petgraph", + "petgraph 0.6.5", + "prettyplease", + "prost 0.13.5", + "prost-types 0.13.5", + "regex", + "syn 2.0.106", + "tempfile", +] + +[[package]] +name = "prost-build" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +dependencies = [ + "heck 0.5.0", + "itertools 0.14.0", + "log", + "multimap", + "petgraph 0.8.3", "prettyplease", - "prost", - "prost-types", + "prost 0.14.3", + "prost-types 0.14.3", + "pulldown-cmark", + "pulldown-cmark-to-cmark", "regex", "syn 2.0.106", "tempfile", @@ -13597,13 +14102,35 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = 
"prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "prost-types" version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ - "prost", + "prost 0.13.5", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost 0.14.3", ] [[package]] @@ -13639,7 +14166,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4aeaa1f2460f1d348eeaeed86aea999ce98c1bded6f089ff8514c9d9dbdc973" dependencies = [ "anyhow", - "indexmap 2.12.0", + "indexmap 2.14.0", "log", "protobuf", "protobuf-support", @@ -13796,6 +14323,26 @@ dependencies = [ "psl-types", ] +[[package]] +name = "pulldown-cmark" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" +dependencies = [ + "bitflags 2.9.0", + "memchr", + "unicase", +] + +[[package]] +name = "pulldown-cmark-to-cmark" +version = "22.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50793def1b900256624a709439404384204a5dc3a6ec580281bfaac35e882e90" +dependencies = [ + "pulldown-cmark", +] + [[package]] name = "pulley-interpreter" version = "27.0.0" @@ -13809,37 +14356,33 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "cf85e27e86080aafd5a22eae58a162e133a589551542b3e5cee4beb27e54f8e1" dependencies = [ - "indoc", "libc", - "memoffset", "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", - "unindent", ] [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "8bf94ee265674bf76c09fa430b0e99c26e319c945d96ca0d5a8215f31bf81cf7" dependencies = [ - "once_cell", "python3-dll-a", - "target-lexicon 0.13.2", + "target-lexicon 0.13.5", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "491aa5fc66d8059dd44a75f4580a2962c1862a1c2945359db36f6c2818b748dc" dependencies = [ "libc", "pyo3-build-config", @@ -13847,9 +14390,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "f5d671734e9d7a43449f8480f8b38115df67bef8d21f76837fa75ee7aaa5e52e" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -13859,9 +14402,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = "22faaa1ce6c430a1f71658760497291065e6450d7b5dc2bcf254d49f66ee700a" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -14293,30 +14836,21 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.1" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.9", + "regex-automata", "regex-syntax 0.8.5", ] [[package]] name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax 0.6.29", -] - -[[package]] -name = "regex-automata" -version = "0.4.9" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -14544,7 +15078,7 @@ checksum = "1e147371c75553e1e2fcdb483944a8540b8438c31426279553b9a8182a9b7b65" dependencies = [ "bytes", "hashbrown 0.15.3", - "indexmap 2.12.0", + "indexmap 2.14.0", "munge", "ptr_meta 0.3.0", "rancor", @@ -14683,7 +15217,7 @@ dependencies = [ "convert_case 0.6.0", "fnv", "ident_case", - "indexmap 2.12.0", + "indexmap 2.14.0", "proc-macro-crate 1.3.1", "proc-macro-error 1.0.4", "proc-macro2", @@ -15198,7 +15732,7 @@ version = "0.1.0" source = "git+https://github.com/datafuse-extras/serde-bridge?rev=4f0e99a#4f0e99abc82de8a82046415c90f01f9739bad630" dependencies = [ "anyhow", - "indexmap 2.12.0", + "indexmap 2.14.0", "serde", ] @@ -15274,7 +15808,7 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "itoa", "memchr", "serde", @@ -15352,7 +15886,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.12.0", + "indexmap 2.14.0", "serde", "serde_derive", "serde_json", @@ -15378,7 +15912,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "itoa", "ryu", "serde", @@ -15391,7 +15925,7 @@ version = "0.1.0" source = "git+https://github.com/datafuse-extras/serfig?rev=610ac6d#610ac6dfa251206d3667d169b772de7b0f05d151" dependencies = [ "anyhow", - "indexmap 2.12.0", + "indexmap 2.14.0", "log", "serde", "serde-bridge", @@ -15461,15 +15995,6 @@ dependencies = [ "dirs", ] -[[package]] -name = "shellexpand" -version = "3.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32824fab5e16e6c4d86dc1ba84489390419a39f97699852b66480bb87d297ed8" -dependencies = [ - "dirs", -] - [[package]] name = "shlex" version = "1.3.0" @@ -15656,7 +16181,16 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" dependencies = [ - "snafu-derive", + "snafu-derive 0.8.5", +] + +[[package]] +name = "snafu" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "d1d4bced6a69f90b2056c03dcff2c4737f98d6fb9e0853493996e1d253ca29c6" +dependencies = [ + "snafu-derive 0.9.0", ] [[package]] @@ -15671,6 +16205,18 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "snafu-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54254b8531cafa275c5e096f62d48c81435d1015405a91198ddb11e967301d40" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "snailquote" version = "0.3.1" @@ -15852,27 +16398,6 @@ dependencies = [ "recursive", ] -[[package]] -name = "sqlparser" -version = "0.58.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" -dependencies = [ - "log", - "sqlparser_derive", -] - -[[package]] -name = "sqlparser_derive" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "sqlx" version = "0.8.5" @@ -15904,7 +16429,7 @@ dependencies = [ "futures-util", "hashbrown 0.15.3", "hashlink 0.10.0", - "indexmap 2.12.0", + "indexmap 2.14.0", "log", "memchr", "once_cell", @@ -16553,7 +17078,7 @@ dependencies = [ "levenshtein_automata", "log", "lru", - "lz4_flex", + "lz4_flex 0.11.3", "measure_time", "memmap2", "once_cell", @@ -16693,9 +17218,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "target-lexicon" -version = "0.13.2" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" [[package]] name = "target-triple" @@ -17098,7 +17623,7 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75129e1dc5000bfbaa9fee9d1b21f974f9fbad9daec557a521ee6e080825f6e8" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "serde", "serde_spanned 1.0.0", "toml_datetime 0.7.0", @@ -17131,7 +17656,7 @@ version = "0.19.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "serde", "serde_spanned 0.6.8", "toml_datetime 0.6.9", @@ -17144,7 +17669,7 @@ version = "0.22.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e" dependencies = [ - "indexmap 2.12.0", + "indexmap 2.14.0", "serde", "serde_spanned 0.6.8", "toml_datetime 0.6.9", @@ -17194,7 +17719,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.5", "socket2 0.5.9", "tokio", "tokio-stream", @@ -17223,7 +17748,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost", + "prost 0.13.5", "rustls-native-certs 0.8.1", "socket2 0.5.9", "tokio", @@ -17235,6 +17760,37 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" +dependencies = [ + "async-trait", + "axum 0.8.7", + "base64 0.22.1", + "bytes", + "h2 0.4.13", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper 
1.8.1", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "rustls-native-certs 0.8.1", + "socket2 0.6.1", + "sync_wrapper", + "tokio", + "tokio-rustls 0.26.2", + "tokio-stream", + "tower 0.5.2", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" version = "0.13.1" @@ -17243,10 +17799,49 @@ checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847" dependencies = [ "prettyplease", "proc-macro2", - "prost-build", - "prost-types", + "prost-build 0.13.5", + "prost-types 0.13.5", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "tonic-build" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1882ac3bf5ef12877d7ed57aad87e75154c11931c2ba7e6cde5e22d63522c734" +dependencies = [ + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "tonic-prost" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" +dependencies = [ + "bytes", + "prost 0.14.3", + "tonic 0.14.5", +] + +[[package]] +name = "tonic-prost-build" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3144df636917574672e93d0f56d7edec49f90305749c668df5101751bb8f95a" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build 0.14.3", + "prost-types 0.14.3", "quote", "syn 2.0.106", + "tempfile", + "tonic-build 0.14.5", ] [[package]] @@ -17255,8 +17850,8 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1" dependencies = [ - "prost", - "prost-types", + "prost 0.13.5", + "prost-types 0.13.5", "tokio", "tokio-stream", "tonic 0.13.1", @@ -17290,7 +17885,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.12.0", + "indexmap 2.14.0", "pin-project-lite", "slab", "sync_wrapper", @@ -17399,14 +17994,14 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.19" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" dependencies = [ "matchers", "nu-ansi-term", "once_cell", - "regex", + "regex-automata", "serde", "serde_json", "sharded-slab", @@ -18105,7 +18700,7 @@ dependencies = [ "ahash 0.8.12", "bitflags 2.9.0", "hashbrown 0.14.5", - "indexmap 2.12.0", + "indexmap 2.14.0", "semver", "serde", ] @@ -18117,7 +18712,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "808198a69b5a0535583370a51d459baa14261dfab04800c4864ee9e1a14346ed" dependencies = [ "bitflags 2.9.0", - "indexmap 2.12.0", + "indexmap 2.14.0", "semver", ] @@ -18149,7 +18744,7 @@ dependencies = [ "fxprof-processed-profile", "gimli 0.31.1", "hashbrown 0.14.5", - "indexmap 2.12.0", + "indexmap 2.14.0", "ittapi", "libc", "libm", @@ -18275,7 +18870,7 @@ dependencies = [ "cranelift-bitset", "cranelift-entity", "gimli 0.31.1", - "indexmap 2.12.0", + "indexmap 2.14.0", "log", "object", "postcard", @@ -18371,7 +18966,7 @@ checksum = "bf3963c9c29df91564d8bd181eb00d0dbaeafa1b2a01e15952bb7391166b704e" dependencies = [ "anyhow", "heck 0.5.0", - "indexmap 2.12.0", + "indexmap 2.14.0", "wit-parser", ] @@ -18521,7 +19116,7 @@ dependencies = [ 
"heck 0.5.0", "proc-macro2", "quote", - "shellexpand 2.1.2", + "shellexpand", "syn 2.0.106", "witx", ] @@ -19154,7 +19749,7 @@ checksum = "ca004bb251010fe956f4a5b9d4bf86b4e415064160dd6669569939e8cbf2504f" dependencies = [ "anyhow", "id-arena", - "indexmap 2.12.0", + "indexmap 2.14.0", "log", "semver", "serde", @@ -19398,7 +19993,7 @@ dependencies = [ "flate2", "getrandom 0.3.3", "hmac", - "indexmap 2.12.0", + "indexmap 2.14.0", "lzma-rs", "memchr", "pbkdf2", diff --git a/Cargo.toml b/Cargo.toml index 05008cdc325a2..8b14fcfb0f118 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -180,18 +180,18 @@ anyerror = { version = "=0.1.13" } anyhow = { version = "1.0.65" } apache-avro = { version = "0.17.0", features = ["snappy", "zstandard", "xz", "snappy", "bzip"] } approx = "0.5.1" -arrow = { version = "56" } -arrow-array = { version = "56" } -arrow-buffer = { version = "56" } -arrow-cast = { version = "56", features = ["prettyprint"] } -arrow-csv = { version = "56" } -arrow-data = { version = "56" } -arrow-flight = { version = "56", features = ["flight-sql-experimental", "tls-ring"] } -arrow-ipc = { version = "56", features = ["lz4", "zstd"] } -arrow-json = { version = "56" } -arrow-ord = { version = "56" } -arrow-schema = { version = "56", features = ["serde"] } -arrow-select = { version = "56" } +arrow = { version = "58.1.0" } +arrow-array = { version = "58.1.0" } +arrow-buffer = { version = "58.1.0" } +arrow-cast = { version = "58.1.0", features = ["prettyprint"] } +arrow-csv = { version = "58.1.0" } +arrow-data = { version = "58.1.0" } +arrow-flight = { version = "58.1.0", features = ["flight-sql-experimental", "tls-ring"] } +arrow-ipc = { version = "58.1.0", features = ["lz4", "zstd"] } +arrow-json = { version = "58.1.0" } +arrow-ord = { version = "58.1.0" } +arrow-schema = { version = "58.1.0", features = ["serde"] } +arrow-select = { version = "58.1.0" } arrow-udf-runtime = { version = "0.8.0", default-features = false, features = ["javascript", "wasm"] } async-backtrace = "0.2" async-channel = "2.3.1" @@ -298,13 +298,13 @@ proc-macro2 = "1.0" quote = "1.0" ## in branch dev -iceberg = { version = "0.8.0", git = "https://github.com/databendlabs/iceberg-rust", rev = "6ccaa60e", features = [ +iceberg = { version = "0.8.0", git = "https://github.com/SkyFan2002/iceberg-rust.git", rev = "edb4e4f8158821d274850d19d2c7d0030e16e142", features = [ "storage-all", ] } -iceberg-catalog-glue = { version = "0.8.0", git = "https://github.com/databendlabs/iceberg-rust", rev = "6ccaa60e" } -iceberg-catalog-hms = { version = "0.8.0", git = "https://github.com/databendlabs/iceberg-rust", rev = "6ccaa60e" } -iceberg-catalog-rest = { version = "0.8.0", git = "https://github.com/databendlabs/iceberg-rust", rev = "6ccaa60e" } -iceberg-catalog-s3tables = { version = "0.8.0", git = "https://github.com/databendlabs/iceberg-rust", rev = "6ccaa60e" } +iceberg-catalog-glue = { version = "0.8.0", git = "https://github.com/SkyFan2002/iceberg-rust.git", rev = "edb4e4f8158821d274850d19d2c7d0030e16e142" } +iceberg-catalog-hms = { version = "0.8.0", git = "https://github.com/SkyFan2002/iceberg-rust.git", rev = "edb4e4f8158821d274850d19d2c7d0030e16e142" } +iceberg-catalog-rest = { version = "0.8.0", git = "https://github.com/SkyFan2002/iceberg-rust.git", rev = "edb4e4f8158821d274850d19d2c7d0030e16e142" } +iceberg-catalog-s3tables = { version = "0.8.0", git = "https://github.com/SkyFan2002/iceberg-rust.git", rev = "edb4e4f8158821d274850d19d2c7d0030e16e142" } # Explicitly specify compatible AWS SDK versions aws-config = "1.5.18" @@ 
-324,11 +324,11 @@ jaq-std = "2.1.2" jiff = { version = "0.2.16", features = ["serde", "tzdb-bundle-always"] } jsonb = "0.5.6" jwt-simple = { version = "0.12.12", default-features = false, features = ["pure-rust"] } -lance-core = "1.0.4" -lance-encoding = { version = "1.0.4", features = ["protoc"] } -lance-file = "1.0.4" -lance-io = { version = "1.0.4", default-features = false } -lance-table = "1.0.4" +lance-core = "5.1.0-beta.3" +lance-encoding = { version = "5.1.0-beta.3", features = ["protoc"] } +lance-file = "5.1.0-beta.3" +lance-io = { version = "5.1.0-beta.3", default-features = false } +lance-table = "5.1.0-beta.3" lenient_semver = "0.4.2" levenshtein_automata = "0.2.1" lexical-core = "1" @@ -389,7 +389,7 @@ orc-rust = "0.6.0" ordered-float = { version = "5.1.0", default-features = false } p256 = "0.13" parking_lot = "0.12.1" -parquet = { version = "56", features = ["async"] } +parquet = { version = "58.1.0", features = ["async"] } passwords = { version = "3.1.16" } paste = "1.0.15" percent-encoding = "2.3.1" @@ -406,8 +406,8 @@ pretty_assertions = "1.3.0" procfs = { version = "0.17.0" } proj4rs = { version = "0.1.10", features = ["geo-types", "crs-definitions"] } proptest = { version = "1", default-features = false, features = ["std"] } -prost = { version = "0.13" } -prost-build = { version = "0.13" } +prost = { version = "0.14.3" } +prost-build = { version = "0.14.3" } prqlc = "0.11.3" rand = { version = "0.8.5", features = ["small_rng", "serde1"] } rand_distr = "0.4.3" @@ -489,9 +489,11 @@ tokio = { version = "1.35.0", features = ["full"] } tokio-stream = { version = "0.1.11", features = ["net"] } tokio-util = { version = "0.7.13" } toml = { version = "0.8", features = ["parse"] } -tonic = { version = "0.13", features = ["transport", "codegen", "tls-native-roots"] } -tonic-build = { version = "0.13" } -tonic-reflection = { version = "0.13" } +tonic = { version = "0.14.5", features = ["transport", "codegen", "tls-native-roots"] } +tonic-build = { version = "0.14.5" } +tonic-prost = { version = "0.14.5" } +tonic-prost-build = { version = "0.14.5" } +tonic-reflection = { version = "0.14.5" } tower = { version = "0.5.1", features = ["util"] } tower-service = "0.3.3" twox-hash = "1.6.3" @@ -616,7 +618,7 @@ overflow-checks = true rpath = true [patch.crates-io] -arrow-udf-runtime = { git = "https://github.com/datafuse-extras/arrow-udf.git", rev = "2480dccf1" } +arrow-udf-runtime = { git = "https://github.com/SkyFan2002/arrow-udf.git", rev = "ec0c74d1bb53d68243c2bb23ebaa9392223e094c" } async-backtrace = { git = "https://github.com/datafuse-extras/async-backtrace.git", rev = "dea4553" } async-recursion = { git = "https://github.com/datafuse-extras/async-recursion.git", rev = "a353334" } backtrace = { git = "https://github.com/rust-lang/backtrace-rs.git", rev = "72265be" } @@ -630,7 +632,14 @@ deltalake = { git = "https://github.com/delta-io/delta-rs", rev = "9954bff" } map-api = { git = "https://github.com/databendlabs/map-api", tag = "v0.4.2" } openraft = { git = "https://github.com/databendlabs/openraft", tag = "v0.10.0-alpha.17" } openraft-rt = { git = "https://github.com/databendlabs/openraft", tag = "v0.10.0-alpha.17" } -orc-rust = { git = "https://github.com/datafuse-extras/orc-rust", rev = "fc812ad7010" } +# Keep ORC on a local patch while the upstream crate has not caught up with Arrow 58 yet. 
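+# The lance-* entries below are pinned to one fork revision for the same reason: the
+# lance 5.1.0-beta.3 pins above are assumed to require these patched sources until an
+# upstream lance release that builds against Arrow 58 is available.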
+lance-arrow = { git = "https://github.com/SkyFan2002/lance.git", rev = "983238285f482cc6d229483d15d6dde8f3bc3e96" } +lance-core = { git = "https://github.com/SkyFan2002/lance.git", rev = "983238285f482cc6d229483d15d6dde8f3bc3e96" } +lance-encoding = { git = "https://github.com/SkyFan2002/lance.git", rev = "983238285f482cc6d229483d15d6dde8f3bc3e96" } +lance-file = { git = "https://github.com/SkyFan2002/lance.git", rev = "983238285f482cc6d229483d15d6dde8f3bc3e96" } +lance-io = { git = "https://github.com/SkyFan2002/lance.git", rev = "983238285f482cc6d229483d15d6dde8f3bc3e96" } +lance-table = { git = "https://github.com/SkyFan2002/lance.git", rev = "983238285f482cc6d229483d15d6dde8f3bc3e96" } +orc-rust = { git = "https://github.com/SkyFan2002/orc-rust.git", rev = "3fba34ccde628b4063d7f892d15866f89236064c" } recursive = { git = "https://github.com/datafuse-extras/recursive.git", rev = "16e433a" } sled = { git = "https://github.com/datafuse-extras/sled", tag = "v0.34.7-datafuse.1" } state-machine-api = { git = "https://github.com/databendlabs/state-machine-api.git", tag = "v0.3.4" } diff --git a/src/bendpy/Cargo.toml b/src/bendpy/Cargo.toml index 779b7e897f2e7..e19a6ee582f6e 100644 --- a/src/bendpy/Cargo.toml +++ b/src/bendpy/Cargo.toml @@ -7,7 +7,7 @@ publish = { workspace = true } edition = { workspace = true } [build-dependencies] -pyo3-build-config = "0.25" +pyo3-build-config = "0.28" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] @@ -32,7 +32,7 @@ databend-query = { workspace = true, features = [ "simd", "disable_initial_exec_tls", ] } -pyo3 = { version = "0.25", features = ["generate-import-lib", "abi3-py312"] } +pyo3 = { version = "0.28", features = ["generate-import-lib", "abi3-py312"] } serde_json = { workspace = true } sysinfo = { workspace = true } tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "sync"] } diff --git a/src/bendpy/src/context.rs b/src/bendpy/src/context.rs index 15b91583156d6..0467326e222c8 100644 --- a/src/bendpy/src/context.rs +++ b/src/bendpy/src/context.rs @@ -96,7 +96,12 @@ fn build_position_select(col_names: &[String]) -> PyResult { .join(", ")) } -#[pyclass(name = "SessionContext", module = "databend", subclass)] +#[pyclass( + name = "SessionContext", + module = "databend", + subclass, + skip_from_py_object +)] #[derive(Clone)] pub(crate) struct PySessionContext { pub(crate) session: Arc, diff --git a/src/bendpy/src/dataframe.rs b/src/bendpy/src/dataframe.rs index 700b9eaee96eb..cc96a790ad1be 100644 --- a/src/bendpy/src/dataframe.rs +++ b/src/bendpy/src/dataframe.rs @@ -23,14 +23,14 @@ use databend_query::interpreters::InterpreterFactory; use databend_query::sessions::QueryContext; use databend_query::sql::plans::Plan; use pyo3::prelude::*; -use pyo3::types::PyTuple; +use pyo3::types::PyAny; use tokio_stream::StreamExt; use crate::datablock::PyDataBlocks; use crate::schema::PySchema; use crate::utils::wait_for_future; -#[pyclass(name = "BoxSize", module = "databend", subclass)] +#[pyclass(name = "BoxSize", module = "databend", subclass, skip_from_py_object)] #[derive(Clone, Debug)] pub(crate) struct PyBoxSize { pub(crate) bs_max_display_rows: usize, @@ -38,7 +38,7 @@ pub(crate) struct PyBoxSize { pub(crate) bs_max_col_width: usize, } -#[pyclass(name = "DataFrame", module = "databend", subclass)] +#[pyclass(name = "DataFrame", module = "databend", subclass, skip_from_py_object)] #[derive(Clone)] pub(crate) struct PyDataFrame { ctx: Arc, @@ -119,7 +119,7 @@ impl PyDataFrame { } } - 
pub fn to_py_arrow(&self, py: Python) -> PyResult> { + pub fn to_py_arrow(&self, py: Python) -> PyResult>> { let blocks = wait_for_future(py, self.df_collect()); let blocks = blocks.map_err(|err| { pyo3::exceptions::PyRuntimeError::new_err(format!("DataFrame collect error: {:?}", err)) @@ -132,50 +132,38 @@ impl PyDataFrame { .to_record_batch_with_dataschema(self.df.schema().as_ref()) .unwrap() .to_pyarrow(py) + .map(Bound::unbind) }) .collect() } /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table - pub fn to_arrow_table(&self, py: Python) -> PyResult { + pub fn to_arrow_table(&self, py: Python<'_>) -> PyResult> { let batches = self.to_py_arrow(py)?.into_pyobject(py)?; let schema = ArrowSchema::from(self.df.schema().as_ref()); let schema = PyArrowType(schema); let schema = schema.into_pyobject(py)?; - Python::with_gil(|py| { - // Instantiate pyarrow Table object and use its from_batches method - let table_class = py.import("pyarrow")?.getattr("Table")?; - let args = PyTuple::new(py, &[batches, schema])?; - let table: PyObject = table_class.call_method1("from_batches", args)?.into(); - Ok(table) - }) + let table_class = py.import("pyarrow")?.getattr("Table")?; + Ok(table_class + .call_method1("from_batches", (batches, schema))? + .unbind()) } /// Convert to pandas dataframe with pyarrow /// Collect the batches, pass to Arrow Table & then convert to Pandas DataFrame - fn to_pandas(&self, py: Python) -> PyResult { + fn to_pandas(&self, py: Python<'_>) -> PyResult> { let table = self.to_arrow_table(py)?; - - Python::with_gil(|py| { - // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas - let result = table.call_method0(py, "to_pandas")?; - Ok(result) - }) + table.call_method0(py, "to_pandas") } /// Convert to polars dataframe with pyarrow /// Collect the batches, pass to Arrow Table & then convert to polars DataFrame - fn to_polars(&self, py: Python) -> PyResult { + fn to_polars(&self, py: Python<'_>) -> PyResult> { let table = self.to_arrow_table(py)?; - - Python::with_gil(|py| { - let dataframe = py.import("polars")?.getattr("DataFrame")?; - let args = PyTuple::new(py, &[table])?; - let result: PyObject = dataframe.call1(args)?.into(); - Ok(result) - }) + let dataframe = py.import("polars")?.getattr("DataFrame")?; + Ok(dataframe.call1((table,))?.unbind()) } } diff --git a/src/bendpy/src/utils.rs b/src/bendpy/src/utils.rs index c11da635491d6..5278757361bb9 100644 --- a/src/bendpy/src/utils.rs +++ b/src/bendpy/src/utils.rs @@ -30,5 +30,5 @@ where F: Future + Send, F::Output: Send, { - py.allow_threads(|| RUNTIME.block_on(f)) + py.detach(|| RUNTIME.block_on(f)) } diff --git a/src/common/cloud_control/Cargo.toml b/src/common/cloud_control/Cargo.toml index 00e8b8e2ab750..c9405624f635b 100644 --- a/src/common/cloud_control/Cargo.toml +++ b/src/common/cloud_control/Cargo.toml @@ -16,17 +16,21 @@ hyper-util = { workspace = true } prost = { workspace = true } serde = { workspace = true } tonic = { workspace = true } +tonic-prost = { workspace = true } [build-dependencies] lenient_semver = { workspace = true } prost-build = { workspace = true } semver = { workspace = true } -tonic-build = { workspace = true } +tonic-prost-build = { workspace = true } [dev-dependencies] anyhow = { workspace = true } tokio = { workspace = true } tower = { workspace = true } +[package.metadata.cargo-machete] +ignored = ["tonic-prost"] + [lints] workspace = true diff --git a/src/common/cloud_control/build.rs b/src/common/cloud_control/build.rs index 
23a76f501109a..62f6c4d96f6c2 100644 --- a/src/common/cloud_control/build.rs +++ b/src/common/cloud_control/build.rs @@ -80,5 +80,5 @@ fn build_proto() -> Result<()> { config.protoc_arg("--experimental_allow_proto3_optional"); } - tonic_build::configure().compile_protos_with_config(config, &proto_defs, &[proto_path]) + tonic_prost_build::configure().compile_with_config(config, &proto_defs, &[proto_path]) } diff --git a/src/common/cloud_control/src/task_utils.rs b/src/common/cloud_control/src/task_utils.rs index 190052d703487..9744df722bdba 100644 --- a/src/common/cloud_control/src/task_utils.rs +++ b/src/common/cloud_control/src/task_utils.rs @@ -17,6 +17,7 @@ use std::fmt::Display; use std::fmt::Formatter; use chrono::DateTime; +use chrono::FixedOffset; use chrono::Utc; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -165,7 +166,7 @@ impl TryFrom for Task { value.next_scheduled_at )) }) - .map(|d| d.with_timezone(&Utc)) + .map(|d: DateTime| d.with_timezone(&Utc)) }) .transpose()?; @@ -180,7 +181,7 @@ impl TryFrom for Task { value.last_suspended_at )) }) - .map(|d| d.with_timezone(&Utc)) + .map(|d: DateTime| d.with_timezone(&Utc)) }) .transpose()?; let schedule = match value.schedule_options { diff --git a/src/common/column/src/binview/mod.rs b/src/common/column/src/binview/mod.rs index 146de14cb4dfc..a572b2c3d7522 100644 --- a/src/common/column/src/binview/mod.rs +++ b/src/common/column/src/binview/mod.rs @@ -658,7 +658,7 @@ impl From for ArrayData { column .buffers .iter() - .map(|x| x.clone().into()) + .map(|buffer| arrow_buffer::Buffer::from(buffer.clone())) .collect::>(), ); unsafe { builder.build_unchecked() } @@ -674,7 +674,7 @@ impl From for ArrayData { column .buffers .iter() - .map(|x| x.clone().into()) + .map(|buffer| arrow_buffer::Buffer::from(buffer.clone())) .collect::>(), ); unsafe { builder.build_unchecked() } @@ -686,8 +686,8 @@ impl From for Utf8ViewColumn { let views = data.buffers()[0].clone(); let buffers = data.buffers()[1..] .iter() - .map(|x| x.clone().into()) - .collect(); + .map(|buffer| crate::buffer::Buffer::from(buffer.clone())) + .collect::]>>(); unsafe { Utf8ViewColumn::new_unchecked_unknown_md(views.into(), buffers, None) } } diff --git a/src/common/column/src/bitmap/mutable.rs b/src/common/column/src/bitmap/mutable.rs index 666b524f061d3..691baca0e2b5c 100644 --- a/src/common/column/src/bitmap/mutable.rs +++ b/src/common/column/src/bitmap/mutable.rs @@ -19,8 +19,6 @@ use std::iter::TrustedLen; use std::ops::Range; use std::sync::Arc; -use databend_common_base::vec_ext::VecExt; - use super::Bitmap; use super::utils::BitChunk; use super::utils::BitChunksExactMut; @@ -267,14 +265,12 @@ impl MutableBitmap { /// The caller must ensure that the [`MutableBitmap`] has sufficient capacity. #[inline] pub unsafe fn push_unchecked(&mut self, value: bool) { - unsafe { - if self.length.is_multiple_of(8) { - self.buffer.push_unchecked(0); - } - let byte = self.buffer.as_mut_slice().last_mut().unwrap(); - *byte = set(*byte, self.length % 8, value); - self.length += 1; + if self.length.is_multiple_of(8) { + self.buffer.push(0); } + let byte = self.buffer.as_mut_slice().last_mut().unwrap(); + *byte = set(*byte, self.length % 8, value); + self.length += 1; } /// Returns the number of unset bits on this [`MutableBitmap`]. 
@@ -525,20 +521,20 @@ unsafe fn extend_aligned_trusted_iter_unchecked( // chunks of 64 bits for _ in 0..chunks { let chunk = get_chunk_unchecked(&mut iterator); - buffer.extend_from_slice_unchecked(&chunk.to_le_bytes()); + buffer.extend_from_slice(&chunk.to_le_bytes()); } // remaining complete bytes for _ in 0..(remainder / 8) { let byte = get_byte_unchecked(8, &mut iterator); - buffer.push_unchecked(byte) + buffer.push(byte) } // remaining bits let remainder = remainder % 8; if remainder > 0 { let byte = get_byte_unchecked(remainder, &mut iterator); - buffer.push_unchecked(byte) + buffer.push(byte) } additional_bits } @@ -744,7 +740,7 @@ impl MutableBitmap { } // SAFETY: Already allocated sufficient capacity - unsafe { buffer.extend_from_slice_unchecked(&packed.to_le_bytes()) } + buffer.extend_from_slice(&packed.to_le_bytes()) } if remainder != 0 { @@ -755,7 +751,7 @@ impl MutableBitmap { } // SAFETY: Already allocated sufficient capacity - unsafe { buffer.extend_from_slice_unchecked(&packed.to_le_bytes()) } + buffer.extend_from_slice(&packed.to_le_bytes()) } buffer.truncate(len.div_ceil(8)); diff --git a/src/common/column/src/types/native.rs b/src/common/column/src/types/native.rs index b8d93bd2e4841..56822e4dc5b05 100644 --- a/src/common/column/src/types/native.rs +++ b/src/common/column/src/types/native.rs @@ -142,11 +142,11 @@ impl NativeType for F32 { } #[inline] fn from_le_bytes(bytes: Self::Bytes) -> Self { - Self(f32::from_le_bytes(bytes)) + OrderedFloat(f32::from_le_bytes(bytes)) } #[inline] fn from_be_bytes(bytes: Self::Bytes) -> Self { - Self(f32::from_be_bytes(bytes)) + OrderedFloat(f32::from_be_bytes(bytes)) } } @@ -164,11 +164,11 @@ impl NativeType for F64 { } #[inline] fn from_le_bytes(bytes: Self::Bytes) -> Self { - Self(f64::from_le_bytes(bytes)) + OrderedFloat(f64::from_le_bytes(bytes)) } #[inline] fn from_be_bytes(bytes: Self::Bytes) -> Self { - Self(f64::from_be_bytes(bytes)) + OrderedFloat(f64::from_be_bytes(bytes)) } } diff --git a/src/common/storage/src/parquet.rs b/src/common/storage/src/parquet.rs index dcf640c839dab..592b8495e275d 100644 --- a/src/common/storage/src/parquet.rs +++ b/src/common/storage/src/parquet.rs @@ -25,6 +25,7 @@ use parquet::arrow::parquet_to_arrow_schema; use parquet::errors::ParquetError; // FIXME(xuanwo): refactor code here. use parquet::file::metadata::FileMetaData; +use parquet::file::metadata::FooterTail; use parquet::file::metadata::ParquetMetaData; use parquet::file::metadata::ParquetMetaDataReader; @@ -108,12 +109,8 @@ pub async fn read_metadata_async( let map_err = |e: ParquetError| ErrorCode::BadBytes(format!("Invalid Parquet file '{path}': {e}",)); - let footer_tail = ParquetMetaDataReader::decode_footer_tail( - &buffer[(buffer_len - FOOTER_SIZE as usize)..] 
- .try_into() - .unwrap(), - ) - .map_err(map_err)?; + let footer_tail = + FooterTail::try_from(&buffer[(buffer_len - FOOTER_SIZE as usize)..]).map_err(map_err)?; let metadata_len = footer_tail.metadata_length() as u64; check_meta_size(file_size, metadata_len, path)?; diff --git a/src/common/tracing/src/init.rs b/src/common/tracing/src/init.rs index 0dff1dc9a9b87..47df3d4e3d462 100644 --- a/src/common/tracing/src/init.rs +++ b/src/common/tracing/src/init.rs @@ -20,6 +20,7 @@ use std::sync::atomic::Ordering; use databend_common_base::base::GlobalInstance; use databend_common_base::runtime::Thread; +use fastrace::collector::SpanContext; use fastrace::prelude::*; use log::LevelFilter; use logforth::filter::EnvFilter; diff --git a/src/meta/api/Cargo.toml b/src/meta/api/Cargo.toml index 19c83d26c4992..9484290166edc 100644 --- a/src/meta/api/Cargo.toml +++ b/src/meta/api/Cargo.toml @@ -32,6 +32,7 @@ serde_json = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } tonic = { workspace = true } +tonic_013 = { package = "tonic", version = "0.13.1" } uuid = { workspace = true } zstd = { workspace = true } diff --git a/src/meta/api/src/error/app_error.rs b/src/meta/api/src/error/app_error.rs index 16410897f65e5..9f49cab0a22ac 100644 --- a/src/meta/api/src/error/app_error.rs +++ b/src/meta/api/src/error/app_error.rs @@ -25,7 +25,7 @@ use databend_meta_client::types::MetaAPIError; use databend_meta_client::types::MetaClientError; use databend_meta_client::types::MetaError; use databend_meta_client::types::MetaNetworkError; -use tonic::Status; +use tonic_013::Status; use super::txn_error::MetaTxnError; diff --git a/src/meta/api/src/kv/pb_api/codec.rs b/src/meta/api/src/kv/pb_api/codec.rs index 9dac0fde02b57..2411801aa3cb8 100644 --- a/src/meta/api/src/kv/pb_api/codec.rs +++ b/src/meta/api/src/kv/pb_api/codec.rs @@ -26,6 +26,11 @@ use crate::kv_pb_api::errors::PbApiReadError; use crate::kv_pb_api::errors::PbDecodeError; use crate::kv_pb_api::errors::PbEncodeError; +#[allow(deprecated)] +fn prost_decode_error(message: impl Into) -> prost::DecodeError { + prost::DecodeError::new(message.into()) +} + /// Encode a `FromToProto` value to protobuf bytes, with transparent zstd compression. /// /// Delegates to [`compress::GLOBAL_ENCODER`]. @@ -36,7 +41,7 @@ pub fn encode_pb(value: &T) -> Result, PbEncodeError> { /// Decode protobuf bytes (possibly zstd-compressed) to a `FromToProto` value. 
pub fn decode_pb(buf: &[u8]) -> Result { let buf = compress::decode_value(buf) - .map_err(|e| PbDecodeError::from(prost::DecodeError::new(e.to_string())))?; + .map_err(|e| PbDecodeError::from(prost_decode_error(e.to_string())))?; let p: T::PB = prost::Message::decode(buf.as_ref()).map_err(PbDecodeError::from)?; T::from_pb(p).map_err(PbDecodeError::from) } diff --git a/src/meta/api/src/kv/pb_api/errors/decode_error.rs b/src/meta/api/src/kv/pb_api/errors/decode_error.rs index f1d9c1bbca518..bb6af9cb94d99 100644 --- a/src/meta/api/src/kv/pb_api/errors/decode_error.rs +++ b/src/meta/api/src/kv/pb_api/errors/decode_error.rs @@ -73,19 +73,25 @@ impl PbDecodeError { } } +#[cfg(test)] +#[allow(deprecated)] +fn prost_decode_error(message: impl Into) -> prost::DecodeError { + prost::DecodeError::new(message.into()) +} + #[cfg(test)] mod tests { use crate::kv_pb_api::errors::PbDecodeError; #[test] fn test_error_message() { - let e = PbDecodeError::from(prost::DecodeError::new("decode error")); + let e = PbDecodeError::from(super::prost_decode_error("decode error")); assert_eq!( "PbDecodeError: failed to decode Protobuf message: decode error", e.to_string() ); - let e = PbDecodeError::from(prost::DecodeError::new("decode error")).with_context("ctx"); + let e = PbDecodeError::from(super::prost_decode_error("decode error")).with_context("ctx"); assert_eq!( "PbDecodeError: failed to decode Protobuf message: decode error; when:(ctx)", e.to_string() diff --git a/src/meta/app/src/data_id.rs b/src/meta/app/src/data_id.rs index 010d8f766ffa2..8c30b9987f4e9 100644 --- a/src/meta/app/src/data_id.rs +++ b/src/meta/app/src/data_id.rs @@ -152,6 +152,11 @@ mod prost_message_impl { use crate::data_id::DataId; use crate::tenant_key::resource::TenantResource; + #[allow(deprecated)] + fn decode_error(message: impl Into) -> DecodeError { + DecodeError::new(message.into()) + } + impl prost::Message for DataId where R: TenantResource + Sync + Send { @@ -165,7 +170,7 @@ mod prost_message_impl { let mut b = [0; 64]; let len = buf.remaining(); if len > b.len() { - return Err(DecodeError::new(format!( + return Err(decode_error(format!( "buffer(len={}) is too large, max={}", len, b.len() @@ -174,7 +179,7 @@ mod prost_message_impl { buf.copy_to_slice(&mut b[..len]); let id: u64 = serde_json::from_slice(&b[..len]) - .map_err(|e| DecodeError::new(format!("failed to decode u64 as json: {}", e)))?; + .map_err(|e| decode_error(format!("failed to decode u64 as json: {}", e)))?; Ok(DataId::new(id)) } diff --git a/src/meta/app/src/primitive.rs b/src/meta/app/src/primitive.rs index 9454a1469cd73..51d4791752687 100644 --- a/src/meta/app/src/primitive.rs +++ b/src/meta/app/src/primitive.rs @@ -122,6 +122,11 @@ mod prost_message_impl { use crate::primitive::Id; + #[allow(deprecated)] + fn decode_error(message: impl Into) -> DecodeError { + DecodeError::new(message.into()) + } + impl prost::Message for Id where T: fmt::Debug + Send + Sync, @@ -137,7 +142,7 @@ mod prost_message_impl { let mut b = [0; 64]; let len = buf.remaining(); if len > b.len() { - return Err(DecodeError::new(format!( + return Err(decode_error(format!( "buffer(len={}) is too large, max={}", len, b.len() @@ -146,7 +151,7 @@ mod prost_message_impl { buf.copy_to_slice(&mut b[..len]); let id: u64 = serde_json::from_slice(&b[..len]) - .map_err(|e| DecodeError::new(format!("failed to decode u64 as json: {}", e)))?; + .map_err(|e| decode_error(format!("failed to decode u64 as json: {}", e)))?; Ok(Id::from(id)) } diff --git a/src/meta/plugins/cache/Cargo.toml 
b/src/meta/plugins/cache/Cargo.toml index e72a1764853e4..e930ca5d42c84 100644 --- a/src/meta/plugins/cache/Cargo.toml +++ b/src/meta/plugins/cache/Cargo.toml @@ -24,7 +24,7 @@ databend-meta-runtime = { workspace = true } futures = { workspace = true } log = { workspace = true } sub-cache = { workspace = true } -tonic = { workspace = true } +tonic_013 = { package = "tonic", version = "0.13.1" } [dev-dependencies] anyhow = { workspace = true } diff --git a/src/meta/plugins/cache/src/meta_client_source.rs b/src/meta/plugins/cache/src/meta_client_source.rs index f5d279246894d..9b5fc9162fa00 100644 --- a/src/meta/plugins/cache/src/meta_client_source.rs +++ b/src/meta/plugins/cache/src/meta_client_source.rs @@ -32,7 +32,7 @@ use sub_cache::errors::Unsupported; use sub_cache::event_stream::Change; use sub_cache::event_stream::Event; use sub_cache::event_stream::EventStream; -use tonic::Status; +use tonic_013::Status; pub struct MetaClientSource { pub(crate) client: Arc>, diff --git a/src/meta/plugins/semaphore/Cargo.toml b/src/meta/plugins/semaphore/Cargo.toml index ab193d50ec5c7..521fa05a91d1d 100644 --- a/src/meta/plugins/semaphore/Cargo.toml +++ b/src/meta/plugins/semaphore/Cargo.toml @@ -21,7 +21,7 @@ log = { workspace = true } seq-marked = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } -tonic = { workspace = true } +tonic_013 = { package = "tonic", version = "0.13.1" } [dev-dependencies] anyhow = { workspace = true } diff --git a/src/meta/plugins/semaphore/src/errors/connection_closed.rs b/src/meta/plugins/semaphore/src/errors/connection_closed.rs index a3fdcafba23ac..a8402e2e571bc 100644 --- a/src/meta/plugins/semaphore/src/errors/connection_closed.rs +++ b/src/meta/plugins/semaphore/src/errors/connection_closed.rs @@ -15,7 +15,7 @@ use std::fmt; use std::io; -use tonic::Status; +use tonic_013::Status; use crate::errors::either::Either; diff --git a/src/meta/plugins/semaphore/src/errors/processor_error.rs b/src/meta/plugins/semaphore/src/errors/processor_error.rs index 8ae9d0686ca6a..ff0291eac7f50 100644 --- a/src/meta/plugins/semaphore/src/errors/processor_error.rs +++ b/src/meta/plugins/semaphore/src/errors/processor_error.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use tonic::Status; +use tonic_013::Status; use crate::errors::AcquirerClosed; use crate::errors::ConnectionClosed; diff --git a/src/meta/plugins/semaphore/src/meta_event_subscriber/subscriber.rs b/src/meta/plugins/semaphore/src/meta_event_subscriber/subscriber.rs index 55056f7ff603e..cfa8da7efa538 100644 --- a/src/meta/plugins/semaphore/src/meta_event_subscriber/subscriber.rs +++ b/src/meta/plugins/semaphore/src/meta_event_subscriber/subscriber.rs @@ -28,7 +28,7 @@ use futures::TryStreamExt; use log::error; use log::info; use log::warn; -use tonic::Status; +use tonic_013::Status; use crate::errors::ConnectionClosed; use crate::errors::ProcessorError; @@ -88,7 +88,7 @@ impl MetaEventSubscriber { pub(crate) async fn new_watch_stream( &self, ctx: impl fmt::Display, - ) -> Result, ConnectionClosed> { + ) -> Result, ConnectionClosed> { let watch = WatchRequest::new(self.left.clone(), Some(self.right.clone())).with_initial_flush(true); diff --git a/src/meta/protos/Cargo.toml b/src/meta/protos/Cargo.toml index 612c1aa62d70b..fc5aa1ac6a142 100644 --- a/src/meta/protos/Cargo.toml +++ b/src/meta/protos/Cargo.toml @@ -11,15 +11,16 @@ num-derive = { workspace = true } num-traits = { workspace = true } prost = { workspace = true } tonic = { workspace = true } +tonic-prost = { workspace = true } [build-dependencies] lenient_semver = { workspace = true } prost-build = { workspace = true } semver = { workspace = true } -tonic-build = { workspace = true } +tonic-prost-build = { workspace = true } [package.metadata.cargo-machete] -ignored = ["num-derive", "num-traits"] +ignored = ["num-derive", "num-traits", "tonic-prost"] [lints] workspace = true diff --git a/src/meta/protos/build.rs b/src/meta/protos/build.rs index 681a13d5ac64b..d83a4b80a66a0 100644 --- a/src/meta/protos/build.rs +++ b/src/meta/protos/build.rs @@ -81,7 +81,7 @@ fn build_proto() -> Result<()> { config.protoc_arg("--experimental_allow_proto3_optional"); } - tonic_build::configure() + tonic_prost_build::configure() .type_attribute("IntervalKind", "#[derive(num_derive::FromPrimitive)]") .type_attribute( "StageFileFormatType", @@ -92,5 +92,5 @@ fn build_proto() -> Result<()> { "#[derive(num_derive::FromPrimitive)]", ) .type_attribute("StageType", "#[derive(num_derive::FromPrimitive)]") - .compile_protos_with_config(config, &proto_defs, &[proto_path]) + .compile_with_config(config, &proto_defs, &[proto_path]) } diff --git a/src/meta/runtime/Cargo.toml b/src/meta/runtime/Cargo.toml index b615c7ff9c8a0..357a376dcfe47 100644 --- a/src/meta/runtime/Cargo.toml +++ b/src/meta/runtime/Cargo.toml @@ -16,7 +16,7 @@ databend-meta = { workspace = true } databend-meta-client = { workspace = true } fastrace = { workspace = true } tokio = { workspace = true } -tonic = { workspace = true } +tonic_013 = { package = "tonic", version = "0.13.1", features = ["transport", "tls-native-roots"] } [lints] workspace = true diff --git a/src/meta/runtime/src/client_bridge.rs b/src/meta/runtime/src/client_bridge.rs index 53f3b8488bac9..b32b2a391736d 100644 --- a/src/meta/runtime/src/client_bridge.rs +++ b/src/meta/runtime/src/client_bridge.rs @@ -28,6 +28,7 @@ use std::time::Duration; use databend_meta::runtime_api as server_rt; use databend_meta_client::runtime_api as client_rt; +use tonic_013::Request; use crate::DatabendMetrics; use crate::DatabendRuntime; @@ -126,17 +127,17 @@ impl client_rt::SpawnApi for DatabendRuntime { ::unlimited_future(fut) } - fn prepare_request(request: tonic::Request) -> tonic::Request { + fn prepare_request(request: Request) -> Request { 
::prepare_request(request) } fn trace_request<'a, T, F, Fut, R>( name: &'static str, - request: tonic::Request, + request: Request, f: F, ) -> client_rt::BoxFuture<'a, R> where - F: FnOnce(tonic::Request) -> Fut, + F: FnOnce(Request) -> Fut, Fut: Future + Send + 'a, R: Send + 'a, { diff --git a/src/meta/runtime/src/lib.rs b/src/meta/runtime/src/lib.rs index 4d0247c1ffb39..fc6fec4371397 100644 --- a/src/meta/runtime/src/lib.rs +++ b/src/meta/runtime/src/lib.rs @@ -25,13 +25,11 @@ mod metrics; use std::future::Future; use std::net::IpAddr; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use databend_common_base::runtime; -use databend_common_grpc::ConnectionFactory; -use databend_common_grpc::GrpcConnectionError; -use databend_common_grpc::RpcClientTlsConfig; use databend_meta::runtime_api::BoxFuture; use databend_meta::runtime_api::Channel; use databend_meta::runtime_api::ChannelError; @@ -40,24 +38,13 @@ use databend_meta::runtime_api::RuntimeApi; use databend_meta::runtime_api::SpawnApi; use databend_meta::runtime_api::TlsConfig; use databend_meta::runtime_api::TrackingData; +use fastrace::collector::SpanContext; pub use metrics::DatabendMetrics; +use tonic_013::transport::Certificate; +use tonic_013::transport::ClientTlsConfig; +use tonic_013::transport::Endpoint; -fn convert_grpc_error(e: GrpcConnectionError) -> ChannelError { - match e { - GrpcConnectionError::InvalidUri { uri, source } => ChannelError::InvalidUri { - uri, - message: source.to_string(), - }, - GrpcConnectionError::TLSConfigError { action, source } => ChannelError::TlsConfig { - action, - message: source.to_string(), - }, - GrpcConnectionError::CannotConnect { uri, source } => ChannelError::CannotConnect { - uri, - message: source.to_string(), - }, - } -} +const HEADER_TRACE_PARENT: &str = "traceparent"; /// Runtime adapter that wraps `databend_common_base::Runtime`. 
/// @@ -130,16 +117,23 @@ impl SpawnApi for DatabendRuntime { Box::pin(runtime::UnlimitedFuture::create(fut)) } - fn prepare_request(request: tonic::Request) -> tonic::Request { - use std::str::FromStr; - - // Inject tracing span context - let mut req = databend_common_tracing::inject_span_to_tonic_request(request); + fn prepare_request(request: tonic_013::Request) -> tonic_013::Request { + let mut req = request; + + if let Some(current) = SpanContext::current_local_parent() { + let key = tonic_013::metadata::MetadataKey::from_bytes(HEADER_TRACE_PARENT.as_bytes()) + .unwrap(); + let val = tonic_013::metadata::AsciiMetadataValue::try_from( + ¤t.encode_w3c_traceparent(), + ) + .unwrap(); + req.metadata_mut().insert(key, val); + } // Inject query ID if available if let Some(query_id) = runtime::ThreadTracker::query_id() { - let key = tonic::metadata::AsciiMetadataKey::from_str("QueryID"); - let value = tonic::metadata::AsciiMetadataValue::from_str(query_id); + let key = tonic_013::metadata::AsciiMetadataKey::from_str("QueryID"); + let value = tonic_013::metadata::AsciiMetadataValue::from_str(query_id); if let Some((key, value)) = key.ok().zip(value.ok()) { req.metadata_mut().insert(key, value); @@ -151,17 +145,26 @@ impl SpawnApi for DatabendRuntime { fn trace_request<'a, T, F, Fut, R>( name: &'static str, - request: tonic::Request, + request: tonic_013::Request, f: F, ) -> BoxFuture<'a, R> where - F: FnOnce(tonic::Request) -> Fut, + F: FnOnce(tonic_013::Request) -> Fut, Fut: Future + Send + 'a, R: Send + 'a, { use fastrace::prelude::*; - let span = databend_common_tracing::start_trace_for_remote_request(name, &request); + let span_context = request + .metadata() + .get(HEADER_TRACE_PARENT) + .and_then(|traceparent| traceparent.to_str().ok()) + .and_then(SpanContext::decode_w3c_traceparent); + let span = if let Some(span_context) = span_context { + Span::root(name, span_context) + } else { + Span::noop() + }; Box::pin(f(request).in_span(span)) } @@ -177,26 +180,64 @@ impl SpawnApi for DatabendRuntime { tls_config: Option, ) -> BoxFuture<'static, Result> { Box::pin(async move { - let grpc_tls = tls_config.map(|c| RpcClientTlsConfig { - rpc_tls_server_root_ca_cert: c.root_ca_cert_path, - domain_name: c.domain_name, - }); + let uri = format!( + "{}://{}", + if tls_config.is_some() { + "https" + } else { + "http" + }, + addr + ); + let mut endpoint = + Endpoint::from_shared(uri.clone()).map_err(|e| ChannelError::InvalidUri { + uri: uri.clone(), + message: e.to_string(), + })?; + + if let Some(tls) = tls_config { + let cert = + std::fs::read(&tls.root_ca_cert_path).map_err(|e| ChannelError::TlsConfig { + action: "loading".to_string(), + message: e.to_string(), + })?; + let tls = ClientTlsConfig::new() + .domain_name(tls.domain_name) + .ca_certificate(Certificate::from_pem(cert)); + endpoint = endpoint + .tls_config(tls) + .map_err(|e| ChannelError::TlsConfig { + action: "building".to_string(), + message: e.to_string(), + })?; + } + + if let Some(timeout) = timeout { + endpoint = endpoint.timeout(timeout); + endpoint = endpoint.connect_timeout(timeout); + } - ConnectionFactory::create_rpc_channel(&addr, timeout, grpc_tls, None) + endpoint + .connect() .await - .map_err(convert_grpc_error) + .map_err(|e| ChannelError::CannotConnect { + uri, + message: e.to_string(), + }) }) } fn resolve(hostname: &str) -> BoxFuture<'static, std::io::Result>> { let hostname = hostname.to_string(); Box::pin(async move { - let resolver = databend_common_grpc::DNSResolver::instance() - .map_err(|e| 
std::io::Error::other(e.to_string()))?; - resolver + let resolver: Arc = + databend_common_grpc::DNSResolver::instance() + .map_err(|e| std::io::Error::other(e.to_string()))?; + let addrs: Vec = resolver .resolve(&hostname) .await - .map_err(|e| std::io::Error::other(e.to_string())) + .map_err(|e| std::io::Error::other(e.to_string()))?; + Ok(addrs) }) } diff --git a/src/query/catalog/Cargo.toml b/src/query/catalog/Cargo.toml index 3669c548ec5c1..4386cac87d8f0 100644 --- a/src/query/catalog/Cargo.toml +++ b/src/query/catalog/Cargo.toml @@ -40,7 +40,6 @@ roaring = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } sha2 = { workspace = true } -thrift = { workspace = true } tokio = { workspace = true } typetag = { workspace = true } uuid = { workspace = true } diff --git a/src/query/catalog/src/plan/datasource/datasource_info/parquet.rs b/src/query/catalog/src/plan/datasource/datasource_info/parquet.rs index ee0ac558591b9..976efd208677d 100644 --- a/src/query/catalog/src/plan/datasource/datasource_info/parquet.rs +++ b/src/query/catalog/src/plan/datasource/datasource_info/parquet.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::collections::HashMap; -use std::io::Cursor; use std::sync::Arc; use arrow_schema::Schema as ArrowSchema; @@ -25,19 +24,14 @@ use databend_common_meta_app::schema::TableInfo; use databend_common_storage::StageFileInfo; use databend_common_storage::StageFilesInfo; use databend_storages_common_table_meta::meta::ColumnStatistics; +use parquet::arrow::ArrowSchemaConverter; use parquet::file::metadata::ParquetMetaData; -use parquet::format::SchemaElement; -use parquet::schema::types; +use parquet::schema::parser::parse_message_type; +use parquet::schema::printer::print_schema; use parquet::schema::types::SchemaDescPtr; use parquet::schema::types::SchemaDescriptor; -use parquet::thrift::TSerializable; use serde::Deserialize; -use thrift::protocol::TCompactInputProtocol; -use thrift::protocol::TCompactOutputProtocol; -use thrift::protocol::TInputProtocol; -use thrift::protocol::TListIdentifier; -use thrift::protocol::TOutputProtocol; -use thrift::protocol::TType; +use serde::Serialize; use crate::plan::datasource::datasource_info::parquet_read_options::ParquetReadOptions; @@ -56,7 +50,7 @@ pub struct FullParquetMeta { pub row_group_level_stats: Option>>, } -#[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] +#[derive(Clone, Debug)] pub struct ParquetTableInfo { pub read_options: ParquetReadOptions, pub stage_info: StageInfo, @@ -64,19 +58,14 @@ pub struct ParquetTableInfo { pub table_info: TableInfo, pub arrow_schema: ArrowSchema, - #[serde(deserialize_with = "deser_schema_desc")] - #[serde(serialize_with = "ser_schema_desc")] pub schema_descr: SchemaDescPtr, pub files_to_read: Option>, pub schema_from: String, pub compression_ratio: f64, pub leaf_fields: Arc>, - #[serde(skip)] pub need_stats_provider: bool, - #[serde(skip)] pub max_threads: usize, - #[serde(skip)] pub max_memory_usage: u64, } @@ -90,40 +79,82 @@ impl ParquetTableInfo { } } -fn deser_schema_desc<'de, D>(deserializer: D) -> Result -where D: serde::Deserializer<'de> { - let bytes: Vec = Deserialize::deserialize(deserializer)?; - let cursor = Cursor::new(bytes); - let mut i_prot = TCompactInputProtocol::new(cursor); - let list_ident = i_prot.read_list_begin().unwrap(); - let mut schema_elements: Vec = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - let list_elem = SchemaElement::read_from_in_protocol(&mut i_prot).unwrap(); - 
schema_elements.push(list_elem); +#[derive(Serialize, Deserialize)] +struct ParquetTableInfoSerde { + read_options: ParquetReadOptions, + stage_info: StageInfo, + files_info: StageFilesInfo, + table_info: TableInfo, + arrow_schema: ArrowSchema, + schema_descr_bytes: Vec, + schema_descr_root: String, + files_to_read: Option>, + schema_from: String, + compression_ratio: f64, + leaf_fields: Arc>, +} + +impl Serialize for ParquetTableInfo { + fn serialize(&self, serializer: S) -> Result + where S: serde::Serializer { + ParquetTableInfoSerde { + read_options: self.read_options, + stage_info: self.stage_info.clone(), + files_info: self.files_info.clone(), + table_info: self.table_info.clone(), + arrow_schema: self.arrow_schema.clone(), + schema_descr_bytes: schema_to_bytes(&self.schema_descr), + schema_descr_root: self.schema_descr.root_schema().name().to_string(), + files_to_read: self.files_to_read.clone(), + schema_from: self.schema_from.clone(), + compression_ratio: self.compression_ratio, + leaf_fields: self.leaf_fields.clone(), + } + .serialize(serializer) } - i_prot.read_list_end().unwrap(); - let schema = types::from_thrift(&schema_elements).unwrap(); - Ok(Arc::new(SchemaDescriptor::new(schema))) } -fn ser_schema_desc(schema: &SchemaDescPtr, serializer: S) -> Result -where S: serde::Serializer { - let mut transport = Vec::::new(); - let mut o_prot = TCompactOutputProtocol::new(&mut transport); - let schema_elements = types::to_thrift(schema.root_schema()).map_err(|e| { - serde::ser::Error::custom(format!("Failed to convert schema to thrift: {:?}", e)) - })?; - o_prot - .write_list_begin(&TListIdentifier::new( - TType::Struct, - schema_elements.len() as i32, - )) - .unwrap(); - for e in schema_elements { - e.write_to_out_protocol(&mut o_prot).unwrap(); +impl<'de> Deserialize<'de> for ParquetTableInfo { + fn deserialize(deserializer: D) -> Result + where D: serde::Deserializer<'de> { + let helper = ParquetTableInfoSerde::deserialize(deserializer)?; + let schema_descr = schema_from_bytes(&helper.schema_descr_bytes).or_else(|_| { + ArrowSchemaConverter::new() + .schema_root(&helper.schema_descr_root) + .convert(&helper.arrow_schema) + .map(Arc::new) + }); + let schema_descr = schema_descr.map_err(|e| serde::de::Error::custom(e.to_string()))?; + + Ok(Self { + read_options: helper.read_options, + stage_info: helper.stage_info, + files_info: helper.files_info, + table_info: helper.table_info, + arrow_schema: helper.arrow_schema, + schema_descr, + files_to_read: helper.files_to_read, + schema_from: helper.schema_from, + compression_ratio: helper.compression_ratio, + leaf_fields: helper.leaf_fields, + need_stats_provider: false, + max_threads: 0, + max_memory_usage: 0, + }) } - o_prot.write_list_end().unwrap(); - serializer.serialize_bytes(&transport) +} + +fn schema_from_bytes(bytes: &[u8]) -> parquet::errors::Result { + let schema_string = String::from_utf8(bytes.to_vec()) + .map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + let schema = parse_message_type(&schema_string)?; + Ok(Arc::new(SchemaDescriptor::new(Arc::new(schema)))) +} + +fn schema_to_bytes(schema: &SchemaDescPtr) -> Vec { + let mut out = Vec::new(); + print_schema(&mut out, schema.root_schema()); + out } #[cfg(test)] @@ -132,10 +163,13 @@ mod tests { use arrow_schema::Schema as ArrowSchema; use databend_common_storage::StageFilesInfo; + use parquet::arrow::parquet_to_arrow_schema; use parquet::basic::ConvertedType; use parquet::basic::Repetition; use parquet::basic::Type as PhysicalType; use 
parquet::errors::ParquetError; + use parquet::schema::parser::parse_message_type; + use parquet::schema::printer::print_schema; use parquet::schema::types::SchemaDescPtr; use parquet::schema::types::SchemaDescriptor; use parquet::schema::types::Type; @@ -187,6 +221,17 @@ mod tests { Ok(Arc::new(SchemaDescriptor::new(Arc::new(schema)))) } + fn make_arrow_compatible_desc() -> Result { + let schema = parse_message_type( + " + message stage_file { + OPTIONAL INT64 number (UINT_64); + } + ", + )?; + Ok(Arc::new(SchemaDescriptor::new(Arc::new(schema)))) + } + #[test] fn test_serde() { let schema_descr = make_desc().unwrap(); @@ -214,6 +259,60 @@ mod tests { }; let s = serde_json::to_string(&info).unwrap(); let info = serde_json::from_str::(&s).unwrap(); - assert_eq!(info.schema_descr, schema_descr) + + let mut original = Vec::new(); + print_schema(&mut original, schema_descr.root_schema()); + let mut roundtrip = Vec::new(); + print_schema(&mut roundtrip, info.schema_descr.root_schema()); + + assert_eq!( + schema_descr.root_schema().name(), + info.schema_descr.root_schema().name() + ); + assert_eq!(original, roundtrip) + } + + #[test] + fn test_serde_falls_back_to_arrow_schema() { + let schema_descr = make_arrow_compatible_desc().unwrap(); + let arrow_schema = parquet_to_arrow_schema(&schema_descr, None).unwrap(); + let info = ParquetTableInfo { + schema_descr: schema_descr.clone(), + read_options: Default::default(), + stage_info: Default::default(), + files_info: StageFilesInfo { + path: "".to_string(), + files: None, + pattern: None, + }, + table_info: Default::default(), + leaf_fields: Arc::new(vec![]), + arrow_schema, + files_to_read: None, + schema_from: "".to_string(), + compression_ratio: 0.0, + need_stats_provider: false, + max_threads: 1, + max_memory_usage: 10000, + }; + + let mut json = serde_json::to_value(&info).unwrap(); + json["schema_descr_bytes"] = serde_json::json!(Vec::::from("invalid schema")); + + let info = serde_json::from_value::(json).unwrap(); + + assert_eq!( + schema_descr.root_schema().name(), + info.schema_descr.root_schema().name() + ); + assert_eq!(schema_descr.num_columns(), info.schema_descr.num_columns()); + assert_eq!( + schema_descr.column(0).name(), + info.schema_descr.column(0).name() + ); + assert_eq!( + schema_descr.column(0).physical_type(), + info.schema_descr.column(0).physical_type() + ); } } diff --git a/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs b/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs index e5301e19d7d72..19900c34f241a 100644 --- a/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs +++ b/src/query/service/src/servers/flight/v1/exchange/serde/exchange_serializer.rs @@ -21,6 +21,7 @@ use arrow_array::RecordBatchOptions; use arrow_flight::FlightData; use arrow_flight::SchemaAsIpc; use arrow_ipc::CompressionType; +use arrow_ipc::writer::CompressionContext; use arrow_ipc::writer::DictionaryTracker; use arrow_ipc::writer::IpcDataGenerator; use arrow_ipc::writer::IpcWriteOptions; @@ -240,10 +241,15 @@ pub fn batches_to_flight_data_with_options( let data_gen = IpcDataGenerator::default(); let mut dictionary_tracker = DictionaryTracker::new(false); + let mut compression_context = CompressionContext::default(); for batch in batches.iter() { - let (encoded_dictionaries, encoded_batch) = - data_gen.encoded_batch(batch, &mut dictionary_tracker, options)?; + let (encoded_dictionaries, encoded_batch) = data_gen.encode( + batch, + &mut dictionary_tracker, + options, 
+ &mut compression_context, + )?; dictionaries.extend(encoded_dictionaries.into_iter().map(Into::into)); flight_data.push(encoded_batch.into()); diff --git a/src/query/service/src/servers/flight/v1/network/outbound_channel.rs b/src/query/service/src/servers/flight/v1/network/outbound_channel.rs index c576df9339f8b..2e0975755078f 100644 --- a/src/query/service/src/servers/flight/v1/network/outbound_channel.rs +++ b/src/query/service/src/servers/flight/v1/network/outbound_channel.rs @@ -21,6 +21,7 @@ use arrow_array::RecordBatchOptions; use arrow_flight::FlightData; use arrow_flight::FlightDescriptor; use arrow_ipc::CompressionType; +use arrow_ipc::writer::CompressionContext; use arrow_ipc::writer::DictionaryTracker; use arrow_ipc::writer::IpcDataGenerator; use arrow_ipc::writer::IpcWriteOptions; @@ -69,8 +70,13 @@ fn encode_batch( ) -> Result<(Vec, FlightData)> { let data_gen = IpcDataGenerator::default(); let mut dictionary_tracker = DictionaryTracker::new(false); - let (encoded_dictionaries, encoded_batch) = - data_gen.encoded_batch(batch, &mut dictionary_tracker, ipc_options)?; + let mut compression_context = CompressionContext::default(); + let (encoded_dictionaries, encoded_batch) = data_gen.encode( + batch, + &mut dictionary_tracker, + ipc_options, + &mut compression_context, + )?; let dictionaries: Vec = encoded_dictionaries.into_iter().map(Into::into).collect(); let batch_data: FlightData = encoded_batch.into(); Ok((dictionaries, batch_data)) diff --git a/src/query/service/src/servers/flight_sql/flight_sql_service/query.rs b/src/query/service/src/servers/flight_sql/flight_sql_service/query.rs index e3e27056fdabe..e1be8882bf63f 100644 --- a/src/query/service/src/servers/flight_sql/flight_sql_service/query.rs +++ b/src/query/service/src/servers/flight_sql/flight_sql_service/query.rs @@ -114,9 +114,15 @@ impl FlightSqlServiceImpl { .to_record_batch_with_dataschema(data_schema) .map_err(|e| ErrorCode::Internal(format!("{e:?}")))?; let mut dictionary_tracker = writer::DictionaryTracker::new(false); + let mut compression_context = writer::CompressionContext::default(); let (_encoded_dictionaries, encoded_batch) = data_gen - .encoded_batch(&batch, &mut dictionary_tracker, options) + .encode( + &batch, + &mut dictionary_tracker, + options, + &mut compression_context, + ) .map_err(|e| ErrorCode::Internal(format!("{e:?}")))?; Ok(encoded_batch.into()) diff --git a/src/query/service/src/spillers/row_group_encoder.rs b/src/query/service/src/spillers/row_group_encoder.rs index 6feeac15750c1..24f2d860007dc 100644 --- a/src/query/service/src/spillers/row_group_encoder.rs +++ b/src/query/service/src/spillers/row_group_encoder.rs @@ -40,8 +40,8 @@ use parquet::arrow::FieldLevels; use parquet::arrow::ProjectionMask; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; use parquet::arrow::arrow_writer::ArrowColumnWriter; +use parquet::arrow::arrow_writer::ArrowRowGroupWriterFactory; use parquet::arrow::arrow_writer::compute_leaves; -use parquet::arrow::arrow_writer::get_column_writers; use parquet::arrow::parquet_to_arrow_field_levels; use parquet::errors; use parquet::file::metadata::RowGroupMetaData; @@ -86,7 +86,7 @@ impl Properties { } pub fn new_encoder(&self) -> RowGroupEncoder { - RowGroupEncoder::new(&self.writer_props, self.schema.clone(), &self.parquet) + RowGroupEncoder::new(&self.writer_props, self.schema.clone(), &self.parquet, 0) } } @@ -97,8 +97,19 @@ pub struct RowGroupEncoder { } impl RowGroupEncoder { - fn new(props: &WriterPropertiesPtr, schema: Arc, parquet: 
&SchemaDescriptor) -> Self { - let writers = get_column_writers(parquet, props, &schema).unwrap(); + fn new( + props: &WriterPropertiesPtr, + schema: Arc, + parquet: &SchemaDescriptor, + row_group_index: usize, + ) -> Self { + let file_writer = + SerializedFileWriter::new(Vec::new(), parquet.root_schema_ptr(), props.clone()) + .unwrap(); + let row_group_factory = ArrowRowGroupWriterFactory::new(&file_writer, schema.clone()); + let writers = row_group_factory + .create_column_writers(row_group_index) + .unwrap(); Self { schema, props: props.clone(), @@ -204,6 +215,7 @@ impl FileWriter { self.writer.properties(), self.schema.clone(), self.writer.schema_descr(), + self.row_groups.len(), ) } @@ -235,12 +247,13 @@ impl FileWriter { let file_metadata = writer.finish()?; let tp = writer.schema_descr().root_schema_ptr(); let schema_descr = Arc::new(SchemaDescriptor::new(tp)); + let file_metadata = file_metadata.file_metadata(); let metadata = parquet::file::metadata::FileMetaData::new( - file_metadata.version, - file_metadata.num_rows, - file_metadata.created_by.clone(), - file_metadata.key_value_metadata.clone(), + file_metadata.version(), + file_metadata.num_rows(), + file_metadata.created_by().map(ToString::to_string), + file_metadata.key_value_metadata().cloned(), schema_descr, None, ); diff --git a/src/query/service/src/spillers/serialize.rs b/src/query/service/src/spillers/serialize.rs index 386bb731c60d9..e9c21602f8469 100644 --- a/src/query/service/src/spillers/serialize.rs +++ b/src/query/service/src/spillers/serialize.rs @@ -38,10 +38,10 @@ use opendal::Buffer; use parquet::arrow::ArrowWriter; use parquet::arrow::arrow_reader::ParquetRecordBatchReader; use parquet::basic::Compression; +use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::EnabledStatistics; use parquet::file::properties::WriterProperties; use parquet::file::reader::ChunkReader; -use parquet::format::FileMetaData; #[derive(Debug, Clone)] pub enum Layout { @@ -171,7 +171,7 @@ fn bare_blocks_from_parquet(data: R) -> Result( blocks: Vec, write_buffer: W, -) -> Result { +) -> Result { assert!(!blocks.is_empty()); let data_schema = blocks.first().unwrap().infer_schema(); diff --git a/src/query/service/src/test_kits/block_writer.rs b/src/query/service/src/test_kits/block_writer.rs index 95798364c68d8..d9d4398c03510 100644 --- a/src/query/service/src/test_kits/block_writer.rs +++ b/src/query/service/src/test_kits/block_writer.rs @@ -41,7 +41,7 @@ use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::meta::encode_column_hll; use databend_storages_common_table_meta::table::TableCompression; use opendal::Operator; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; use uuid::Uuid; use super::old_version_generator; @@ -75,7 +75,7 @@ impl<'a> BlockWriter<'a> { block: DataBlock, col_stats: StatisticsOfColumns, cluster_stats: Option, - ) -> Result<(BlockMeta, Option, RawBlockHLL)> { + ) -> Result<(BlockMeta, Option, RawBlockHLL)> { let (location, block_id) = if !self.is_greater_than_v5 { let location_generator = old_version_generator::TableMetaLocationGenerator::with_prefix( self.location_generator.prefix().to_string(), @@ -140,7 +140,7 @@ impl<'a> BlockWriter<'a> { schema: TableSchemaRef, block: &DataBlock, block_id: Uuid, - ) -> Result<(u64, Option, Option)> { + ) -> Result<(u64, Option, Option)> { let location = self .location_generator .block_bloom_index_location(&block_id); diff --git 
a/src/query/service/tests/it/parquet_rs/data.rs b/src/query/service/tests/it/parquet_rs/data.rs index 3a6f4a4c6cd39..cd6f0f25fb29d 100644 --- a/src/query/service/tests/it/parquet_rs/data.rs +++ b/src/query/service/tests/it/parquet_rs/data.rs @@ -348,7 +348,7 @@ pub async fn make_test_file_rg(scenario: Scenario) -> (NamedTempFile, SchemaRef) .expect("tempfile creation"); let props = WriterProperties::builder() - .set_max_row_group_size(5) + .set_max_row_group_row_count(Some(5)) .build(); let batches = create_data_batch(scenario); diff --git a/src/query/service/tests/it/parquet_rs/prune_pages.rs b/src/query/service/tests/it/parquet_rs/prune_pages.rs index 5cb25d57841c6..ee7745333b86e 100644 --- a/src/query/service/tests/it/parquet_rs/prune_pages.rs +++ b/src/query/service/tests/it/parquet_rs/prune_pages.rs @@ -22,6 +22,7 @@ use parquet::arrow::arrow_reader::ArrowReaderMetadata; use parquet::arrow::arrow_reader::ArrowReaderOptions; use parquet::arrow::arrow_reader::RowSelection; use parquet::arrow::arrow_reader::RowSelector; +use parquet::file::metadata::PageIndexPolicy; use crate::parquet_rs::data::Scenario; use crate::parquet_rs::data::make_test_file_page; @@ -41,7 +42,7 @@ async fn test_batch(batches: &[(Scenario, &str, RowSelection)]) { let metadata = ArrowReaderMetadata::load( file.as_file(), ArrowReaderOptions::new() - .with_page_index(true) + .with_page_index_policy(PageIndexPolicy::from(true)) .with_skip_arrow_metadata(true), ) .unwrap(); diff --git a/src/query/service/tests/it/servers/flight_sql/flight_sql_handler.rs b/src/query/service/tests/it/servers/flight_sql/flight_sql_handler.rs index 58ae8d2efa0be..d0397e2cb094d 100644 --- a/src/query/service/tests/it/servers/flight_sql/flight_sql_handler.rs +++ b/src/query/service/tests/it/servers/flight_sql/flight_sql_handler.rs @@ -19,9 +19,9 @@ use std::io::Write; use arrow_array::RecordBatch; use arrow_cast::pretty::pretty_format_batches; +use arrow_flight::error::FlightError; use arrow_flight::flight_service_server::FlightServiceServer; use arrow_flight::sql::client::FlightSqlServiceClient; -use arrow_schema::ArrowError; use databend_common_base::runtime::Runtime; use databend_common_config::InnerConfig; use databend_common_config::UserAuthConfig; @@ -66,7 +66,7 @@ async fn client_with_uds(path: String) -> FlightSqlServiceClient { async fn run_query( client: &mut FlightSqlServiceClient, sql: &str, -) -> std::result::Result { +) -> std::result::Result { let mut stmt = client.prepare(sql.to_string(), None).await?; let res = if stmt.dataset_schema()?.fields.is_empty() { let affected_rows = client.execute_update(sql.to_string(), None).await?; @@ -76,7 +76,9 @@ async fn run_query( let ticket = flight_info.endpoint[0].ticket.as_ref().unwrap().clone(); let flight_data = client.do_get(ticket).await?; let batches: Vec = flight_data.try_collect().await.unwrap(); - pretty_format_batches(batches.as_slice())?.to_string() + pretty_format_batches(batches.as_slice()) + .map_err(FlightError::from)? 
+ .to_string() }; Ok(res) } diff --git a/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs b/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs index b3ca71c7de0e1..87f9ccb2af340 100644 --- a/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs +++ b/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs @@ -35,6 +35,7 @@ use databend_query::test_kits::*; use databend_storages_common_cache::CacheAccessor; use databend_storages_common_cache::CacheValue; use databend_storages_common_cache::InMemoryLruCache; +use databend_storages_common_cache::ParquetMetaData; use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ColumnMeta; use databend_storages_common_table_meta::meta::ColumnStatistics; @@ -46,7 +47,6 @@ use databend_storages_common_table_meta::meta::SingleColumnMeta; use databend_storages_common_table_meta::meta::Statistics; use databend_storages_common_table_meta::meta::Versioned; use opendal::Operator; -use parquet::format::FileMetaData; use sysinfo::System; use sysinfo::get_current_pid; use uuid::Uuid; @@ -384,7 +384,7 @@ where T: Clone + Into> { } #[allow(dead_code)] -async fn setup() -> databend_common_exception::Result { +async fn setup() -> databend_common_exception::Result { let fields = (0..23) .map(|_| TableField::new("id", TableDataType::Number(NumberDataType::Int32))) .collect::>(); diff --git a/src/query/storages/common/blocks/src/parquet_rs.rs b/src/query/storages/common/blocks/src/parquet_rs.rs index 7f4a6ac0a5079..cd3cfb5959c96 100644 --- a/src/query/storages/common/blocks/src/parquet_rs.rs +++ b/src/query/storages/common/blocks/src/parquet_rs.rs @@ -26,10 +26,10 @@ use databend_storages_common_table_meta::table::TableCompression; use parquet::arrow::ArrowWriter; use parquet::basic::Encoding; use parquet::file::metadata::KeyValue; +use parquet::file::metadata::ParquetMetaData; use parquet::file::properties::EnabledStatistics; use parquet::file::properties::WriterProperties; use parquet::file::properties::WriterVersion; -use parquet::format::FileMetaData; use parquet::schema::types::ColumnPath; /// Disable dictionary encoding once the NDV-to-row ratio is greater than this threshold. 
@@ -50,7 +50,7 @@ pub fn blocks_to_parquet( compression: TableCompression, enable_dictionary: bool, metadata: Option>, -) -> Result { +) -> Result { blocks_to_parquet_with_stats( table_schema, blocks, @@ -80,7 +80,7 @@ pub fn blocks_to_parquet_with_stats( enable_dictionary: bool, metadata: Option>, column_stats: Option<&StatisticsOfColumns>, -) -> Result { +) -> Result { assert!(!blocks.is_empty()); // Writer properties cannot be tweaked after ArrowWriter creation, so we mirror the behavior of @@ -178,7 +178,7 @@ pub fn build_parquet_writer_properties( let mut builder = WriterProperties::builder() .set_compression(compression.into()) // use `usize::MAX` to effectively limit the number of row groups to 1 - .set_max_row_group_size(usize::MAX) + .set_max_row_group_row_count(Some(usize::MAX)) .set_encoding(Encoding::PLAIN) .set_statistics_enabled(EnabledStatistics::None) .set_bloom_filter_enabled(false) diff --git a/src/query/storages/common/index/src/bloom_index.rs b/src/query/storages/common/index/src/bloom_index.rs index b984be546fe50..48f2bbee33e5e 100644 --- a/src/query/storages/common/index/src/bloom_index.rs +++ b/src/query/storages/common/index/src/bloom_index.rs @@ -76,11 +76,12 @@ use databend_storages_common_table_meta::meta::SingleColumnMeta; use databend_storages_common_table_meta::meta::StatisticsOfColumns; use databend_storages_common_table_meta::meta::Versioned; use jsonb::RawJsonb; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; use serde::Deserialize; use serde::Serialize; use super::eliminate_cast::is_injective_cast; +use super::index_common::index_columns_from_parquet_meta; use crate::Index; use crate::eliminate_cast::cast_const; use crate::filters::BinaryFuse32Builder; @@ -158,44 +159,13 @@ impl TryFrom for BloomIndexMeta { } } -impl TryFrom for BloomIndexMeta { +impl TryFrom for BloomIndexMeta { type Error = ErrorCode; - fn try_from(mut meta: FileMetaData) -> std::result::Result { - let rg = meta.row_groups.remove(0); - let mut col_metas = Vec::with_capacity(rg.columns.len()); - for x in &rg.columns { - match &x.meta_data { - Some(chunk_meta) => { - let col_start = - if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { - dict_page_offset - } else { - chunk_meta.data_page_offset - }; - let col_len = chunk_meta.total_compressed_size; - assert!( - col_start >= 0 && col_len >= 0, - "column start and length should not be negative" - ); - let num_values = chunk_meta.num_values as u64; - let res = SingleColumnMeta { - offset: col_start as u64, - len: col_len as u64, - num_values, - }; - let column_name = chunk_meta.path_in_schema[0].to_owned(); - col_metas.push((column_name, res)); - } - None => { - panic!( - "expecting chunk meta data while converting ThriftFileMetaData to BloomIndexMeta" - ) - } - } - } - col_metas.shrink_to_fit(); - Ok(Self { columns: col_metas }) + fn try_from(meta: ParquetMetaData) -> std::result::Result { + Ok(Self { + columns: index_columns_from_parquet_meta(&meta), + }) } } diff --git a/src/query/storages/common/index/src/index_common.rs b/src/query/storages/common/index/src/index_common.rs index e936323371281..6888d57aa766c 100644 --- a/src/query/storages/common/index/src/index_common.rs +++ b/src/query/storages/common/index/src/index_common.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; use bytes::Bytes; use databend_common_exception::ErrorCode; use databend_storages_common_table_meta::meta::SingleColumnMeta; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; 
#[derive(Clone, serde::Serialize, serde::Deserialize)] pub struct IndexMeta { @@ -114,56 +114,48 @@ macro_rules! impl_bincode_codec_for_file { impl_bincode_codec_for_meta!(IndexMeta, "index"); impl_bincode_codec_for_file!(IndexFile, "index"); -impl TryFrom for IndexMeta { - type Error = ErrorCode; +pub(crate) fn index_columns_from_parquet_meta( + meta: &ParquetMetaData, +) -> Vec<(String, SingleColumnMeta)> { + let row_group = &meta.row_groups()[0]; + let mut col_metas = Vec::with_capacity(row_group.columns().len()); + for chunk_meta in row_group.columns() { + let (offset, len) = chunk_meta.byte_range(); + let num_values = chunk_meta.num_values() as u64; + let column_name = chunk_meta.column_path().parts()[0].to_owned(); + col_metas.push((column_name, SingleColumnMeta { + offset, + len, + num_values, + })); + } + col_metas.shrink_to_fit(); + col_metas +} - fn try_from(mut meta: FileMetaData) -> std::result::Result { - let rg = meta.row_groups.remove(0); - let mut col_metas = Vec::with_capacity(rg.columns.len()); - for x in &rg.columns { - match &x.meta_data { - Some(chunk_meta) => { - let col_start = - if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { - dict_page_offset - } else { - chunk_meta.data_page_offset - }; - let col_len = chunk_meta.total_compressed_size; - assert!( - col_start >= 0 && col_len >= 0, - "column start and length should not be negative" - ); - let num_values = chunk_meta.num_values as u64; - let res = SingleColumnMeta { - offset: col_start as u64, - len: col_len as u64, - num_values, - }; - let column_name = chunk_meta.path_in_schema[0].to_owned(); - col_metas.push((column_name, res)); - } - None => { - panic!( - "expecting chunk meta data while converting ThriftFileMetaData to IndexMeta" - ) - } - } - } - col_metas.shrink_to_fit(); - let mut metadata = BTreeMap::new(); - if let Some(key_value_metadata) = meta.key_value_metadata { - for key_value in &key_value_metadata { - if key_value.key == "ARROW:schema" || key_value.value.is_none() { - continue; - } - metadata.insert(key_value.key.clone(), key_value.value.clone().unwrap()); +pub(crate) fn user_metadata_from_parquet_meta(meta: &ParquetMetaData) -> BTreeMap { + let mut metadata = BTreeMap::new(); + if let Some(key_value_metadata) = meta.file_metadata().key_value_metadata() { + for key_value in key_value_metadata { + let Some(value) = &key_value.value else { + continue; + }; + if key_value.key == "ARROW:schema" { + continue; } + metadata.insert(key_value.key.clone(), value.clone()); } + } + metadata +} + +impl TryFrom for IndexMeta { + type Error = ErrorCode; + fn try_from(meta: ParquetMetaData) -> std::result::Result { Ok(IndexMeta { - columns: col_metas, - metadata, + columns: index_columns_from_parquet_meta(&meta), + metadata: user_metadata_from_parquet_meta(&meta), }) } } diff --git a/src/query/storages/common/index/src/inverted_index.rs b/src/query/storages/common/index/src/inverted_index.rs index bdc3ba69e0703..dcc8388011b95 100644 --- a/src/query/storages/common/index/src/inverted_index.rs +++ b/src/query/storages/common/index/src/inverted_index.rs @@ -65,7 +65,7 @@ use levenshtein_automata::DFA; use levenshtein_automata::Distance; use levenshtein_automata::LevenshteinAutomatonBuilder; use log::warn; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; use roaring::RoaringTreemap; use tantivy::Directory; use tantivy::Term; @@ -1242,47 +1242,13 @@ pub struct InvertedIndexMeta { pub columns: Vec<(String, SingleColumnMeta)>, } -impl TryFrom for InvertedIndexMeta { 
+impl TryFrom for InvertedIndexMeta { type Error = ErrorCode; - fn try_from(mut meta: FileMetaData) -> std::result::Result { - let rg = meta.row_groups.remove(0); - let mut col_metas = Vec::with_capacity(rg.columns.len()); - for x in &rg.columns { - match &x.meta_data { - Some(chunk_meta) => { - let col_start = - if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { - dict_page_offset - } else { - chunk_meta.data_page_offset - }; - let col_len = chunk_meta.total_compressed_size; - assert!( - col_start >= 0 && col_len >= 0, - "column start and length should not be negative" - ); - let num_values = chunk_meta.num_values as u64; - let res = SingleColumnMeta { - offset: col_start as u64, - len: col_len as u64, - num_values, - }; - let column_name = chunk_meta.path_in_schema[0].to_owned(); - col_metas.push((column_name, res)); - } - None => { - panic!( - "expecting chunk meta data while converting ThriftFileMetaData to InvertedIndexMeta" - ) - } - } - } - col_metas.shrink_to_fit(); - + fn try_from(meta: ParquetMetaData) -> std::result::Result { Ok(Self { version: 3, - columns: col_metas, + columns: super::index_common::index_columns_from_parquet_meta(&meta), }) } } diff --git a/src/query/storages/common/index/src/virtual_column.rs b/src/query/storages/common/index/src/virtual_column.rs index b3d12208d9d80..4aa09b8c14e6b 100644 --- a/src/query/storages/common/index/src/virtual_column.rs +++ b/src/query/storages/common/index/src/virtual_column.rs @@ -23,7 +23,7 @@ use databend_common_exception::Result; use databend_common_expression::DataSchema; use databend_common_expression::types::DataType; use databend_storages_common_table_meta::meta::SingleColumnMeta; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; pub const VIRTUAL_COLUMN_STRING_TABLE_KEY: &str = "virtual_column_string_table"; pub const VIRTUAL_COLUMN_NODES_KEY: &str = "virtual_column_nodes"; @@ -97,16 +97,20 @@ impl TryFrom for VirtualColumnFileMeta { // - VIRTUAL_COLUMN_NODES_KEY: the trie encoded by string table ids and leaf indices. // - VIRTUAL_COLUMN_SHARED_COLUMN_IDS_KEY: mapping of source_column_id -> (key_id, value_id). // Column offsets/lengths/num_values and shared map columns are derived from parquet metadata. 
-impl TryFrom for VirtualColumnFileMeta { +impl TryFrom for VirtualColumnFileMeta { type Error = ErrorCode; - fn try_from(mut meta: FileMetaData) -> std::result::Result { + fn try_from(meta: ParquetMetaData) -> std::result::Result { let mut arrow_schema = None; let mut string_table = None; let mut virtual_column_nodes = None; let mut shared_column_ids = None; - let key_value_metadata = meta.key_value_metadata.unwrap_or_default(); + let key_value_metadata = meta + .file_metadata() + .key_value_metadata() + .cloned() + .unwrap_or_default(); for key_value in &key_value_metadata { if key_value.key == "ARROW:schema" { let encoded_meta = key_value.value.as_ref().unwrap(); @@ -135,31 +139,17 @@ impl TryFrom for VirtualColumnFileMeta { })?; let shared_column_ids = shared_column_ids.unwrap_or_default(); - let rg = meta.row_groups.remove(0); - let mut column_metas = Vec::with_capacity(rg.columns.len()); - for (column_idx, column) in rg.columns.iter().enumerate() { - let chunk_meta = column.meta_data.as_ref().ok_or_else(|| { - ErrorCode::Internal( - "expecting chunk meta data while converting ThriftFileMetaData to VirtualColumnFileMeta", - ) - })?; - let col_start = if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { - dict_page_offset - } else { - chunk_meta.data_page_offset - }; - let col_len = chunk_meta.total_compressed_size; - assert!( - col_start >= 0 && col_len >= 0, - "column start and length should not be negative" - ); - let num_values = chunk_meta.num_values as u64; + let row_group = &meta.row_groups()[0]; + let mut column_metas = Vec::with_capacity(row_group.columns().len()); + for (column_idx, chunk_meta) in row_group.columns().iter().enumerate() { + let (offset, len) = chunk_meta.byte_range(); + let num_values = chunk_meta.num_values() as u64; let res = SingleColumnMeta { - offset: col_start as u64, - len: col_len as u64, + offset, + len, num_values, }; - let data_type = data_type_from_path(&data_schema, &chunk_meta.path_in_schema)?; + let data_type = data_type_from_path(&data_schema, chunk_meta.column_path().parts())?; let column_id = column_idx as u32; column_metas.push(VirtualColumnIdWithMeta { column_id, diff --git a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment.rs b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment.rs index ddcca6eca44ae..fc8636e5df55e 100644 --- a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment.rs +++ b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment.rs @@ -270,7 +270,7 @@ impl AbstractSegment for ColumnOrientedSegment { // TODO(Sky): Construct the optimal props, enabling compression, encoding, etc., if performance is better. 
let props = Some( WriterProperties::builder() - .set_max_row_group_size(usize::MAX) + .set_max_row_group_row_count(Some(usize::MAX)) .build(), ); let arrow_schema = Arc::new(Schema::from(&self.segment_schema)); diff --git a/src/query/storages/fuse/src/io/read/block/parquet/adapter.rs b/src/query/storages/fuse/src/io/read/block/parquet/adapter.rs index 10916c6d5bf73..d2f2e8f117602 100644 --- a/src/query/storages/fuse/src/io/read/block/parquet/adapter.rs +++ b/src/query/storages/fuse/src/io/read/block/parquet/adapter.rs @@ -25,6 +25,9 @@ use parquet::column::page::PageIterator; use parquet::column::page::PageReader; use parquet::errors::Result as ParquetResult; use parquet::file::metadata::ColumnChunkMetaData; +use parquet::file::metadata::FileMetaData; +use parquet::file::metadata::ParquetMetaData; +use parquet::file::metadata::RowGroupMetaData; use parquet::file::serialized_reader::SerializedPageReader; use parquet::schema::types::SchemaDescriptor; @@ -65,10 +68,35 @@ impl<'a> RowGroupImplBuilder<'a> { } pub fn build(self) -> RowGroupImpl { + let schema_descriptor = Arc::new(self.schema_descriptor.clone()); + let column_metadata = (0..schema_descriptor.num_columns()) + .map(|dfs_id| { + self.column_chunk_metadatas + .get(&dfs_id) + .cloned() + .unwrap_or_else(|| { + ColumnChunkMetaData::builder(self.schema_descriptor.column(dfs_id)) + .set_compression(self.compression) + .set_data_page_offset(0) + .set_total_compressed_size(0) + .build() + .unwrap() + }) + }) + .collect(); + let row_group = RowGroupMetaData::builder(schema_descriptor.clone()) + .set_num_rows(self.num_rows as i64) + .set_column_metadata(column_metadata) + .build() + .unwrap(); RowGroupImpl { num_rows: self.num_rows, column_chunks: self.column_chunks, column_chunk_metadatas: self.column_chunk_metadatas, + parquet_meta: ParquetMetaData::new( + FileMetaData::new(0, self.num_rows as i64, None, None, schema_descriptor, None), + vec![row_group], + ), } } } @@ -78,6 +106,7 @@ pub struct RowGroupImpl { num_rows: usize, column_chunks: HashMap, column_chunk_metadatas: HashMap, + parquet_meta: ParquetMetaData, } impl RowGroups for RowGroupImpl { @@ -101,6 +130,14 @@ impl RowGroups for RowGroupImpl { reader: Some(Ok(page_reader)), })) } + + fn row_groups(&self) -> Box + '_> { + Box::new(self.parquet_meta.row_groups().iter()) + } + + fn metadata(&self) -> &databend_storages_common_cache::ParquetMetaData { + &self.parquet_meta + } } struct PageIteratorImpl { @@ -116,3 +153,61 @@ impl Iterator for PageIteratorImpl { } impl PageIterator for PageIteratorImpl {} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use opendal::Buffer; + use parquet::arrow::arrow_reader::RowGroups; + use parquet::basic::Compression; + use parquet::basic::Repetition; + use parquet::basic::Type as PhysicalType; + use parquet::schema::types::SchemaDescriptor; + use parquet::schema::types::Type; + + use super::RowGroupImplBuilder; + + fn test_schema() -> SchemaDescriptor { + let fields = vec![ + Arc::new( + Type::primitive_type_builder("c0", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(), + ), + Arc::new( + Type::primitive_type_builder("c1", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(), + ), + Arc::new( + Type::primitive_type_builder("c2", PhysicalType::INT32) + .with_repetition(Repetition::OPTIONAL) + .build() + .unwrap(), + ), + ]; + let schema = Type::group_type_builder("schema") + .with_fields(fields) + .build() + .unwrap(); + SchemaDescriptor::new(Arc::new(schema)) + } + + 
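+        // Regression check for the placeholder metadata built above: only column 2 receives a
+        // real chunk, yet the synthesized RowGroupMetaData must still span every column of the
+        // schema, with zero-sized placeholder chunks standing in for the unprojected columns.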
#[test] + fn test_build_sparse_projection_row_group_metadata() { + let schema = test_schema(); + let mut builder = RowGroupImplBuilder::new(1, &schema, Compression::UNCOMPRESSED); + builder.add_column_chunk(2, Buffer::from(vec![1_u8, 2, 3, 4])); + + let row_group = builder.build(); + let metadata = row_group.row_groups().next().unwrap(); + + assert_eq!(metadata.columns().len(), schema.num_columns()); + assert_eq!(metadata.column(2).compressed_size(), 4); + assert_eq!(metadata.column(0).compressed_size(), 0); + assert_eq!(metadata.column(1).compressed_size(), 0); + } +} diff --git a/src/query/storages/fuse/src/io/read/block/parquet/deserialize.rs b/src/query/storages/fuse/src/io/read/block/parquet/deserialize.rs index 6455f8b811c1f..bc3dc01db22b3 100644 --- a/src/query/storages/fuse/src/io/read/block/parquet/deserialize.rs +++ b/src/query/storages/fuse/src/io/read/block/parquet/deserialize.rs @@ -62,6 +62,7 @@ pub fn column_chunks_to_record_batch( DataItem::ColumnArray(_) => {} } } + projection_mask.sort_unstable(); let row_group = Box::new(builder.build()); let field_levels = parquet_to_arrow_field_levels( &parquet_schema, diff --git a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs index 8cdc6bba1ea34..9b6b88bcba793 100644 --- a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs +++ b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs @@ -40,10 +40,8 @@ use databend_storages_common_table_meta::meta::SingleColumnMeta; use futures_util::future::try_join_all; use opendal::Operator; use parquet::arrow::ArrowSchemaConverter; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaDataReader; use parquet::schema::types::SchemaDescPtr; -use parquet::thrift::TSerializable; -use thrift::protocol::TCompactInputProtocol; use crate::index::filters::BlockBloomFilterIndexVersion; use crate::index::filters::BlockFilter; @@ -280,10 +278,9 @@ fn load_index_meta_from_bytes(data: &Bytes) -> Result { } let remaining = data.len() - footer_len as usize; - let mut prot = TCompactInputProtocol::new(&data[remaining..]); - let thrift_meta = FileMetaData::read_from_in_protocol(&mut prot) + let parquet_meta = ParquetMetaDataReader::decode_metadata(&data[remaining..]) .map_err(|err| ErrorCode::StorageOther(format!("read bloom index meta failed, {err}")))?; - BloomIndexMeta::try_from(thrift_meta) + BloomIndexMeta::try_from(parquet_meta) } #[async_trait::async_trait] diff --git a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs index 094ea3538d3b0..4a83db11488ef 100644 --- a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs +++ b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs @@ -44,8 +44,8 @@ use futures::AsyncSeek; use futures_util::AsyncSeekExt; use opendal::Buffer; use opendal::Operator; -use parquet::format::FileMetaData; -use parquet::thrift::TSerializable; +use parquet::file::metadata::ParquetMetaData; +use parquet::file::metadata::ParquetMetaDataReader; pub use self::thrift_file_meta_read::read_thrift_file_metadata; @@ -362,7 +362,6 @@ pub async fn bytes_reader(op: &Operator, path: &str, len_hint: Option) -> R mod thrift_file_meta_read { use parquet::errors::ParquetError; - use thrift::protocol::TCompactInputProtocol; use super::*; @@ -395,7 +394,7 @@ mod thrift_file_meta_read { op: Operator, path: &str, len_hint: Option, - ) -> Result { + ) -> Result { let file_size = if let Some(len) = len_hint { 
len } else { @@ -443,11 +442,8 @@ mod thrift_file_meta_read { if (footer_len as usize) < buffer.len() { // the whole metadata is in the bytes we already read let remaining = buffer.len() - footer_len as usize; - - let mut prot = TCompactInputProtocol::new(&buffer[remaining..]); - let meta = FileMetaData::read_from_in_protocol(&mut prot) - .map_err(|err| ErrorCode::ParquetFileInvalid(err.to_string()))?; - Ok(meta) + ParquetMetaDataReader::decode_metadata(&buffer[remaining..]) + .map_err(|err| ErrorCode::ParquetFileInvalid(err.to_string())) } else { // the end of file read by default is not long enough, read again including the metadata. let buffer = op @@ -455,11 +451,9 @@ mod thrift_file_meta_read { .range(file_size - footer_len..file_size) .await .map_err(|err| ErrorCode::ParquetFileInvalid(err.to_string()))?; - - let mut prot = TCompactInputProtocol::new(buffer.reader()); - let meta = FileMetaData::read_from_in_protocol(&mut prot) - .map_err(|err| ErrorCode::ParquetFileInvalid(err.to_string()))?; - Ok(meta) + let buffer = buffer.to_vec(); + ParquetMetaDataReader::decode_metadata(&buffer) + .map_err(|err| ErrorCode::ParquetFileInvalid(err.to_string())) } } } diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index cdf6bd08f97af..469bda9f7d143 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -55,7 +55,7 @@ use databend_storages_common_table_meta::meta::ColumnMeta; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::table::TableCompression; use parquet::arrow::ArrowWriter; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; use crate::FuseStorageFormat; use crate::FuseTable; @@ -126,7 +126,7 @@ impl ArrowParquetWriter { Ok(()) } - fn finish(&mut self) -> Result { + fn finish(&mut self) -> Result { let Initialized(writer) = self else { unreachable!("ArrowParquetWriter::finish called before initialization"); }; diff --git a/src/query/storages/fuse/src/io/write/virtual_column_builder.rs b/src/query/storages/fuse/src/io/write/virtual_column_builder.rs index f9d700e69f960..387d730fcdb48 100644 --- a/src/query/storages/fuse/src/io/write/virtual_column_builder.rs +++ b/src/query/storages/fuse/src/io/write/virtual_column_builder.rs @@ -74,7 +74,7 @@ use jsonb::Value as JsonbValue; use jsonb::keypath::KeyPath as JsonbKeyPath; use jsonb::keypath::KeyPaths as JsonbKeyPaths; use parquet::file::metadata::KeyValue; -use parquet::format::FileMetaData; +use parquet::file::metadata::ParquetMetaData; use siphasher::sip128::Hasher128; use siphasher::sip128::SipHasher24; @@ -797,67 +797,46 @@ impl VirtualColumnBuilder { fn file_meta_to_virtual_column_metas( &self, - file_meta: FileMetaData, + file_meta: ParquetMetaData, mut virtual_column_names: HashMap, mut columns_statistics: StatisticsOfColumns, ) -> Result> { - let num_row_groups = file_meta.row_groups.len(); + let num_row_groups = file_meta.row_groups().len(); if num_row_groups != 1 { return Err(ErrorCode::ParquetFileInvalid(format!( "invalid parquet file, expects only one row group, but got {}", num_row_groups ))); } - let row_group = &file_meta.row_groups[0]; + let row_group = &file_meta.row_groups()[0]; let mut draft_virtual_column_metas = Vec::with_capacity(virtual_column_names.len()); - for (i, col_chunk) in row_group.columns.iter().enumerate() { + for (i, chunk_meta) in 
row_group.columns().iter().enumerate() { let tmp_column_id = i as u32; - match &col_chunk.meta_data { - Some(chunk_meta) => { - let Some((source_column_id, key_name, variant_type)) = - virtual_column_names.remove(&chunk_meta.path_in_schema[0]) - else { - continue; - }; + let Some((source_column_id, key_name, variant_type)) = + virtual_column_names.remove(&chunk_meta.column_path().parts()[0]) + else { + continue; + }; - let col_start = - if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { - dict_page_offset - } else { - chunk_meta.data_page_offset - }; - let col_len = chunk_meta.total_compressed_size; - assert!( - col_start >= 0 && col_len >= 0, - "column start and length should not be negative" - ); - let num_values = chunk_meta.num_values as u64; - - let variant_type_code = VirtualColumnMeta::data_type_code(&variant_type); - let column_stat = columns_statistics.remove(&tmp_column_id); - let virtual_column_meta = VirtualColumnMeta { - offset: col_start as u64, - len: col_len as u64, - num_values, - data_type: variant_type_code, - column_stat, - }; + let (offset, len) = chunk_meta.byte_range(); + let variant_type_code = VirtualColumnMeta::data_type_code(&variant_type); + let column_stat = columns_statistics.remove(&tmp_column_id); + let virtual_column_meta = VirtualColumnMeta { + offset, + len, + num_values: chunk_meta.num_values() as u64, + data_type: variant_type_code, + column_stat, + }; - let draft_virtual_column_meta = DraftVirtualColumnMeta { - source_column_id, - name: key_name, - data_type: variant_type, - column_meta: virtual_column_meta, - }; - draft_virtual_column_metas.push(draft_virtual_column_meta); - } - None => { - return Err(ErrorCode::ParquetFileInvalid(format!( - "invalid parquet file, meta data of column is empty", - ))); - } - } + let draft_virtual_column_meta = DraftVirtualColumnMeta { + source_column_id, + name: key_name, + data_type: variant_type, + column_meta: virtual_column_meta, + }; + draft_virtual_column_metas.push(draft_virtual_column_meta); } Ok(draft_virtual_column_metas) } diff --git a/src/query/storages/fuse/src/operations/util.rs b/src/query/storages/fuse/src/operations/util.rs index 2afb0d89f5550..c2a65329b937d 100644 --- a/src/query/storages/fuse/src/operations/util.rs +++ b/src/query/storages/fuse/src/operations/util.rs @@ -71,11 +71,11 @@ pub fn set_backoff( } pub fn column_parquet_metas( - file_meta: &parquet::format::FileMetaData, + file_meta: &parquet::file::metadata::ParquetMetaData, schema: &TableSchemaRef, ) -> Result> { // currently we use one group only - let num_row_groups = file_meta.row_groups.len(); + let num_row_groups = file_meta.row_groups().len(); if num_row_groups != 1 { return Err(ErrorCode::ParquetFileInvalid(format!( "invalid parquet file, expects only one row group, but got {}", @@ -84,40 +84,20 @@ pub fn column_parquet_metas( } // use `to_leaf_column_ids` instead of `to_column_ids` to handle nested type column ids. let column_ids = schema.to_leaf_column_ids(); - let row_group = &file_meta.row_groups[0]; + let row_group = &file_meta.row_groups()[0]; // Make sure that schema and row_group has the same number column, or else it is a panic error. 
- assert_eq!(column_ids.len(), row_group.columns.len()); - let mut col_metas = HashMap::with_capacity(row_group.columns.len()); - for (idx, col_chunk) in row_group.columns.iter().enumerate() { - match &col_chunk.meta_data { - Some(chunk_meta) => { - let col_start = if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { - dict_page_offset - } else { - chunk_meta.data_page_offset - }; - let col_len = chunk_meta.total_compressed_size; - assert!( - col_start >= 0 && col_len >= 0, - "column start and length should not be negative" - ); - let num_values = chunk_meta.num_values as u64; - let res = SingleColumnMeta { - offset: col_start as u64, - len: col_len as u64, - num_values, - }; - // use column id as key instead of index - let column_id = column_ids[idx]; - col_metas.insert(column_id, ColumnMeta::Parquet(res)); - } - None => { - return Err(ErrorCode::ParquetFileInvalid(format!( - "invalid parquet file, meta data of column idx {} is empty", - idx - ))); - } - } + assert_eq!(column_ids.len(), row_group.columns().len()); + let mut col_metas = HashMap::with_capacity(row_group.columns().len()); + for (idx, chunk_meta) in row_group.columns().iter().enumerate() { + let (offset, len) = chunk_meta.byte_range(); + let res = SingleColumnMeta { + offset, + len, + num_values: chunk_meta.num_values() as u64, + }; + // use column id as key instead of index + let column_id = column_ids[idx]; + col_metas.insert(column_id, ColumnMeta::Parquet(res)); } Ok(col_metas) } diff --git a/src/query/storages/fuse/src/table_functions/fuse_encoding.rs b/src/query/storages/fuse/src/table_functions/fuse_encoding.rs index 4dbe97f551567..84a0db2c228b4 100644 --- a/src/query/storages/fuse/src/table_functions/fuse_encoding.rs +++ b/src/query/storages/fuse/src/table_functions/fuse_encoding.rs @@ -1,3 +1,5 @@ +#![allow(deprecated)] + // Copyright 2021 Datafuse Labs // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -60,7 +62,7 @@ use futures::stream::TryStreamExt; use opendal::Operator; use parquet::basic::Compression as ParquetCompression; use parquet::basic::Encoding as ParquetEncoding; -use parquet::format::Type as ParquetPhysicalType; +use parquet::basic::Type as ParquetPhysicalType; use crate::BlockReadResult; use crate::FuseStorageFormat; @@ -392,15 +394,15 @@ impl<'a> FuseEncodingImpl<'a> { column_name_filter: Arc>, ) -> Result> { let file_meta = read_thrift_file_metadata(operator, &location, Some(file_size)).await?; - if file_meta.row_groups.len() != 1 { + if file_meta.row_groups().len() != 1 { return Err(ErrorCode::ParquetFileInvalid(format!( "invalid parquet file {}, expects one row group but got {}", location, - file_meta.row_groups.len() + file_meta.row_groups().len() ))); } - let row_group = &file_meta.row_groups[0]; - let columns = &row_group.columns; + let row_group = &file_meta.row_groups()[0]; + let columns = row_group.columns(); let mut block_rows = Vec::new(); for field in fields.iter() { @@ -418,23 +420,15 @@ impl<'a> FuseEncodingImpl<'a> { // Missing column caused by schema evolutions continue; }; - let chunk_meta = column_chunk.meta_data.as_ref().ok_or_else(|| { + let compressed_size = u64::try_from(column_chunk.compressed_size()).map_err(|_| { ErrorCode::ParquetFileInvalid(format!( - "invalid parquet file {}, meta data of column {} is empty", - location, column_id + "invalid parquet file {}, compressed size overflow for column {}", + location, + field.name() )) })?; - - let compressed_size = - u64::try_from(chunk_meta.total_compressed_size).map_err(|_| { - 
ErrorCode::ParquetFileInvalid(format!( - "invalid parquet file {}, compressed size overflow for column {}", - location, - field.name() - )) - })?; let uncompressed_size = - u64::try_from(chunk_meta.total_uncompressed_size).map_err(|_| { + u64::try_from(column_chunk.uncompressed_size()).map_err(|_| { ErrorCode::ParquetFileInvalid(format!( "invalid parquet file {}, uncompressed size overflow for column {}", location, @@ -442,7 +436,7 @@ impl<'a> FuseEncodingImpl<'a> { )) })?; - let physical_type = parquet_physical_type_to_string(chunk_meta.type_); + let physical_type = parquet_physical_type_to_string(column_chunk.column_type()); block_rows.push(EncodingRow { table_name: table_name.as_str().to_string(), storage_format, @@ -452,8 +446,8 @@ impl<'a> FuseEncodingImpl<'a> { validity_size: None, compressed_size, uncompressed_size, - level_one: parquet_encodings_to_string(&chunk_meta.encodings), - level_two: Some(parquet_codec_to_string(chunk_meta.codec)), + level_one: parquet_encodings_to_string(column_chunk.encodings()), + level_two: Some(parquet_codec_to_string(column_chunk.compression())), }); } @@ -634,45 +628,41 @@ fn native_level_two_encoding(page_body: &PageBody) -> Option { } } -fn parquet_encodings_to_string(encodings: &[parquet::format::Encoding]) -> String { +fn parquet_encodings_to_string(encodings: impl Iterator) -> String { + let encodings = encodings + .map(parquet_encoding_to_string) + .collect::>(); if encodings.is_empty() { "unknown".to_string() } else { - encodings - .iter() - .map(|encoding| parquet_encoding_to_string(*encoding)) - .collect::>() - .join(",") + encodings.join(",") } } -fn parquet_encoding_to_string(encoding: parquet::format::Encoding) -> &'static str { - match ParquetEncoding::try_from(encoding) { - Ok(ParquetEncoding::PLAIN) => "plain", - Ok(ParquetEncoding::PLAIN_DICTIONARY) => "plain_dictionary", - Ok(ParquetEncoding::RLE) => "rle", - #[allow(deprecated)] - Ok(ParquetEncoding::BIT_PACKED) => "bit_packed", - Ok(ParquetEncoding::DELTA_BINARY_PACKED) => "delta_binary_packed", - Ok(ParquetEncoding::DELTA_LENGTH_BYTE_ARRAY) => "delta_length_byte_array", - Ok(ParquetEncoding::DELTA_BYTE_ARRAY) => "delta_byte_array", - Ok(ParquetEncoding::RLE_DICTIONARY) => "rle_dictionary", - Ok(ParquetEncoding::BYTE_STREAM_SPLIT) => "byte_stream_split", - Err(_) => "unknown", +fn parquet_encoding_to_string(encoding: ParquetEncoding) -> &'static str { + match encoding { + ParquetEncoding::PLAIN => "plain", + ParquetEncoding::PLAIN_DICTIONARY => "plain_dictionary", + ParquetEncoding::RLE => "rle", + ParquetEncoding::BIT_PACKED => "bit_packed", + ParquetEncoding::DELTA_BINARY_PACKED => "delta_binary_packed", + ParquetEncoding::DELTA_LENGTH_BYTE_ARRAY => "delta_length_byte_array", + ParquetEncoding::DELTA_BYTE_ARRAY => "delta_byte_array", + ParquetEncoding::RLE_DICTIONARY => "rle_dictionary", + ParquetEncoding::BYTE_STREAM_SPLIT => "byte_stream_split", } } -fn parquet_codec_to_string(codec: parquet::format::CompressionCodec) -> String { - match ParquetCompression::try_from(codec) { - Ok(ParquetCompression::UNCOMPRESSED) => "uncompressed".to_string(), - Ok(ParquetCompression::SNAPPY) => "snappy".to_string(), - Ok(ParquetCompression::GZIP(_)) => "gzip".to_string(), - Ok(ParquetCompression::LZO) => "lzo".to_string(), - Ok(ParquetCompression::BROTLI(_)) => "brotli".to_string(), - Ok(ParquetCompression::LZ4) => "lz4".to_string(), - Ok(ParquetCompression::ZSTD(_)) => "zstd".to_string(), - Ok(ParquetCompression::LZ4_RAW) => "lz4_raw".to_string(), - Err(_) => format!("compression_codec({})", 
codec.0), +fn parquet_codec_to_string(codec: ParquetCompression) -> String { + match codec { + ParquetCompression::UNCOMPRESSED => "uncompressed".to_string(), + ParquetCompression::SNAPPY => "snappy".to_string(), + ParquetCompression::GZIP(_) => "gzip".to_string(), + ParquetCompression::LZO => "lzo".to_string(), + ParquetCompression::BROTLI(_) => "brotli".to_string(), + ParquetCompression::LZ4 => "lz4".to_string(), + ParquetCompression::ZSTD(_) => "zstd".to_string(), + ParquetCompression::LZ4_RAW => "lz4_raw".to_string(), } } @@ -686,7 +676,6 @@ fn parquet_physical_type_to_string(ty: ParquetPhysicalType) -> &'static str { ParquetPhysicalType::DOUBLE => "DOUBLE", ParquetPhysicalType::BYTE_ARRAY => "BYTE_ARRAY", ParquetPhysicalType::FIXED_LEN_BYTE_ARRAY => "FIXED_LEN_BYTE_ARRAY", - parquet::format::Type(_) => "UNKNOWN", } } diff --git a/src/query/storages/fuse/src/table_functions/fuse_page.rs b/src/query/storages/fuse/src/table_functions/fuse_page.rs index c19a8f7bfe080..f98117460e3dd 100644 --- a/src/query/storages/fuse/src/table_functions/fuse_page.rs +++ b/src/query/storages/fuse/src/table_functions/fuse_page.rs @@ -1,3 +1,5 @@ +#![allow(deprecated)] + // Copyright 2021 Datafuse Labs // // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/query/storages/iceberg/src/table.rs b/src/query/storages/iceberg/src/table.rs index b115ce637022a..726f8d965b23f 100644 --- a/src/query/storages/iceberg/src/table.rs +++ b/src/query/storages/iceberg/src/table.rs @@ -41,8 +41,6 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::ColumnId; use databend_common_expression::DataSchema; -use databend_common_expression::FieldIndex; -use databend_common_expression::TableDataType; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_meta_app::schema::CatalogInfo; @@ -490,64 +488,34 @@ impl IcebergTable { .map(|v| v.name.clone()) .collect()), Projection::InnerColumns(path_indices) => { - let fields = schema.fields(); - let mut names = Vec::with_capacity(path_indices.len()); + // Iceberg scan.select() only accepts direct children of the table schema. + // Keep nested projection pruning in Databend's read path and only push down + // the required top-level columns to the Iceberg scan planner. 
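+                // Inner paths that share the same top-level column are de-duplicated below before
+                // the column names are resolved and pushed down to the Iceberg scan.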
+ let mut top_level_indices = Vec::with_capacity(path_indices.len()); for path in path_indices.values() { - names.push(Self::inner_column_path_to_name(fields, path)?); - } - Ok(names) - } - } - } - - fn inner_column_path_to_name(fields: &[TableField], path: &[FieldIndex]) -> Result { - if path.is_empty() { - return Err(ErrorCode::BadArguments( - "Inner column path should not be empty".to_string(), - )); - } - - let field = fields.get(path[0]).ok_or_else(|| { - ErrorCode::BadArguments(format!("Inner column path {:?} is out of range", path)) - })?; - let mut name_parts = Vec::with_capacity(path.len()); - name_parts.push(field.name().clone()); - - let mut current_type = field.data_type().remove_nullable(); - for index in path.iter().skip(1) { - match ¤t_type { - TableDataType::Tuple { - fields_name, - fields_type, - } => { - let inner_name = fields_name.get(*index).ok_or_else(|| { - ErrorCode::BadArguments(format!( - "Inner column path {:?} is out of range for {}", - path, - name_parts.join(".") - )) + let index = *path.first().ok_or_else(|| { + ErrorCode::BadArguments("Inner column path should not be empty".to_string()) })?; - name_parts.push(inner_name.clone()); - let inner_type = fields_type.get(*index).ok_or_else(|| { - ErrorCode::BadArguments(format!( - "Inner column path {:?} is out of range for {}", - path, - name_parts.join(".") - )) - })?; - current_type = inner_type.remove_nullable(); - } - _ => { - return Err(ErrorCode::BadArguments(format!( - "Inner column path {:?} is invalid for non-tuple field {}", - path, - name_parts.join(".") - ))); + top_level_indices.push(index); } + top_level_indices.sort_unstable(); + top_level_indices.dedup(); + top_level_indices + .into_iter() + .map(|index| { + schema + .fields() + .get(index) + .map(|field| field.name().clone()) + .ok_or_else(|| { + ErrorCode::BadArguments(format!( + "Inner column projection root {index} is out of range" + )) + }) + }) + .collect() } } - - Ok(name_parts.join(".")) } fn convert_orc_schema(schema: &Schema) -> Schema { @@ -591,7 +559,8 @@ impl IcebergTable { .map(|(i, field)| (i, visit_field(field))) .unzip(); arrow_schema::DataType::Union( - arrow_schema::UnionFields::new(ids, fields), + arrow_schema::UnionFields::try_new(ids, fields) + .expect("existing union fields should remain valid"), *mode, ) } diff --git a/src/query/storages/parquet/Cargo.toml b/src/query/storages/parquet/Cargo.toml index d21c50769b1e0..27b8b5c1ac786 100644 --- a/src/query/storages/parquet/Cargo.toml +++ b/src/query/storages/parquet/Cargo.toml @@ -38,7 +38,6 @@ opendal = { workspace = true } parquet = { workspace = true } rand = { workspace = true } serde = { workspace = true } -thrift = { workspace = true } typetag = { workspace = true } [dev-dependencies] diff --git a/src/query/storages/parquet/src/parquet_reader/reader/row_group_reader.rs b/src/query/storages/parquet/src/parquet_reader/reader/row_group_reader.rs index 2a518426d4d19..0f45d0e7b82f6 100644 --- a/src/query/storages/parquet/src/parquet_reader/reader/row_group_reader.rs +++ b/src/query/storages/parquet/src/parquet_reader/reader/row_group_reader.rs @@ -53,7 +53,7 @@ use parquet::arrow::arrow_reader::RowSelection; use parquet::arrow::arrow_reader::RowSelector; use parquet::file::metadata::ParquetMetaData; use parquet::file::metadata::RowGroupMetaData; -use parquet::format::PageLocation; +use parquet::file::page_index::offset_index::PageLocation; use parquet::schema::types::SchemaDescPtr; use crate::DeleteType; diff --git a/src/query/storages/parquet/src/parquet_reader/row_group.rs 
b/src/query/storages/parquet/src/parquet_reader/row_group.rs index 46da387c0e2ac..91ce985ce5f72 100644 --- a/src/query/storages/parquet/src/parquet_reader/row_group.rs +++ b/src/query/storages/parquet/src/parquet_reader/row_group.rs @@ -31,11 +31,13 @@ use parquet::arrow::arrow_reader::RowSelection; use parquet::column::page::PageIterator; use parquet::column::page::PageReader; use parquet::errors::ParquetError; +use parquet::file::metadata::FileMetaData; +use parquet::file::metadata::ParquetMetaData; use parquet::file::metadata::RowGroupMetaData; +use parquet::file::page_index::offset_index::PageLocation; use parquet::file::reader::ChunkReader; use parquet::file::reader::Length; use parquet::file::serialized_reader::SerializedPageReader; -use parquet::format::PageLocation; use crate::read_settings::ReadSettings; @@ -181,15 +183,29 @@ pub async fn get_ranges( pub struct RowGroupCore { metadata: T, + parquet_meta: ParquetMetaData, page_locations: Option>>, column_chunks: Vec>>, } impl RowGroupCore { pub fn new(meta: T, page_locations: Option>>) -> RowGroupCore { + let row_group_meta = meta.meta().clone(); + let parquet_meta = ParquetMetaData::new( + FileMetaData::new( + 0, + row_group_meta.num_rows(), + None, + None, + row_group_meta.schema_descr_ptr(), + None, + ), + vec![row_group_meta], + ); RowGroupCore { column_chunks: vec![None; meta.meta().num_columns()], metadata: meta, + parquet_meta, page_locations, } } @@ -374,6 +390,14 @@ impl RowGroups for RowGroupCore { } } } + + fn row_groups(&self) -> Box + '_> { + Box::new(self.parquet_meta.row_groups().iter()) + } + + fn metadata(&self) -> &databend_storages_common_cache::ParquetMetaData { + &self.parquet_meta + } } pub trait AsMetaRef { @@ -435,6 +459,14 @@ impl RowGroups for InMemoryRowGroup<'_> { fn column_chunks(&self, i: usize) -> parquet::errors::Result> { self.core.column_chunks(i) } + + fn row_groups(&self) -> Box + '_> { + self.core.row_groups() + } + + fn metadata(&self) -> &databend_storages_common_cache::ParquetMetaData { + self.core.metadata() + } } pub async fn cached_range_full_read( diff --git a/src/query/storages/parquet/src/partition.rs b/src/query/storages/parquet/src/partition.rs index 26d7094bcfa0f..266d281e37692 100644 --- a/src/query/storages/parquet/src/partition.rs +++ b/src/query/storages/parquet/src/partition.rs @@ -15,7 +15,7 @@ use databend_common_expression::Scalar; use parquet::arrow::arrow_reader::RowSelector; use parquet::file::metadata::RowGroupMetaData; -use parquet::format::PageLocation; +use parquet::file::page_index::offset_index::PageLocation; use crate::row_group_serde::deserialize_row_group_meta; use crate::row_group_serde::serialize_row_group_meta; diff --git a/src/query/storages/parquet/src/pruning.rs b/src/query/storages/parquet/src/pruning.rs index bb4076019154e..edb208fb1620e 100644 --- a/src/query/storages/parquet/src/pruning.rs +++ b/src/query/storages/parquet/src/pruning.rs @@ -29,7 +29,7 @@ use databend_storages_common_table_meta::meta::StatisticsOfColumns; use parquet::arrow::arrow_reader::RowSelection; use parquet::arrow::arrow_reader::RowSelector; use parquet::file::metadata::ParquetMetaData; -use parquet::format::PageLocation; +use parquet::file::page_index::offset_index::PageLocation; use super::statistics::collect_row_group_stats; use crate::statistics::convert_index_to_column_statistics; diff --git a/src/query/storages/parquet/src/row_group_serde.rs b/src/query/storages/parquet/src/row_group_serde.rs index 142eae7b8dde0..ac507612e1dd7 100644 --- 
a/src/query/storages/parquet/src/row_group_serde.rs +++ b/src/query/storages/parquet/src/row_group_serde.rs @@ -12,78 +12,43 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::io::Cursor; -use std::sync::Arc; - use databend_common_exception::ErrorCode; +use parquet::file::metadata::FileMetaData; +use parquet::file::metadata::ParquetMetaData; +use parquet::file::metadata::ParquetMetaDataReader; +use parquet::file::metadata::ParquetMetaDataWriter; use parquet::file::metadata::RowGroupMetaData; -use parquet::format::RowGroup; -use parquet::format::SchemaElement; -use parquet::schema::types::SchemaDescriptor; -use parquet::schema::types::from_thrift; -use parquet::schema::types::to_thrift; -use parquet::thrift::TSerializable; use serde::Deserialize; -use thrift::protocol::TCompactInputProtocol; -use thrift::protocol::TCompactOutputProtocol; -use thrift::protocol::TInputProtocol; -use thrift::protocol::TListIdentifier; -use thrift::protocol::TOutputProtocol; -use thrift::protocol::TType; - -pub fn serialize_row_group_meta_to_bytes(meta: &RowGroupMetaData) -> Result, ErrorCode> { - let mut transport = Vec::::new(); - let mut o_prot = TCompactOutputProtocol::new(&mut transport); - let schema = meta.schema_descr(); - let schema_elements = to_thrift(schema.root_schema()) - .map_err(|e| wrap_error(e, " (while converting schema to thrift)"))?; - o_prot - .write_list_begin(&TListIdentifier::new( - TType::Struct, - schema_elements.len() as i32, - )) - .map_err(|e| wrap_error(e, " (while writing schema list header)"))?; - for element in schema_elements { - element - .write_to_out_protocol(&mut o_prot) - .map_err(|e| wrap_error(e, " (while writing schema element)"))?; - } - o_prot - .write_list_end() - .map_err(|e| wrap_error(e, " (while finishing schema list)"))?; - - let rg = meta.to_thrift(); - rg.write_to_out_protocol(&mut o_prot) - .map_err(|e| wrap_error(e, " (while writing row group meta)"))?; +fn wrap_in_parquet_meta(row_group: &RowGroupMetaData) -> ParquetMetaData { + let file_meta = FileMetaData::new( + 0, + row_group.num_rows(), + None, + None, + row_group.schema_descr_ptr(), + None, + ); + ParquetMetaData::new(file_meta, vec![row_group.clone()]) +} - Ok(transport) +pub fn serialize_row_group_meta_to_bytes(meta: &RowGroupMetaData) -> Result, ErrorCode> { + let parquet_meta = wrap_in_parquet_meta(meta); + let mut buffer = Vec::new(); + ParquetMetaDataWriter::new(&mut buffer, &parquet_meta) + .finish() + .map_err(ErrorCode::from_std_error)?; + Ok(buffer) } pub fn deserialize_row_group_meta_from_bytes(bytes: &[u8]) -> Result { - let cursor = Cursor::new(bytes); - let mut i_prot = TCompactInputProtocol::new(cursor); - - let list_ident = i_prot - .read_list_begin() - .map_err(|e| wrap_error(e, " (while reading schema list header)"))?; - let mut schema_elements: Vec = Vec::with_capacity(list_ident.size as usize); - for _ in 0..list_ident.size { - let element = SchemaElement::read_from_in_protocol(&mut i_prot) - .map_err(|e| wrap_error(e, " (while reading schema element)"))?; - schema_elements.push(element); - } - i_prot - .read_list_end() - .map_err(|e| wrap_error(e, " (while finishing schema list)"))?; - let schema = from_thrift(&schema_elements) - .map_err(|e| wrap_error(e, " (while converting thrift schema)"))?; - let schema = Arc::new(SchemaDescriptor::new(schema)); - - let rg = RowGroup::read_from_in_protocol(&mut i_prot) - .map_err(|e| wrap_error(e, " (while reading row group meta)"))?; - RowGroupMetaData::from_thrift(schema, 
rg) - .map_err(|e| wrap_error(e, " (while constructing row group meta)")) + let parquet_meta = + ParquetMetaDataReader::decode_metadata(bytes).map_err(ErrorCode::from_std_error)?; + parquet_meta + .row_groups() + .first() + .cloned() + .ok_or_else(|| ErrorCode::ParquetFileInvalid("serialized row group metadata is empty")) } pub fn serialize_row_group_meta( @@ -104,8 +69,3 @@ where D: serde::Deserializer<'de> { deserialize_row_group_meta_from_bytes(&bytes) .map_err(|e| serde::de::Error::custom(e.to_string())) } - -fn wrap_error(err: E, context: &'static str) -> ErrorCode -where E: std::error::Error + Send + Sync + 'static { - ErrorCode::from_std_error(err).add_message_back(context) -} diff --git a/src/query/storages/parquet/src/statistics/page.rs b/src/query/storages/parquet/src/statistics/page.rs index f579efac8af16..2736f1bcbeafc 100644 --- a/src/query/storages/parquet/src/statistics/page.rs +++ b/src/query/storages/parquet/src/statistics/page.rs @@ -20,123 +20,88 @@ use databend_common_expression::types::NumberDataType; use databend_common_expression::types::i256; use databend_storages_common_table_meta::meta::ColumnStatistics; use parquet::data_type::AsBytes; -use parquet::data_type::ByteArray; use parquet::data_type::FixedLenByteArray; use parquet::data_type::Int96; -use parquet::file::page_index::index::Index; -use parquet::file::page_index::index::PageIndex; +use parquet::file::page_index::column_index::ByteArrayColumnIndex; +use parquet::file::page_index::column_index::ColumnIndexMetaData; +use parquet::file::page_index::column_index::PrimitiveColumnIndex; use super::utils::decode_decimal128_from_bytes; use super::utils::decode_decimal256_from_bytes; pub fn convert_index_to_column_statistics( - index: &Index, + index: &ColumnIndexMetaData, num_pagas: usize, typ: &TableDataType, ) -> Vec> { match index { - Index::NONE => vec![None; num_pagas], - Index::BOOLEAN(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_bool(index, typ)) - .collect() - } - Index::INT32(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_int32(index, typ)) - .collect() - } - Index::INT64(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_int64(index, typ)) - .collect() - } - Index::INT96(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_int96(index, typ)) - .collect() - } - Index::FLOAT(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_float(index, typ)) - .collect() - } - Index::DOUBLE(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_double(index, typ)) - .collect() - } - Index::BYTE_ARRAY(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_byte_array(index, typ)) - .collect() - } - Index::FIXED_LEN_BYTE_ARRAY(index) => { - assert_eq!(num_pagas, index.indexes.len()); - index - .indexes - .iter() - .map(|index| convert_page_index_fixed_len_byte_array(index, typ)) - .collect() - } + ColumnIndexMetaData::NONE => vec![None; num_pagas], + ColumnIndexMetaData::BOOLEAN(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_bool(index, i, typ)) + .collect(), + ColumnIndexMetaData::INT32(index) => (0..index.num_pages() 
as usize) + .map(|i| convert_page_index_int32(index, i, typ)) + .collect(), + ColumnIndexMetaData::INT64(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_int64(index, i, typ)) + .collect(), + ColumnIndexMetaData::INT96(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_int96(index, i, typ)) + .collect(), + ColumnIndexMetaData::FLOAT(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_float(index, i, typ)) + .collect(), + ColumnIndexMetaData::DOUBLE(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_double(index, i, typ)) + .collect(), + ColumnIndexMetaData::BYTE_ARRAY(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_byte_array(index, i, typ)) + .collect(), + ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index) => (0..index.num_pages() as usize) + .map(|i| convert_page_index_fixed_len_byte_array(index, i, typ)) + .collect(), } } fn convert_page_index_int32( - index: &PageIndex, + index: &PrimitiveColumnIndex, + idx: usize, typ: &TableDataType, ) -> Option { - match (index.min, index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => { let (max, min) = match typ { TableDataType::Number(NumberDataType::Int8) => { - (Scalar::from(max as i8), Scalar::from(min as i8)) + (Scalar::from(*max as i8), Scalar::from(*min as i8)) } TableDataType::Number(NumberDataType::Int16) => { - (Scalar::from(max as i16), Scalar::from(min as i16)) + (Scalar::from(*max as i16), Scalar::from(*min as i16)) } TableDataType::Number(NumberDataType::Int32) => { - (Scalar::from(max), Scalar::from(min)) + (Scalar::from(*max), Scalar::from(*min)) } TableDataType::Number(NumberDataType::UInt8) => { - (Scalar::from(max as u8), Scalar::from(min as u8)) + (Scalar::from(*max as u8), Scalar::from(*min as u8)) } TableDataType::Number(NumberDataType::UInt16) => { - (Scalar::from(max as u16), Scalar::from(min as u16)) + (Scalar::from(*max as u16), Scalar::from(*min as u16)) } TableDataType::Number(NumberDataType::UInt32) => { - (Scalar::from(max as u32), Scalar::from(min as u32)) + (Scalar::from(*max as u32), Scalar::from(*min as u32)) } - TableDataType::Date => (Scalar::Date(max), Scalar::Date(min)), + TableDataType::Date => (Scalar::Date(*max), Scalar::Date(*min)), TableDataType::Decimal(decimal) => match decimal { DecimalDataType::Decimal128(size) => ( - Scalar::Decimal(DecimalScalar::Decimal128(i128::from(max), *size)), - Scalar::Decimal(DecimalScalar::Decimal128(i128::from(min), *size)), + Scalar::Decimal(DecimalScalar::Decimal128(i128::from(*max), *size)), + Scalar::Decimal(DecimalScalar::Decimal128(i128::from(*min), *size)), ), DecimalDataType::Decimal256(size) => ( - Scalar::Decimal(DecimalScalar::Decimal256(i256::from(max), *size)), - Scalar::Decimal(DecimalScalar::Decimal256(i256::from(min), *size)), + Scalar::Decimal(DecimalScalar::Decimal256(i256::from(*max), *size)), + Scalar::Decimal(DecimalScalar::Decimal256(i256::from(*min), *size)), ), _ => unreachable!(), }, @@ -149,26 +114,31 @@ fn convert_page_index_int32( } fn convert_page_index_int64( - index: &PageIndex, + index: &PrimitiveColumnIndex, + idx: usize, typ: &TableDataType, ) -> Option { - match (index.min, index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => { let (max, min) = match typ { TableDataType::Number(NumberDataType::UInt64) => { - 
(Scalar::from(max as u64), Scalar::from(min as u64)) + (Scalar::from(*max as u64), Scalar::from(*min as u64)) } TableDataType::Number(NumberDataType::Int64) => { - (Scalar::from(max), Scalar::from(min)) + (Scalar::from(*max), Scalar::from(*min)) } - TableDataType::Timestamp => (Scalar::Timestamp(max), Scalar::Timestamp(min)), + TableDataType::Timestamp => (Scalar::Timestamp(*max), Scalar::Timestamp(*min)), TableDataType::Decimal(DecimalDataType::Decimal128(size)) => ( - Scalar::Decimal(DecimalScalar::Decimal128(i128::from(max), *size)), - Scalar::Decimal(DecimalScalar::Decimal128(i128::from(min), *size)), + Scalar::Decimal(DecimalScalar::Decimal128(i128::from(*max), *size)), + Scalar::Decimal(DecimalScalar::Decimal128(i128::from(*min), *size)), ), TableDataType::Decimal(DecimalDataType::Decimal256(size)) => ( - Scalar::Decimal(DecimalScalar::Decimal256(i256::from(max), *size)), - Scalar::Decimal(DecimalScalar::Decimal256(i256::from(min), *size)), + Scalar::Decimal(DecimalScalar::Decimal256(i256::from(*max), *size)), + Scalar::Decimal(DecimalScalar::Decimal256(i256::from(*min), *size)), ), _ => unreachable!(), }; @@ -179,10 +149,15 @@ fn convert_page_index_int64( } fn convert_page_index_int96( - index: &PageIndex, + index: &PrimitiveColumnIndex, + idx: usize, _typ: &TableDataType, ) -> Option { - match (&index.min, &index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => Some(ColumnStatistics::new( Scalar::Timestamp(min.to_micros()), Scalar::Timestamp(max.to_micros()), @@ -195,13 +170,18 @@ fn convert_page_index_int96( } fn convert_page_index_float( - index: &PageIndex, + index: &PrimitiveColumnIndex, + idx: usize, _typ: &TableDataType, ) -> Option { - match (index.min, index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => Some(ColumnStatistics::new( - Scalar::from(min), - Scalar::from(max), + Scalar::from(*min), + Scalar::from(*max), null_count as u64, 0, None, @@ -211,13 +191,18 @@ fn convert_page_index_float( } fn convert_page_index_bool( - index: &PageIndex, + index: &PrimitiveColumnIndex, + idx: usize, _typ: &TableDataType, ) -> Option { - match (index.min, index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => Some(ColumnStatistics::new( - Scalar::Boolean(min), - Scalar::Boolean(max), + Scalar::Boolean(*min), + Scalar::Boolean(*max), null_count as u64, 0, None, @@ -227,13 +212,18 @@ fn convert_page_index_bool( } fn convert_page_index_double( - index: &PageIndex, + index: &PrimitiveColumnIndex, + idx: usize, _typ: &TableDataType, ) -> Option { - match (index.min, index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => Some(ColumnStatistics::new( - Scalar::from(min), - Scalar::from(max), + Scalar::from(*min), + Scalar::from(*max), null_count as u64, 0, None, @@ -243,10 +233,15 @@ fn convert_page_index_double( } fn convert_page_index_byte_array( - index: &PageIndex, + index: &ByteArrayColumnIndex, + idx: usize, _typ: &TableDataType, ) -> Option { - match (&index.min, &index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => Some(ColumnStatistics::new( 
Scalar::String(String::from_utf8(min.as_bytes().to_vec()).ok()?), Scalar::String(String::from_utf8(max.as_bytes().to_vec()).ok()?), @@ -259,19 +254,24 @@ fn convert_page_index_byte_array( } fn convert_page_index_fixed_len_byte_array( - index: &PageIndex, + index: &ByteArrayColumnIndex, + idx: usize, typ: &TableDataType, ) -> Option { - match (&index.min, &index.max, index.null_count) { + match ( + index.min_value(idx), + index.max_value(idx), + index.null_count(idx), + ) { (Some(min), Some(max), Some(null_count)) => { let (max, min) = match typ { TableDataType::Decimal(DecimalDataType::Decimal128(size)) => ( - decode_decimal128_from_bytes(max, *size), - decode_decimal128_from_bytes(min, *size), + decode_decimal128_from_bytes(&FixedLenByteArray::from(max.to_vec()), *size), + decode_decimal128_from_bytes(&FixedLenByteArray::from(min.to_vec()), *size), ), TableDataType::Decimal(DecimalDataType::Decimal256(size)) => ( - decode_decimal256_from_bytes(max, *size), - decode_decimal256_from_bytes(min, *size), + decode_decimal256_from_bytes(&FixedLenByteArray::from(max.to_vec()), *size), + decode_decimal256_from_bytes(&FixedLenByteArray::from(min.to_vec()), *size), ), _ => unreachable!(), }; diff --git a/src/query/storages/stage/src/append/lance_dataset/committer_processor.rs b/src/query/storages/stage/src/append/lance_dataset/committer_processor.rs index 122d57c3533c3..c842ec80a4d85 100644 --- a/src/query/storages/stage/src/append/lance_dataset/committer_processor.rs +++ b/src/query/storages/stage/src/append/lance_dataset/committer_processor.rs @@ -37,6 +37,7 @@ use lance_io::object_store::ObjectStoreParams; use lance_io::object_store::ObjectStoreRegistry; use lance_io::object_writer::ObjectWriter; use lance_io::traits::WriteExt; +use lance_io::traits::Writer; use lance_table::format::DataStorageFormat; use lance_table::format::Fragment; use lance_table::format::Manifest; diff --git a/src/query/storages/stage/src/append/lance_dataset/writer_processor.rs b/src/query/storages/stage/src/append/lance_dataset/writer_processor.rs index f5d1f49261e75..7ad5c414cf689 100644 --- a/src/query/storages/stage/src/append/lance_dataset/writer_processor.rs +++ b/src/query/storages/stage/src/append/lance_dataset/writer_processor.rs @@ -97,7 +97,11 @@ pub(crate) fn lance_compatible_arrow_schema(schema: &ArrowSchema) -> ArrowSchema .iter() .map(|(i, field)| (i, visit_field(field))) .unzip(); - ArrowDataType::Union(arrow_schema::UnionFields::new(ids, fields), *mode) + ArrowDataType::Union( + arrow_schema::UnionFields::try_new(ids, fields) + .expect("existing union fields should remain valid"), + *mode, + ) } ArrowDataType::Dictionary(key, value) => { ArrowDataType::Dictionary(Box::new(visit_type(key)), Box::new(visit_type(value))) @@ -322,6 +326,7 @@ impl FragmentWriterParams { data_accessor: Operator, dataset_path: &str, ) -> Result { + let scheme = data_accessor.info().scheme().to_string(); let object_store: Arc = Arc::new(OpendalStore::new(data_accessor)); let mut root = PathBuf::from("/"); @@ -329,9 +334,12 @@ impl FragmentWriterParams { if !normalized.is_empty() { root.push(normalized); } - let base_url = Url::from_directory_path(root).map_err(|_| { - ErrorCode::Internal("invalid base url for lance object store".to_string()) + let mut base_url = Url::parse(&format!("{scheme}:///")).map_err(|err| { + ErrorCode::Internal(format!( + "invalid lance object store scheme '{scheme}': {err}" + )) })?; + base_url.set_path(root.to_string_lossy().as_ref()); #[allow(deprecated)] let store_params = ObjectStoreParams { @@ -692,6 
+700,11 @@ mod tests { use arrow_array::StringViewArray; use arrow_schema::Field; + use lance_io::object_store::ObjectStore as LanceObjectStore; + use lance_io::object_store::ObjectStoreRegistry; + use object_store::path::Path; + use opendal::Operator; + use opendal::services::Memory; use super::*; @@ -745,4 +758,28 @@ mod tests { Ok(()) } + + #[test] + fn test_build_store_params_preserves_accessor_scheme() { + let accessor = Operator::new(Memory::default()).unwrap().finish(); + let params = FragmentWriterParams::build_store_params(accessor, "tmp").unwrap(); + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let (store, base_path) = runtime + .block_on(async { + LanceObjectStore::from_uri_and_params( + Arc::new(ObjectStoreRegistry::default()), + "tmp", + ¶ms, + ) + .await + }) + .unwrap(); + + assert!(!store.is_local()); + assert_eq!(base_path, Path::parse("/tmp").unwrap()); + } } diff --git a/src/query/storages/stage/src/append/parquet_file/writer_processor.rs b/src/query/storages/stage/src/append/parquet_file/writer_processor.rs index 6dc2569cf835b..4942a473357d8 100644 --- a/src/query/storages/stage/src/append/parquet_file/writer_processor.rs +++ b/src/query/storages/stage/src/append/parquet_file/writer_processor.rs @@ -93,7 +93,7 @@ fn create_writer( .set_writer_version(WriterVersion::PARQUET_2_0) .set_compression(compression) .set_created_by(create_by) - .set_max_row_group_size(MAX_ROW_GROUP_SIZE) + .set_max_row_group_row_count(Some(MAX_ROW_GROUP_SIZE)) .set_statistics_enabled(EnabledStatistics::Chunk) .set_dictionary_enabled(true) .set_bloom_filter_enabled(false); diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0027_func_fuse_encoding.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0027_func_fuse_encoding.test index e8d386eae7a94..94083a3e4ba85 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0027_func_fuse_encoding.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0027_func_fuse_encoding.test @@ -81,12 +81,12 @@ query TTTTIIITT select * exclude(block_location) from fuse_encoding('db_09_0027', 't_parquet') order by column_name; ---- t_parquet Parquet c1 INT NULL (INT32) NULL 57 48 plain,rle,rle_dictionary zstd -t_parquet Parquet c2 INT NULL (INT32) NULL 202 429 plain,rle zstd +t_parquet Parquet c2 INT NULL (INT32) NULL 203 429 plain,rle zstd query TTTTIIITT select * exclude(block_location) from fuse_encoding('db_09_0027', 't_parquet', 'c2'); ---- -t_parquet Parquet c2 INT NULL (INT32) NULL 202 429 plain,rle zstd +t_parquet Parquet c2 INT NULL (INT32) NULL 203 429 plain,rle zstd statement ok DROP DATABASE db_09_0027 diff --git a/tests/sqllogictests/suites/mode/standalone/explain/fold_agg.test b/tests/sqllogictests/suites/mode/standalone/explain/fold_agg.test index 9877ef014e023..6ef5027ce029c 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/fold_agg.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/fold_agg.test @@ -159,7 +159,7 @@ EvalScalar ├── scan id: 0 ├── output columns: [number (#0), money (#2)] ├── read rows: 3000 - ├── read size: 10.51 KiB + ├── read size: ├── partitions total: 2 ├── partitions scanned: 2 ├── pruning stats: [segments: >, blocks: >] @@ -205,7 +205,7 @@ EvalScalar ├── scan id: 0 ├── output columns: [number (#0), money (#2)] ├── read rows: 3000 - ├── read size: 10.51 KiB + ├── read size: ├── partitions total: 2 ├── partitions scanned: 2 ├── pruning stats: [segments: >, blocks: >] diff --git 
a/tests/sqllogictests/suites/mode/standalone/explain/runtime_filter_inlist_bloom_prune.test b/tests/sqllogictests/suites/mode/standalone/explain/runtime_filter_inlist_bloom_prune.test index e0e58a0ff13a7..8254373b939c9 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/runtime_filter_inlist_bloom_prune.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/runtime_filter_inlist_bloom_prune.test @@ -80,7 +80,7 @@ HashJoin ├── scan id: 0 ├── output columns: [number (#0)] ├── read rows: 100000 - ├── read size: 167.25 KiB + ├── read size: 180.86 KiB ├── partitions total: 99 ├── partitions scanned: 99 ├── pruning stats: [segments: >, blocks: >] @@ -149,7 +149,7 @@ HashJoin ├── scan id: 0 ├── output columns: [number (#0)] ├── read rows: 100000 - ├── read size: 167.25 KiB + ├── read size: 180.86 KiB ├── partitions total: 99 ├── partitions scanned: 99 ├── pruning stats: [segments: >, blocks: >] diff --git a/tests/sqllogictests/suites/mode/standalone/explain/union.test b/tests/sqllogictests/suites/mode/standalone/explain/union.test index ce093e663b770..62b3d2b457d11 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/union.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/union.test @@ -277,7 +277,7 @@ UnionAll ├── scan id: 1 ├── output columns: [b (#1)] ├── read rows: 10000 - ├── read size: 10.59 KiB + ├── read size: 10.60 KiB ├── partitions total: 1 ├── partitions scanned: 1 ├── pruning stats: [segments: >, blocks: >] @@ -296,7 +296,7 @@ UnionAll │ ├── scan id: 0 │ ├── output columns: [a (#0)] │ ├── read rows: 10000 -│ ├── read size: 10.59 KiB +│ ├── read size: 10.60 KiB │ ├── partitions total: 1 │ ├── partitions scanned: 1 │ ├── pruning stats: [segments: >, blocks: >]
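Note on the row-group metadata serde rewrite: the hand-rolled thrift encoding in row_group_serde.rs is replaced by wrapping a single RowGroupMetaData in a synthetic one-row-group ParquetMetaData and delegating to ParquetMetaDataWriter / ParquetMetaDataReader. The sketch below mirrors that change under the parquet APIs imported in this diff; the helper names are illustrative and error handling uses parquet's own error type instead of Databend's ErrorCode, so treat it as a shape reference rather than the exact production code.

use parquet::errors::{ParquetError, Result};
use parquet::file::metadata::{
    FileMetaData, ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter, RowGroupMetaData,
};

fn row_group_to_bytes(row_group: &RowGroupMetaData) -> Result<Vec<u8>> {
    // Wrap the row group in a minimal file footer: version, created_by, key-value
    // metadata and column orders are not needed for the round trip.
    let file_meta = FileMetaData::new(
        0,
        row_group.num_rows(),
        None,
        None,
        row_group.schema_descr_ptr(),
        None,
    );
    let meta = ParquetMetaData::new(file_meta, vec![row_group.clone()]);
    let mut buffer = Vec::new();
    // The writer borrows the buffer, so it can be returned after finish().
    ParquetMetaDataWriter::new(&mut buffer, &meta).finish()?;
    Ok(buffer)
}

fn row_group_from_bytes(bytes: &[u8]) -> Result<RowGroupMetaData> {
    let meta = ParquetMetaDataReader::decode_metadata(bytes)?;
    meta.row_groups()
        .first()
        .cloned()
        .ok_or_else(|| ParquetError::General("serialized row group metadata is empty".into()))
}

The same FileMetaData wrapping is what RowGroupCore::new now precomputes so it can satisfy the new row_groups() and metadata() methods of the RowGroups trait.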
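Note on statistics/page.rs: the old per-page Vec of PageIndex values is gone; the new ColumnIndexMetaData exposes num_pages() plus indexed accessors (min_value, max_value, null_count). A minimal sketch of that access pattern follows, assuming the accessor shapes shown in the diff (min/max returned by reference, null_count castable to u64); int64_page_stats is a hypothetical helper, not part of the patch.

use parquet::file::page_index::column_index::ColumnIndexMetaData;

/// Collect (min, max, null_count) for each page of an INT64 column index;
/// pages without statistics yield None.
fn int64_page_stats(index: &ColumnIndexMetaData) -> Vec<Option<(i64, i64, u64)>> {
    match index {
        ColumnIndexMetaData::INT64(idx) => (0..idx.num_pages() as usize)
            .map(|page| {
                match (idx.min_value(page), idx.max_value(page), idx.null_count(page)) {
                    (Some(min), Some(max), Some(nulls)) => Some((*min, *max, nulls as u64)),
                    _ => None,
                }
            })
            .collect(),
        // INT32, FLOAT, DOUBLE, BYTE_ARRAY, ... follow the same shape with their own
        // PrimitiveColumnIndex / ByteArrayColumnIndex element types.
        _ => Vec::new(),
    }
}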
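Note on the Lance writer change: build_store_params now derives the base URL from the operator's scheme instead of Url::from_directory_path, which always produced a file:// URL and made remote backends look local. A small sketch of that construction using the url crate; the "memory" scheme and "/tmp" root are illustrative values echoing the new unit test, not fixed behaviour.

use url::Url;

/// Build a base URL that keeps the operator's scheme (e.g. "s3", "memory") so the
/// Lance object-store registry does not treat every path as a local file.
fn lance_base_url(scheme: &str, root: &str) -> Result<Url, url::ParseError> {
    let mut base = Url::parse(&format!("{scheme}:///"))?;
    base.set_path(root);
    Ok(base)
}

fn main() {
    let url = lance_base_url("memory", "/tmp").expect("valid scheme");
    // Expected serialization under the WHATWG URL rules used by the url crate.
    assert_eq!(url.as_str(), "memory:///tmp");
}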