Skip to content

Commit 51f8c42

Browse files
committed
Merge branch 'main' into iceberg-split-serialization-dpp
2 parents b54c87a + aa5afd6 commit 51f8c42

15 files changed

Lines changed: 1054 additions & 925 deletions

File tree

.github/workflows/iceberg_spark_test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ jobs:
7777
# Use CI profile for faster builds (no LTO) and to share cache with pr_build_linux.yml.
7878
run: |
7979
cd native && cargo build --profile ci
80+
env:
81+
RUSTFLAGS: "-Ctarget-cpu=x86-64-v3"
8082

8183
- name: Save Cargo cache
8284
uses: actions/cache/save@v5

.github/workflows/pr_build_linux.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ jobs:
9494
# CI profile: same overflow behavior as release, but faster compilation
9595
# (no LTO, parallel codegen)
9696
cargo build --profile ci
97+
env:
98+
RUSTFLAGS: "-Ctarget-cpu=x86-64-v3"
9799

98100
- name: Upload native library
99101
uses: actions/upload-artifact@v6

.github/workflows/pr_build_macos.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ jobs:
9494
# CI profile: same overflow behavior as release, but faster compilation
9595
# (no LTO, parallel codegen)
9696
cargo build --profile ci
97+
env:
98+
RUSTFLAGS: "-Ctarget-cpu=apple-m1"
9799

98100
- name: Upload native library
99101
uses: actions/upload-artifact@v6

.github/workflows/spark_sql_test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ jobs:
8383
run: |
8484
cd native
8585
cargo build --profile ci
86+
env:
87+
RUSTFLAGS: "-Ctarget-cpu=x86-64-v3"
8688

8789
- name: Upload native library
8890
uses: actions/upload-artifact@v6

dev/regenerate-golden-files.sh

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -74,16 +74,6 @@ build_native() {
7474
cd native && cargo build && cd ..
7575
}
7676

77-
# Install Comet for a specific Spark version
78-
install_for_spark_version() {
79-
local spark_version=$1
80-
echo ""
81-
echo "=============================================="
82-
echo "[INFO] Installing Comet for Spark $spark_version"
83-
echo "=============================================="
84-
./mvnw install -DskipTests -Pspark-$spark_version
85-
}
86-
8777
# Regenerate golden files for a specific Spark version
8878
regenerate_golden_files() {
8979
local spark_version=$1
@@ -94,12 +84,12 @@ regenerate_golden_files() {
9484
echo "=============================================="
9585

9686
echo "[INFO] Running CometTPCDSV1_4_PlanStabilitySuite..."
97-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark \
87+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw \
9888
-Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" \
9989
-Pspark-$spark_version -nsu test
10090

10191
echo "[INFO] Running CometTPCDSV2_7_PlanStabilitySuite..."
102-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark \
92+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw \
10393
-Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" \
10494
-Pspark-$spark_version -nsu test
10595
}
@@ -158,9 +148,8 @@ main() {
158148
versions=("3.4" "3.5" "4.0")
159149
fi
160150

161-
# Install and regenerate for each version
151+
# Regenerate for each version
162152
for version in "${versions[@]}"; do
163-
install_for_spark_version "$version"
164153
regenerate_golden_files "$version"
165154
done
166155

docs/source/contributor-guide/development.md

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -191,52 +191,43 @@ Spark version, and runs the plan stability tests with `SPARK_GENERATE_GOLDEN_FIL
191191

192192
Alternatively, you can run the tests manually using the following commands.
193193

194-
First, Comet needs to be installed for each Spark version to be tested:
195-
196-
```sh
197-
./mvnw install -DskipTests -Pspark-3.4
198-
./mvnw install -DskipTests -Pspark-3.5
199-
# note that Spark 4.0 requires JDK 17 or later
200-
./mvnw install -DskipTests -Pspark-4.0
201-
```
202-
203194
Note that the output files get written to `$SPARK_HOME`.
204195

205196
The tests can be run with:
206197

207198
```sh
208199
export SPARK_HOME=`pwd`
209-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
210-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
211-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
200+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
201+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
202+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
212203
```
213204

214205
and
215206

216207
```sh
217208
export SPARK_HOME=`pwd`
218-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
219-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
220-
./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
209+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
210+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
211+
./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
221212
```
222213

223214
If your pull request changes the query plans generated by Comet, you should regenerate the golden files.
224215
To regenerate the golden files, you can run the following commands.
225216

226217
```sh
227218
export SPARK_HOME=`pwd`
228-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
229-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
230-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
219+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.4 -nsu test
220+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-3.5 -nsu test
221+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV1_4_PlanStabilitySuite" -Pspark-4.0 -nsu test
231222
```
232223

233224
and
234225

235226
```sh
236227
export SPARK_HOME=`pwd`
237-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
238-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
239-
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -pl spark -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
228+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.4 -nsu test
229+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-3.5 -nsu test
230+
SPARK_GENERATE_GOLDEN_FILES=1 ./mvnw -Dsuites="org.apache.spark.sql.comet.CometTPCDSV2_7_PlanStabilitySuite" -Pspark-4.0 -nsu test
240231
```
241232

242233
## Benchmark

docs/source/contributor-guide/sql-file-tests.md

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,21 @@ way to add expression and operator test coverage without writing Scala test code
2525

2626
## Running the tests
2727

28+
Run all SQL file tests:
29+
30+
```shell
31+
./mvnw test -Dsuites="org.apache.comet.CometSqlFileTestSuite" -Dtest=none
32+
```
33+
34+
Run a single test file by adding the file name (without `.sql` extension) after the suite name:
35+
2836
```shell
29-
mvn test -pl spark -Dsuites="org.apache.comet.CometSqlFileTestSuite" -Dtest=none
37+
./mvnw test -Dsuites="org.apache.comet.CometSqlFileTestSuite create_named_struct" -Dtest=none
3038
```
3139

40+
This uses ScalaTest's substring matching, so the argument must match part of the test name.
41+
Test names follow the pattern `sql-file: expressions/<category>/<file>.sql [<config>]`.
42+
3243
## Test file location
3344

3445
SQL test files live under:
@@ -208,7 +219,7 @@ SELECT space(n) FROM test_space WHERE n < 0
208219
6. Run the tests to verify:
209220

210221
```shell
211-
mvn test -pl spark -Dsuites="org.apache.comet.CometSqlFileTestSuite" -Dtest=none
222+
./mvnw test -Dsuites="org.apache.comet.CometSqlFileTestSuite" -Dtest=none
212223
```
213224

214225
### Tips for writing thorough tests

native/core/src/execution/shuffle/comet_partitioning.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,26 @@ impl CometPartitioning {
4646
}
4747
}
4848
}
49+
50+
pub(super) fn pmod(hash: u32, n: usize) -> usize {
51+
let hash = hash as i32;
52+
let n = n as i32;
53+
let r = hash % n;
54+
let result = if r < 0 { (r + n) % n } else { r };
55+
result as usize
56+
}
57+
58+
#[cfg(test)]
mod tests {
    use super::*;

    /// Hash inputs and expected partition ids were captured from Spark
    /// with 200 partitions; `pmod` must reproduce them exactly.
    #[test]
    fn test_pmod() {
        let hashes: [u32; 5] = [0x99f0149d, 0x9c67b85d, 0xc8008529, 0xa05b5d7b, 0xcd1e64fb];
        let expected: [usize; 5] = [69, 5, 193, 171, 115];
        for (&hash, &part) in hashes.iter().zip(expected.iter()) {
            assert_eq!(pmod(hash, 200), part);
        }
    }
}

native/core/src/execution/shuffle/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
pub(crate) mod codec;
1919
mod comet_partitioning;
2020
mod metrics;
21+
mod partitioners;
2122
mod shuffle_writer;
2223
pub mod spark_unsafe;
2324
mod writers;
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
mod multi_partition;
19+
mod partitioned_batch_iterator;
20+
mod single_partition;
21+
22+
use arrow::record_batch::RecordBatch;
23+
use datafusion::common::Result;
24+
25+
pub(super) use multi_partition::MultiPartitionShuffleRepartitioner;
26+
pub(super) use partitioned_batch_iterator::PartitionedBatchIterator;
27+
pub(super) use single_partition::SinglePartitionShufflePartitioner;
28+
29+
/// Common contract for shuffle partitioner implementations — see
/// `MultiPartitionShuffleRepartitioner` and `SinglePartitionShufflePartitioner`
/// re-exported above. Callers feed Arrow `RecordBatch`es in via
/// `insert_batch` and finish by calling `shuffle_write`.
#[async_trait::async_trait]
pub(super) trait ShufflePartitioner: Send + Sync {
    /// Insert a batch into the partitioner
    async fn insert_batch(&mut self, batch: RecordBatch) -> Result<()>;
    /// Write shuffle data and shuffle index file to disk
    fn shuffle_write(&mut self) -> Result<()>;
}

0 commit comments

Comments
 (0)