feat: support PartialMerge

comphead · comphead · commit 336aadd074b1 · 2026-04-20T13:38:42.000-07:00
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -1035,8 +1035,8 @@ impl PhysicalPlanner {
                                 )
                                 .schema(Arc::clone(&child_schema))
                                 .alias(format!("col_{idx}"))
-                                .with_ignore_nulls(false)
-                                .with_distinct(false)
+                                .with_ignore_nulls(expr.ignore_nulls())
+                                .with_distinct(expr.is_distinct())
                                 .build()
                                 .map_err(|e| ExecutionError::DataFusionError(e.to_string()))?;
 
diff --git a/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala b/spark/src/main/scala/org/apache/spark/sql/comet/operators.scala
@@ -1460,6 +1460,24 @@ trait CometBaseAggregate {
         }
       }
 
+      // FIRST/LAST are order-dependent aggregates whose merge result depends on
+      // hash table processing order. In PartialMerge mode, DataFusion's hash table
+      // may process rows in a different order than Spark's, producing different results.
+      val hasPartialMergeMode = modeSet.contains(PartialMerge)
+      if (hasPartialMergeMode) {
+        val unsupportedAggs = aggregateExpressions.filter { a =>
+          a.mode == PartialMerge && (a.aggregateFunction.isInstanceOf[First] ||
+            a.aggregateFunction.isInstanceOf[Last])
+        }
+        if (unsupportedAggs.nonEmpty) {
+          withInfo(
+            aggregate,
+            s"PartialMerge not supported for order-dependent aggregates: " +
+              unsupportedAggs.map(_.aggregateFunction.prettyName).mkString(", "))
+          return None
+        }
+      }
+
       // Per-expression binding: Partial expressions bind to child output,
       // PartialMerge/Final expressions do not (native planner handles their input).
       val output = child.output
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometAggregateSuite.scala
@@ -640,7 +640,8 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
                 sql("CREATE TEMP VIEW v AS SELECT _1, _2 FROM tbl ORDER BY _1")
                 checkSparkAnswerAndOperator(
                   "SELECT _2, SUM(_1), SUM(DISTINCT _1), MIN(_1), MAX(_1), COUNT(_1)," +
-                    " COUNT(DISTINCT _1), AVG(_1), FIRST(_1), LAST(_1) FROM v GROUP BY _2")
+                    " COUNT(DISTINCT _1), AVG(_1)" +
+                    " FROM v GROUP BY _2 ORDER BY _2")
               }
             }
           }
@@ -649,6 +650,33 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  // FIRST/LAST are order-dependent: their merged result depends on row processing order,
+  // which in PartialMerge mode may differ between DataFusion's hash table and Spark's.
+  // This test checks that such a query falls back to Spark with the expected reason.
+  test("partialMerge - FIRST/LAST with distinct aggregates falls back") {
+    val numValues = 10000 // number of rows written to the Parquet table
+    Seq(100).foreach { numGroups => // divisor applied to the random grouping key (_2)
+      Seq(128).foreach { batchSize => // value set for CometConf.COMET_BATCH_SIZE
+        withSQLConf(
+          SQLConf.COALESCE_PARTITIONS_ENABLED.key -> "true",
+          CometConf.COMET_BATCH_SIZE.key -> batchSize.toString) {
+          withParquetTable(
+            (0 until numValues).map(i => (i, Random.nextInt() % numGroups)),
+            "tbl",
+            false) { // NOTE(review): presumably the withDictionary flag; confirm
+            withView("v") {
+              sql("CREATE TEMP VIEW v AS SELECT _1, _2 FROM tbl ORDER BY _1")
+              checkSparkAnswerAndFallbackReason(
+                "SELECT _2, FIRST(_1), LAST(_1), COUNT(DISTINCT _1)" +
+                  " FROM v GROUP BY _2 ORDER BY _2",
+                "PartialMerge not supported for order-dependent aggregates")
+            }
+          }
+        }
+      }
+    }
+  }
+
   test("partialMerge - cnt distinct + sum") {
     withTempDir(dir => {
       withSQLConf("spark.comet.enabled" -> "false") {