DF52 migration

comphead · comphead · commit f7aad61135a3 · 2026-02-09T09:22:00.000-08:00
diff --git a/native/core/src/execution/operators/scan.rs b/native/core/src/execution/operators/scan.rs
@@ -94,6 +94,7 @@ impl ScanExec {
 
         // Build schema directly from data types since get_next now always unpacks dictionaries
         let schema = schema_from_data_types(&data_types);
+        dbg!(&schema);
 
         let cache = PlanProperties::new(
             EquivalenceProperties::new(Arc::clone(&schema)),
@@ -209,6 +210,8 @@ impl ScanExec {
 
             let array = make_array(array_data);
 
+            dbg!(&array, &selection_indices_arrays);
+
             // Apply selection if selection vectors exist (applies to all columns)
             let array = if let Some(ref selection_arrays) = selection_indices_arrays {
                 let indices = &selection_arrays[i];
@@ -487,7 +490,7 @@ impl ScanStream<'_> {
     ) -> DataFusionResult<RecordBatch, DataFusionError> {
         let schema_fields = self.schema.fields();
         assert_eq!(columns.len(), schema_fields.len());
-
+        dbg!(&columns, &self.schema);
         // Cast dictionary-encoded primitive arrays to regular arrays and cast
         // Utf8/LargeUtf8/Binary arrays to dictionary-encoded if the schema is
         // defined as dictionary-encoded and the data in this batch is not
@@ -507,6 +510,7 @@ impl ScanStream<'_> {
             })
             .collect::<Result<Vec<_>, _>>()?;
         let options = RecordBatchOptions::new().with_row_count(Some(num_rows));
+        dbg!(&new_columns, &self.schema);
         RecordBatch::try_new_with_options(Arc::clone(&self.schema), new_columns, &options)
             .map_err(|e| arrow_datafusion_err!(e))
     }
@@ -517,6 +521,7 @@ impl Stream for ScanStream<'_> {
 
     fn poll_next(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<Option<Self::Item>> {
         let mut timer = self.baseline_metrics.elapsed_compute().timer();
+        dbg!(&self.scan);
         let mut scan_batch = self.scan.batch.try_lock().unwrap();
 
         let input_batch = &*scan_batch;
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -667,65 +667,31 @@ impl PhysicalPlanner {
     ) -> Result<Arc<dyn PhysicalExpr>, ExecutionError> {
         let left = self.create_expr(left, Arc::clone(&input_schema))?;
         let right = self.create_expr(right, Arc::clone(&input_schema))?;
-        let left_type = left.data_type(&input_schema);
-        let right_type = right.data_type(&input_schema);
-        match (&op, &left_type, &right_type) {
-            // Handle date arithmetic with Int8/Int16/Int32 by:
-            // 1. Casting Date32 to Int32 (days since epoch)
-            // 2. Performing the arithmetic as Int32 +/- Int32
-            // 3. Casting the result back to Date32 using DataFusion's CastExpr
-            // Arrow's date arithmetic kernel only supports Date32 +/- Interval types
-            // Note: We use DataFusion's CastExpr for the final cast because Spark's Cast
-            // doesn't support Int32 -> Date32 conversion
-            (
-                DataFusionOperator::Plus | DataFusionOperator::Minus,
-                Ok(DataType::Date32),
-                Ok(DataType::Int8) | Ok(DataType::Int16) | Ok(DataType::Int32),
-            ) => {
-                // Cast Date32 to Int32 (days since epoch)
-                let left_as_int = Arc::new(Cast::new(
-                    left,
-                    DataType::Int32,
-                    SparkCastOptions::new_without_timezone(EvalMode::Legacy, false),
-                ));
-                // Cast Int8/Int16 to Int32 if needed
-                let right_as_int: Arc<dyn PhysicalExpr> =
-                    if matches!(right_type, Ok(DataType::Int32)) {
-                        right
-                    } else {
-                        Arc::new(Cast::new(
-                            right,
-                            DataType::Int32,
-                            SparkCastOptions::new_without_timezone(EvalMode::Legacy, false),
-                        ))
-                    };
-                // Perform the arithmetic as Int32 +/- Int32
-                let result_int = Arc::new(BinaryExpr::new(left_as_int, op, right_as_int));
-                // Cast the result back to Date32 using DataFusion's CastExpr
-                // (Spark's Cast doesn't support Int32 -> Date32)
-                Ok(Arc::new(CastExpr::new(result_int, DataType::Date32, None)))
-            }
+        match (
+            &op,
+            left.data_type(&input_schema),
+            right.data_type(&input_schema),
+        ) {
             (
                 DataFusionOperator::Plus | DataFusionOperator::Minus | DataFusionOperator::Multiply,
                 Ok(DataType::Decimal128(p1, s1)),
                 Ok(DataType::Decimal128(p2, s2)),
             ) if ((op == DataFusionOperator::Plus || op == DataFusionOperator::Minus)
-                && max(*s1, *s2) as u8 + max(*p1 - *s1 as u8, *p2 - *s2 as u8)
+                && max(s1, s2) as u8 + max(p1 - s1 as u8, p2 - s2 as u8)
                     >= DECIMAL128_MAX_PRECISION)
-                || (op == DataFusionOperator::Multiply
-                    && *p1 + *p2 >= DECIMAL128_MAX_PRECISION) =>
+                || (op == DataFusionOperator::Multiply && p1 + p2 >= DECIMAL128_MAX_PRECISION) =>
             {
                 let data_type = return_type.map(to_arrow_datatype).unwrap();
                 // For some Decimal128 operations, we need wider internal digits.
                 // Cast left and right to Decimal256 and cast the result back to Decimal128
                 let left = Arc::new(Cast::new(
                     left,
-                    DataType::Decimal256(*p1, *s1),
+                    DataType::Decimal256(p1, s1),
                     SparkCastOptions::new_without_timezone(EvalMode::Legacy, false),
                 ));
                 let right = Arc::new(Cast::new(
                     right,
-                    DataType::Decimal256(*p2, *s2),
+                    DataType::Decimal256(p2, s2),
                     SparkCastOptions::new_without_timezone(EvalMode::Legacy, false),
                 ));
                 let child = Arc::new(BinaryExpr::new(left, op, right));
@@ -999,6 +965,7 @@ impl PhysicalPlanner {
                 ))
             }
             OpStruct::NativeScan(scan) => {
+                dbg!(&scan);
                 let data_schema = convert_spark_types_to_arrow_schema(scan.data_schema.as_slice());
                 let required_schema: SchemaRef =
                     convert_spark_types_to_arrow_schema(scan.required_schema.as_slice());
@@ -1146,6 +1113,7 @@ impl PhysicalPlanner {
                 ))
             }
             OpStruct::Scan(scan) => {
+                dbg!(&scan);
                 let data_types = scan.fields.iter().map(to_arrow_datatype).collect_vec();
 
                 // If it is not test execution context for unit test, we should have at least one
@@ -1172,6 +1140,8 @@ impl PhysicalPlanner {
                     scan.arrow_ffi_safe,
                 )?;
 
+                dbg!(&scan);
+
                 Ok((
                     vec![scan.clone()],
                     Arc::new(SparkPlan::new(spark_plan.plan_id, Arc::new(scan), vec![])),
@@ -4411,12 +4381,10 @@ mod tests {
     fn test_date_sub_with_int8_cast_error() {
         use arrow::array::Date32Array;
 
-        let session_ctx = SessionContext::new();
-        let task_ctx = session_ctx.task_ctx();
-        let planner = PhysicalPlanner::new(Arc::from(session_ctx), 0);
+        let planner = PhysicalPlanner::default();
+        let row_count = 3;
 
-        // Create a scan operator with Date32 (DATE) and Int8 (TINYINT) columns
-        // This simulates the schema from the Scala test where _20 is DATE and _2 is TINYINT
+        // Create a Scan operator with Date32 (DATE) and Int8 (TINYINT) columns
         let op_scan = Operator {
             plan_id: 0,
             children: vec![],
@@ -4431,7 +4399,7 @@ mod tests {
                         type_info: None,
                     },
                 ],
-                source: "test".to_string(),
+                source: "".to_string(),
                 arrow_ffi_safe: false,
             })),
         };
@@ -4486,22 +4454,27 @@ mod tests {
         let (mut scans, datafusion_plan) =
             planner.create_plan(&projection, &mut vec![], 1).unwrap();
 
-        // Execute the plan with test data
+        // Create test data: Date32 and Int8 columns
+        let date_array = Date32Array::from(vec![Some(19000), Some(19001), Some(19002)]);
+        let int8_array = Int8Array::from(vec![Some(1i8), Some(2i8), Some(3i8)]);
+
+        // Set input batch for the scan
+        let input_batch = InputBatch::Batch(vec![Arc::new(date_array), Arc::new(int8_array)], row_count);
+        scans[0].set_input_batch(input_batch);
+
+        let session_ctx = SessionContext::new();
+        let task_ctx = session_ctx.task_ctx();
         let mut stream = datafusion_plan.native_plan.execute(0, task_ctx).unwrap();
 
         let runtime = tokio::runtime::Runtime::new().unwrap();
         let (tx, mut rx) = mpsc::channel(1);
 
-        // Send test data: Date32 values and Int8 values
+        // Spawn a task that feeds a second data batch and then the EOF signal
         runtime.spawn(async move {
-            // Create Date32 array (days since epoch)
-            // 19000 days = approximately 2022-01-01
+            // Create test data again for the second batch
             let date_array = Date32Array::from(vec![Some(19000), Some(19001), Some(19002)]);
-            // Create Int8 array
             let int8_array = Int8Array::from(vec![Some(1i8), Some(2i8), Some(3i8)]);
-
-            let input_batch1 =
-                InputBatch::Batch(vec![Arc::new(date_array), Arc::new(int8_array)], 3);
+            let input_batch1 = InputBatch::Batch(vec![Arc::new(date_array), Arc::new(int8_array)], row_count);
             let input_batch2 = InputBatch::EOF;
 
             let batches = vec![input_batch1, input_batch2];
@@ -4511,7 +4484,6 @@ mod tests {
             }
         });
 
-        // Execute and expect success - the Int8 should be cast to Int32 for date arithmetic
         runtime.block_on(async move {
             loop {
                 let batch = rx.recv().await.unwrap();
@@ -4524,10 +4496,13 @@ mod tests {
                             "Expected success for date - int8 operation but got error: {:?}",
                             result.unwrap_err()
                         );
+
                         let batch = result.unwrap();
-                        assert_eq!(batch.num_rows(), 3);
+                        assert_eq!(batch.num_rows(), row_count);
+
                         // The result should be Date32 type
                         assert_eq!(batch.column(0).data_type(), &DataType::Date32);
+
                         // Verify the values: 19000-1=18999, 19001-2=18999, 19002-3=18999
                         let date_array = batch
                             .column(0)
@@ -4537,7 +4512,6 @@ mod tests {
                         assert_eq!(date_array.value(0), 18999); // 19000 - 1
                         assert_eq!(date_array.value(1), 18999); // 19001 - 2
                         assert_eq!(date_array.value(2), 18999); // 19002 - 3
-                        break;
                     }
                     Poll::Ready(None) => {
                         break;
diff --git a/native/core/src/parquet/mod.rs b/native/core/src/parquet/mod.rs
@@ -703,6 +703,7 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
     key_unwrapper_obj: JObject,
     metrics_node: JObject,
 ) -> jlong {
+    dbg!("Java_org_apache_comet_parquet_Native_initRecordBatchReader");
     try_unwrap_or_throw(&e, |mut env| unsafe {
         JVMClasses::init(&mut env);
         let session_config = SessionConfig::new().with_batch_size(batch_size as usize);
@@ -776,6 +777,8 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
             encryption_enabled,
         )?;
 
+        dbg!(&scan);
+
         let partition_index: usize = 0;
         let batch_stream = Some(scan.execute(partition_index, session_ctx.task_ctx())?);
 
@@ -787,6 +790,9 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
             reader_state: ParquetReaderState::Init,
         };
         let res = Box::new(ctx);
+
+        dbg!("end Java_org_apache_comet_parquet_Native_initRecordBatchReader");
+
         Ok(Box::into_raw(res) as i64)
     })
 }
diff --git a/native/core/src/parquet/parquet_exec.rs b/native/core/src/parquet/parquet_exec.rs
@@ -96,6 +96,8 @@ pub(crate) fn init_datasource_exec(
         TableSchema::from_file_schema(Arc::clone(&required_schema))
     };
 
+    dbg!(&table_schema);
+
     let mut parquet_source =
         ParquetSource::new(table_schema).with_table_parquet_options(table_parquet_options);
 
@@ -135,12 +137,11 @@ pub(crate) fn init_datasource_exec(
         .collect();
 
     let mut file_scan_config_builder =
-        FileScanConfigBuilder::new(object_store_url, file_source).with_file_groups(file_groups);
+        FileScanConfigBuilder::new(object_store_url, file_source).with_file_groups(file_groups).with_expr_adapter(Some(expr_adapter_factory));
 
     if let Some(projection_vector) = projection_vector {
         file_scan_config_builder = file_scan_config_builder
-            .with_projection_indices(Some(projection_vector))?
-            .with_expr_adapter(Some(expr_adapter_factory));
+            .with_projection_indices(Some(projection_vector))?;
     }
 
     let file_scan_config = file_scan_config_builder.build();
diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs
@@ -25,7 +25,7 @@
 
 use crate::parquet::parquet_support::{spark_parquet_convert, SparkParquetOptions};
 use arrow::array::{ArrayRef, RecordBatch, RecordBatchOptions};
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::datatypes::{Field, Schema, SchemaRef};
 use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion::common::{ColumnStatistics, Result as DataFusionResult};
 use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper};
@@ -102,7 +102,6 @@ impl PhysicalExprAdapterFactory for SparkPhysicalExprAdapterFactory {
 struct SparkPhysicalExprAdapter {
     /// The logical schema expected by the query
     logical_file_schema: SchemaRef,
-    #[allow(dead_code)]
     /// The physical schema of the actual file being read
     physical_file_schema: SchemaRef,
     /// Spark-specific options for type conversions
@@ -161,44 +160,56 @@ impl SparkPhysicalExprAdapter {
         Ok(Transformed::no(expr))
     }
 
-    // Cast expressions that currently not supported in DF
-    // For example, Arrow's date arithmetic kernel only supports Date32 +/- Int32 (days)
-    // but Spark may send Int8/Int16 values. We need to cast them to Int32.
+    /// Cast Column expressions where the physical and logical datatypes differ.
+    ///
+    /// This function traverses the expression tree and for each Column expression,
+    /// checks if the physical file schema datatype differs from the logical file schema
+    /// datatype. If they differ, it wraps the Column with a CastColumnExpr to perform
+    /// the necessary type conversion.
     fn cast_datafusion_unsupported_expr(
         &self,
         expr: Arc<dyn PhysicalExpr>,
     ) -> DataFusionResult<Arc<dyn PhysicalExpr>> {
-        use datafusion::logical_expr::Operator;
-        use datafusion::physical_expr::expressions::{BinaryExpr, CastColumnExpr};
+        use datafusion::physical_expr::expressions::CastColumnExpr;
 
         expr.transform(|e| {
-            // Check if this is a BinaryExpr with date arithmetic
-            if let Some(binary) = e.as_any().downcast_ref::<BinaryExpr>() {
-                let op = binary.op();
-                // Only handle Plus and Minus for date arithmetic
-                if matches!(op, &Operator::Plus | &Operator::Minus) {
-                    let left = binary.left();
-                    let right = binary.right();
-
-                    let left_type = left.data_type(&self.logical_file_schema);
-                    let right_type = right.data_type(&self.logical_file_schema);
-
-                    // Check for Date32 +/- Int8 or Date32 +/- Int16
-                    if let (Ok(DataType::Date32), Ok(ref rt @ (DataType::Int8 | DataType::Int16))) =
-                        (&left_type, &right_type)
-                    {
-                        // Cast the right operand (Int8/Int16) to Int32
-                        let input_field = Arc::new(Field::new("input", rt.clone(), true));
-                        let target_field = Arc::new(Field::new("cast", DataType::Int32, true));
-                        let casted_right: Arc<dyn PhysicalExpr> = Arc::new(CastColumnExpr::new(
-                            Arc::clone(right),
+            // Check if this is a Column expression
+            if let Some(column) = e.as_any().downcast_ref::<Column>() {
+                let col_idx = column.index();
+
+                // Get the logical datatype (expected by the query)
+                let logical_field = self.logical_file_schema.fields().get(col_idx);
+                // Get the physical datatype (actual file schema)
+                let physical_field = self.physical_file_schema.fields().get(col_idx);
+
+                dbg!(&logical_field, &physical_field);
+
+                if let (Some(logical_field), Some(physical_field)) = (logical_field, physical_field)
+                {
+                    let logical_type = logical_field.data_type();
+                    let physical_type = physical_field.data_type();
+
+                    // NOTE(review): `|| 1==1` forces the cast even when types match - leftover?
+                    if logical_type != physical_type || 1==1 {
+                        let input_field = Arc::new(Field::new(
+                            physical_field.name(),
+                            physical_type.clone(),
+                            physical_field.is_nullable(),
+                        ));
+                        let target_field = Arc::new(Field::new(
+                            logical_field.name(),
+                            logical_type.clone(),
+                            logical_field.is_nullable(),
+                        ));
+
+                        let cast_expr: Arc<dyn PhysicalExpr> = Arc::new(CastColumnExpr::new(
+                            e.clone(),
                             input_field,
                             target_field,
                             None,
                         ));
-                        let new_binary: Arc<dyn PhysicalExpr> =
-                            Arc::new(BinaryExpr::new(Arc::clone(left), *op, casted_right));
-                        return Ok(Transformed::yes(new_binary));
+                        dbg!(&cast_expr);
+                        return Ok(Transformed::yes(cast_expr));
                     }
                 }
             }
@@ -459,7 +470,6 @@ impl SchemaMapper for SchemaMapping {
     /// columns, so if one needs a RecordBatch with a schema that references columns which are not
     /// in the projected, it would be better to use `map_partial_batch`
     fn map_batch(&self, batch: RecordBatch) -> datafusion::common::Result<RecordBatch> {
-        dbg!("map_batch");
         let batch_rows = batch.num_rows();
         let batch_cols = batch.columns().to_vec();
 
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala