1515// specific language governing permissions and limitations
1616// under the License.
1717use arrow:: {
18- array:: { ArrayRef , TimestampMicrosecondArray , TimestampMillisecondArray } ,
18+ array:: { make_array , ArrayRef , TimestampMicrosecondArray , TimestampMillisecondArray } ,
1919 compute:: CastOptions ,
2020 datatypes:: { DataType , FieldRef , Schema , TimeUnit } ,
2121 record_batch:: RecordBatch ,
2222} ;
2323
24+ use crate :: parquet:: parquet_support:: { spark_parquet_convert, SparkParquetOptions } ;
2425use datafusion:: common:: format:: DEFAULT_CAST_OPTIONS ;
2526use datafusion:: common:: Result as DataFusionResult ;
2627use datafusion:: common:: ScalarValue ;
@@ -33,6 +34,59 @@ use std::{
3334 sync:: Arc ,
3435} ;
3536
37+ /// Returns true if two DataTypes are structurally equivalent (same data layout)
38+ /// but may differ in field names within nested types.
39+ fn types_differ_only_in_field_names ( physical : & DataType , logical : & DataType ) -> bool {
40+ match ( physical, logical) {
41+ ( DataType :: List ( pf) , DataType :: List ( lf) ) => {
42+ pf. is_nullable ( ) == lf. is_nullable ( )
43+ && ( pf. data_type ( ) == lf. data_type ( )
44+ || types_differ_only_in_field_names ( pf. data_type ( ) , lf. data_type ( ) ) )
45+ }
46+ ( DataType :: LargeList ( pf) , DataType :: LargeList ( lf) ) => {
47+ pf. is_nullable ( ) == lf. is_nullable ( )
48+ && ( pf. data_type ( ) == lf. data_type ( )
49+ || types_differ_only_in_field_names ( pf. data_type ( ) , lf. data_type ( ) ) )
50+ }
51+ ( DataType :: Map ( pf, p_sorted) , DataType :: Map ( lf, l_sorted) ) => {
52+ p_sorted == l_sorted
53+ && pf. is_nullable ( ) == lf. is_nullable ( )
54+ && ( pf. data_type ( ) == lf. data_type ( )
55+ || types_differ_only_in_field_names ( pf. data_type ( ) , lf. data_type ( ) ) )
56+ }
57+ ( DataType :: Struct ( pfields) , DataType :: Struct ( lfields) ) => {
58+ // For Struct types, field names are semantically meaningful (they
59+ // identify different columns), so we require name equality here.
60+ // This distinguishes from List/Map wrapper field names ("item" vs
61+ // "element") which are purely cosmetic.
62+ pfields. len ( ) == lfields. len ( )
63+ && pfields. iter ( ) . zip ( lfields. iter ( ) ) . all ( |( pf, lf) | {
64+ pf. name ( ) == lf. name ( )
65+ && pf. is_nullable ( ) == lf. is_nullable ( )
66+ && ( pf. data_type ( ) == lf. data_type ( )
67+ || types_differ_only_in_field_names ( pf. data_type ( ) , lf. data_type ( ) ) )
68+ } )
69+ }
70+ _ => false ,
71+ }
72+ }
73+
74+ /// Recursively relabel an array so its DataType matches `target_type`.
75+ /// This only changes metadata (field names, nullability flags in nested fields);
76+ /// it does NOT change the underlying buffer data.
77+ fn relabel_array ( array : ArrayRef , target_type : & DataType ) -> ArrayRef {
78+ if array. data_type ( ) == target_type {
79+ return array;
80+ }
81+ let data = array. to_data ( ) ;
82+ let new_data = data
83+ . into_builder ( )
84+ . data_type ( target_type. clone ( ) )
85+ . build ( )
86+ . expect ( "relabel_array: data layout must be compatible" ) ;
87+ make_array ( new_data)
88+ }
89+
3690/// Casts a Timestamp(Microsecond) array to Timestamp(Millisecond) by dividing values by 1000.
3791/// Preserves the timezone from the target type.
3892fn cast_timestamp_micros_to_millis_array (
@@ -79,6 +133,9 @@ pub struct CometCastColumnExpr {
79133 target_field : FieldRef ,
80134 /// Options forwarded to [`cast_column`].
81135 cast_options : CastOptions < ' static > ,
136+ /// Spark parquet options for complex nested type conversions.
137+ /// When present, enables `spark_parquet_convert` as a fallback.
138+ parquet_options : Option < SparkParquetOptions > ,
82139}
83140
84141// Manually derive `PartialEq`/`Hash` as `Arc<dyn PhysicalExpr>` does not
@@ -89,6 +146,7 @@ impl PartialEq for CometCastColumnExpr {
89146 && self . input_physical_field . eq ( & other. input_physical_field )
90147 && self . target_field . eq ( & other. target_field )
91148 && self . cast_options . eq ( & other. cast_options )
149+ && self . parquet_options . eq ( & other. parquet_options )
92150 }
93151}
94152
@@ -98,6 +156,7 @@ impl Hash for CometCastColumnExpr {
98156 self . input_physical_field . hash ( state) ;
99157 self . target_field . hash ( state) ;
100158 self . cast_options . hash ( state) ;
159+ self . parquet_options . hash ( state) ;
101160 }
102161}
103162
@@ -114,8 +173,15 @@ impl CometCastColumnExpr {
114173 input_physical_field : physical_field,
115174 target_field,
116175 cast_options : cast_options. unwrap_or ( DEFAULT_CAST_OPTIONS ) ,
176+ parquet_options : None ,
117177 }
118178 }
179+
180+ /// Set Spark parquet options to enable complex nested type conversions.
181+ pub fn with_parquet_options ( mut self , options : SparkParquetOptions ) -> Self {
182+ self . parquet_options = Some ( options) ;
183+ self
184+ }
119185}
120186
121187impl Display for CometCastColumnExpr {
@@ -145,18 +211,17 @@ impl PhysicalExpr for CometCastColumnExpr {
145211 fn evaluate ( & self , batch : & RecordBatch ) -> DataFusionResult < ColumnarValue > {
146212 let value = self . expr . evaluate ( batch) ?;
147213
148- if value
149- . data_type ( )
150- . equals_datatype ( self . target_field . data_type ( ) )
151- {
214+ // Use == (PartialEq) instead of equals_datatype because equals_datatype
215+ // ignores field names in nested types (Struct, List, Map). We need to detect
216+ // when field names differ (e.g., Struct("a","b") vs Struct("c","d")) so that
217+ // we can apply spark_parquet_convert for field-name-based selection.
218+ if value. data_type ( ) == * self . target_field . data_type ( ) {
152219 return Ok ( value) ;
153220 }
154221
155222 let input_physical_field = self . input_physical_field . data_type ( ) ;
156223 let target_field = self . target_field . data_type ( ) ;
157224
158- // dbg!(&input_physical_field, &target_field, &value);
159-
160225 // Handle specific type conversions with custom casts
161226 match ( input_physical_field, target_field) {
162227 // Timestamp(Microsecond) -> Timestamp(Millisecond)
@@ -174,7 +239,30 @@ impl PhysicalExpr for CometCastColumnExpr {
174239 }
175240 _ => Ok ( value) ,
176241 } ,
177- _ => Ok ( value) ,
242+ // Nested types that differ only in field names (e.g., List element named
243+ // "item" vs "element", or Map entries named "key_value" vs "entries").
244+ // Re-label the array so the DataType metadata matches the logical schema.
245+ ( physical, logical)
246+ if physical != logical && types_differ_only_in_field_names ( physical, logical) =>
247+ {
248+ match value {
249+ ColumnarValue :: Array ( array) => {
250+ let relabeled = relabel_array ( array, logical) ;
251+ Ok ( ColumnarValue :: Array ( relabeled) )
252+ }
253+ other => Ok ( other) ,
254+ }
255+ }
256+ // Fallback: use spark_parquet_convert for complex nested type conversions
257+ // (e.g., List<Struct{a,b,c}> → List<Struct{a,c}>, Map field selection, etc.)
258+ _ => {
259+ if let Some ( parquet_options) = & self . parquet_options {
260+ let converted = spark_parquet_convert ( value, target_field, parquet_options) ?;
261+ Ok ( converted)
262+ } else {
263+ Ok ( value)
264+ }
265+ }
178266 }
179267 }
180268
@@ -192,12 +280,16 @@ impl PhysicalExpr for CometCastColumnExpr {
192280 ) -> DataFusionResult < Arc < dyn PhysicalExpr > > {
193281 assert_eq ! ( children. len( ) , 1 ) ;
194282 let child = children. pop ( ) . expect ( "CastColumnExpr child" ) ;
195- Ok ( Arc :: new ( Self :: new (
283+ let mut new_expr = Self :: new (
196284 child,
197285 Arc :: clone ( & self . input_physical_field ) ,
198286 Arc :: clone ( & self . target_field ) ,
199287 Some ( self . cast_options . clone ( ) ) ,
200- ) ) )
288+ ) ;
289+ if let Some ( opts) = & self . parquet_options {
290+ new_expr = new_expr. with_parquet_options ( opts. clone ( ) ) ;
291+ }
292+ Ok ( Arc :: new ( new_expr) )
201293 }
202294
203295 fn fmt_sql ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
0 commit comments