@@ -1151,33 +1151,28 @@ impl PhysicalPlanner {
                 ))
             }
             OpStruct::IcebergScan(scan) => {
-                let required_schema: SchemaRef =
-                    convert_spark_types_to_arrow_schema(scan.required_schema.as_slice());
+                // Extract common data and single partition's file tasks
+                // Per-partition injection happens in Scala before sending to native
+                let common = scan
+                    .common
+                    .as_ref()
+                    .ok_or_else(|| GeneralError("IcebergScan missing common data".into()))?;

-                let catalog_properties: HashMap<String, String> = scan
+                let required_schema =
+                    convert_spark_types_to_arrow_schema(common.required_schema.as_slice());
+                let catalog_properties: HashMap<String, String> = common
                     .catalog_properties
                     .iter()
                     .map(|(k, v)| (k.clone(), v.clone()))
                     .collect();
-
-                let metadata_location = scan.metadata_location.clone();
-
-                debug_assert!(
-                    !scan.file_partitions.is_empty(),
-                    "IcebergScan must have at least one file partition. This indicates a bug in Scala serialization."
-                );
-
-                let tasks = parse_file_scan_tasks(
-                    scan,
-                    &scan.file_partitions[self.partition as usize].file_scan_tasks,
-                )?;
-                let file_task_groups = vec![tasks];
+                let metadata_location = common.metadata_location.clone();
+                let tasks = parse_file_scan_tasks_from_common(common, &scan.file_scan_tasks)?;

                 let iceberg_scan = IcebergScanExec::new(
                     metadata_location,
                     required_schema,
                     catalog_properties,
-                    file_task_groups,
+                    tasks,
                 )?;

                 Ok((
@@ -2762,15 +2757,14 @@ fn partition_data_to_struct(
 /// Each task contains a residual predicate that is used for row-group level filtering
 /// during Parquet scanning.
 ///
-/// This function uses deduplication pools from the IcebergScan to avoid redundant parsing
-/// of schemas, partition specs, partition types, name mappings, and other repeated data.
-fn parse_file_scan_tasks(
-    proto_scan: &spark_operator::IcebergScan,
+/// This function uses deduplication pools from the IcebergScanCommon to avoid redundant
+/// parsing of schemas, partition specs, partition types, name mappings, and other repeated data.
+fn parse_file_scan_tasks_from_common(
+    proto_common: &spark_operator::IcebergScanCommon,
     proto_tasks: &[spark_operator::IcebergFileScanTask],
 ) -> Result<Vec<iceberg::scan::FileScanTask>, ExecutionError> {
-    // Build caches upfront: for 10K tasks with 1 schema, this parses the schema
-    // once instead of 10K times, eliminating redundant JSON deserialization
-    let schema_cache: Vec<Arc<iceberg::spec::Schema>> = proto_scan
+    // Parse each unique schema once, not once per task
+    let schema_cache: Vec<Arc<iceberg::spec::Schema>> = proto_common
         .schema_pool
         .iter()
         .map(|json| {
@@ -2783,7 +2777,7 @@ fn parse_file_scan_tasks(
         })
         .collect::<Result<Vec<_>, _>>()?;

-    let partition_spec_cache: Vec<Option<Arc<iceberg::spec::PartitionSpec>>> = proto_scan
+    let partition_spec_cache: Vec<Option<Arc<iceberg::spec::PartitionSpec>>> = proto_common
         .partition_spec_pool
         .iter()
         .map(|json| {
@@ -2793,7 +2787,7 @@ fn parse_file_scan_tasks(
         })
         .collect();

-    let name_mapping_cache: Vec<Option<Arc<iceberg::spec::NameMapping>>> = proto_scan
+    let name_mapping_cache: Vec<Option<Arc<iceberg::spec::NameMapping>>> = proto_common
         .name_mapping_pool
         .iter()
         .map(|json| {
@@ -2803,7 +2797,7 @@ fn parse_file_scan_tasks(
         })
         .collect();

-    let delete_files_cache: Vec<Vec<iceberg::scan::FileScanTaskDeleteFile>> = proto_scan
+    let delete_files_cache: Vec<Vec<iceberg::scan::FileScanTaskDeleteFile>> = proto_common
         .delete_files_pool
         .iter()
         .map(|list| {
@@ -2815,7 +2809,7 @@ fn parse_file_scan_tasks(
28152809 "EQUALITY_DELETES" => iceberg:: spec:: DataContentType :: EqualityDeletes ,
28162810 other => {
28172811 return Err ( GeneralError ( format ! (
2818- "Invalid delete content type '{}'. This indicates a bug in Scala serialization. " ,
2812+ "Invalid delete content type '{}'" ,
28192813 other
28202814 ) ) )
28212815 }
@@ -2836,7 +2830,6 @@ fn parse_file_scan_tasks(
         })
         .collect::<Result<Vec<_>, _>>()?;

-    // Partition data pool is in protobuf messages
     let results: Result<Vec<_>, _> = proto_tasks
         .iter()
         .map(|proto_task| {
@@ -2870,7 +2863,7 @@ fn parse_file_scan_tasks(
             };

             let bound_predicate = if let Some(idx) = proto_task.residual_idx {
-                proto_scan
+                proto_common
                     .residual_pool
                     .get(idx as usize)
                     .and_then(convert_spark_expr_to_predicate)
@@ -2890,24 +2883,22 @@ fn parse_file_scan_tasks(
             };

             let partition = if let Some(partition_data_idx) = proto_task.partition_data_idx {
-                // Get partition data from protobuf pool
-                let partition_data_proto = proto_scan
+                let partition_data_proto = proto_common
                     .partition_data_pool
                     .get(partition_data_idx as usize)
                     .ok_or_else(|| {
                         ExecutionError::GeneralError(format!(
                             "Invalid partition_data_idx: {} (pool size: {})",
                             partition_data_idx,
-                            proto_scan.partition_data_pool.len()
+                            proto_common.partition_data_pool.len()
                         ))
                     })?;

-                // Convert protobuf PartitionData to iceberg Struct
                 match partition_data_to_struct(partition_data_proto) {
                     Ok(s) => Some(s),
                     Err(e) => {
                         return Err(ExecutionError::GeneralError(format!(
-                            "Failed to deserialize partition data from protobuf: {}",
+                            "Failed to deserialize partition data: {}",
                             e
                         )))
                     }
@@ -2926,14 +2917,14 @@ fn parse_file_scan_tasks(
                 .and_then(|idx| name_mapping_cache.get(idx as usize))
                 .and_then(|opt| opt.clone());

-            let project_field_ids = proto_scan
+            let project_field_ids = proto_common
                 .project_field_ids_pool
                 .get(proto_task.project_field_ids_idx as usize)
                 .ok_or_else(|| {
                     ExecutionError::GeneralError(format!(
                         "Invalid project_field_ids_idx: {} (pool size: {})",
                         proto_task.project_field_ids_idx,
-                        proto_scan.project_field_ids_pool.len()
+                        proto_common.project_field_ids_pool.len()
                     ))
                 })?
                 .field_ids
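For readers following the `common`/per-partition split: the native side now receives one `IcebergScanCommon` block holding table-level data and the deduplication pools, plus only the current partition's `file_scan_tasks`, because Scala injects the per-partition tasks before serializing the plan. A rough sketch of the shapes involved is below; the struct and field names are simplified stand-ins inferred from what this diff touches, not the real `spark_operator` protobuf definitions.

```rust
// Illustrative sketch only: simplified stand-ins for the protobuf messages the
// planner receives. The real definitions live in the .proto files and use
// prost-generated types; names and types here are assumptions.
use std::collections::HashMap;

struct IcebergScanCommon {
    // Table-level data shared by every partition of the scan.
    metadata_location: String,
    required_schema: Vec<String>, // stand-in for the Spark field messages
    catalog_properties: HashMap<String, String>,
    // Deduplication pools; each task refers to entries by index.
    schema_pool: Vec<String>,          // Iceberg schemas as JSON
    partition_spec_pool: Vec<String>,  // partition specs as JSON
    name_mapping_pool: Vec<String>,    // name mappings as JSON
    residual_pool: Vec<String>,        // stand-in for serialized residual expressions
    partition_data_pool: Vec<String>,  // stand-in for partition data messages
    project_field_ids_pool: Vec<Vec<i32>>,
}

struct IcebergFileScanTask {
    data_file_path: String,
    // Indices into the pools above (None where a task has no entry).
    residual_idx: Option<u32>,
    partition_data_idx: Option<u32>,
    project_field_ids_idx: u32,
}

struct IcebergScan {
    common: Option<IcebergScanCommon>,
    // Only the current partition's tasks; Scala injects these per partition
    // before handing the serialized plan to the native side.
    file_scan_tasks: Vec<IcebergFileScanTask>,
}
```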
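The payoff of the pools is that repeated JSON blobs are deserialized once and then shared by `Arc` clone across tasks, rather than parsed per task. A minimal sketch of that parse-once/index pattern, using hypothetical types rather than the real `iceberg-rust` or `spark_operator` ones:

```rust
// Minimal sketch of the pool/index deduplication pattern used by
// parse_file_scan_tasks_from_common. Types and names here are illustrative.
use std::sync::Arc;

fn resolve_schemas(
    schema_pool: &[String], // unique schema JSON blobs from the common message
    schema_indices: &[u32], // one entry per task, pointing into the pool
) -> Result<Vec<Arc<String>>, String> {
    // Parse every pooled schema exactly once, no matter how many tasks there are.
    let cache: Vec<Arc<String>> = schema_pool
        .iter()
        .map(|json| Arc::new(format!("parsed:{json}"))) // real code: JSON -> iceberg::spec::Schema
        .collect();

    // Each task then costs an Arc clone instead of another JSON parse,
    // and an out-of-range index becomes an explicit error.
    schema_indices
        .iter()
        .map(|&idx| {
            cache
                .get(idx as usize)
                .cloned()
                .ok_or_else(|| format!("Invalid schema idx: {} (pool size: {})", idx, cache.len()))
        })
        .collect()
}

fn main() {
    let pool = vec!["{\"schema-id\":0}".to_string()];
    let tasks = vec![0_u32; 10_000]; // 10K tasks sharing one unique schema
    assert_eq!(resolve_schemas(&pool, &tasks).unwrap().len(), 10_000);
}
```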