|
19 | 19 |
|
20 | 20 | use arrow::{ |
21 | 21 | array::{ |
22 | | - Array, ArrayRef, AsArray, BinaryArrayType, FixedSizeBinaryArray, |
23 | | - GenericBinaryArray, GenericStringArray, OffsetSizeTrait, |
| 22 | + Array, ArrayRef, AsArray, BinaryArrayType, GenericBinaryArray, |
| 23 | + GenericStringArray, OffsetSizeTrait, |
24 | 24 | }, |
25 | 25 | datatypes::DataType, |
26 | 26 | }; |
@@ -239,7 +239,7 @@ fn encode_array(array: &ArrayRef, encoding: Encoding) -> Result<ColumnarValue> { |
239 | 239 | encoding.encode_array::<_, i64>(&array.as_binary::<i64>()) |
240 | 240 | } |
241 | 241 | DataType::FixedSizeBinary(_) => { |
242 | | - encoding.encode_fsb_array(array.as_fixed_size_binary()) |
| 242 | + encoding.encode_array::<_, i32>(&array.as_fixed_size_binary()) |
243 | 243 | } |
244 | 244 | dt => { |
245 | 245 | internal_err!("Unexpected data type for encode: {dt}") |
@@ -307,7 +307,7 @@ fn decode_array(array: &ArrayRef, encoding: Encoding) -> Result<ColumnarValue> { |
307 | 307 | let array = array.as_fixed_size_binary(); |
308 | 308 | // TODO: could we be more conservative by accounting for nulls? |
309 | 309 | let estimate = array.len().saturating_mul(*size as usize); |
310 | | - encoding.decode_fsb_array(array, estimate) |
| 310 | + encoding.decode_array::<_, i32>(&array, estimate) |
311 | 311 | } |
312 | 312 | dt => { |
313 | 313 | internal_err!("Unexpected data type for decode: {dt}") |
@@ -404,24 +404,6 @@ impl Encoding { |
404 | 404 | } |
405 | 405 | } |
406 | 406 |
|
407 | | - // TODO: refactor this away once https://github.com/apache/arrow-rs/pull/8993 lands |
408 | | - fn encode_fsb_array(self, array: &FixedSizeBinaryArray) -> Result<ArrayRef> { |
409 | | - match self { |
410 | | - Self::Base64 => { |
411 | | - let array: GenericStringArray<i32> = array |
412 | | - .iter() |
413 | | - .map(|x| x.map(|x| BASE64_ENGINE.encode(x))) |
414 | | - .collect(); |
415 | | - Ok(Arc::new(array)) |
416 | | - } |
417 | | - Self::Hex => { |
418 | | - let array: GenericStringArray<i32> = |
419 | | - array.iter().map(|x| x.map(hex::encode)).collect(); |
420 | | - Ok(Arc::new(array)) |
421 | | - } |
422 | | - } |
423 | | - } |
424 | | - |
425 | 407 | // OutputOffset important to ensure Large types output Large arrays |
426 | 408 | fn decode_array<'a, InputBinaryArray, OutputOffset>( |
427 | 409 | self, |
@@ -461,73 +443,6 @@ impl Encoding { |
461 | 443 | } |
462 | 444 | } |
463 | 445 | } |
464 | | - |
465 | | - // TODO: refactor this away once https://github.com/apache/arrow-rs/pull/8993 lands |
466 | | - fn decode_fsb_array( |
467 | | - self, |
468 | | - value: &FixedSizeBinaryArray, |
469 | | - approx_data_size: usize, |
470 | | - ) -> Result<ArrayRef> { |
471 | | - fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> { |
472 | | - // only write input / 2 bytes to buf |
473 | | - let out_len = input.len() / 2; |
474 | | - let buf = &mut buf[..out_len]; |
475 | | - hex::decode_to_slice(input, buf) |
476 | | - .map_err(|e| exec_datafusion_err!("Failed to decode from hex: {e}"))?; |
477 | | - Ok(out_len) |
478 | | - } |
479 | | - |
480 | | - fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result<usize> { |
481 | | - BASE64_ENGINE |
482 | | - .decode_slice(input, buf) |
483 | | - .map_err(|e| exec_datafusion_err!("Failed to decode from base64: {e}")) |
484 | | - } |
485 | | - |
486 | | - fn delegated_decode<DecodeFunction>( |
487 | | - decode: DecodeFunction, |
488 | | - input: &FixedSizeBinaryArray, |
489 | | - conservative_upper_bound_size: usize, |
490 | | - ) -> Result<ArrayRef> |
491 | | - where |
492 | | - DecodeFunction: Fn(&[u8], &mut [u8]) -> Result<usize>, |
493 | | - { |
494 | | - let mut values = vec![0; conservative_upper_bound_size]; |
495 | | - let mut offsets = OffsetBufferBuilder::new(input.len()); |
496 | | - let mut total_bytes_decoded = 0; |
497 | | - for v in input.iter() { |
498 | | - if let Some(v) = v { |
499 | | - let cursor = &mut values[total_bytes_decoded..]; |
500 | | - let decoded = decode(v, cursor)?; |
501 | | - total_bytes_decoded += decoded; |
502 | | - offsets.push_length(decoded); |
503 | | - } else { |
504 | | - offsets.push_length(0); |
505 | | - } |
506 | | - } |
507 | | - // We reserved an upper bound size for the values buffer, but we only use the actual size |
508 | | - values.truncate(total_bytes_decoded); |
509 | | - let binary_array = GenericBinaryArray::<i32>::try_new( |
510 | | - offsets.finish(), |
511 | | - Buffer::from_vec(values), |
512 | | - input.nulls().cloned(), |
513 | | - )?; |
514 | | - Ok(Arc::new(binary_array)) |
515 | | - } |
516 | | - |
517 | | - match self { |
518 | | - Self::Base64 => { |
519 | | - let upper_bound = base64::decoded_len_estimate(approx_data_size); |
520 | | - delegated_decode(base64_decode, value, upper_bound) |
521 | | - } |
522 | | - Self::Hex => { |
523 | | - // Calculate the upper bound for decoded byte size |
524 | | - // For hex encoding, each pair of hex characters (2 bytes) represents 1 byte when decoded |
525 | | - // So the upper bound is half the length of the input values. |
526 | | - let upper_bound = approx_data_size / 2; |
527 | | - delegated_decode(hex_decode, value, upper_bound) |
528 | | - } |
529 | | - } |
530 | | - } |
531 | 446 | } |
532 | 447 |
|
533 | 448 | fn delegated_decode<'a, DecodeFunction, InputBinaryArray, OutputOffset>( |
|
0 commit comments