diff --git a/src/bin/orc/common.rs b/src/bin/orc/common.rs index 1415230..84a1d11 100644 --- a/src/bin/orc/common.rs +++ b/src/bin/orc/common.rs @@ -86,9 +86,17 @@ pub fn format_stats(stats: &ColumnStatistics) -> String { parts.push(format!("min={min}")); parts.push(format!("max={max}")); } - TypeStatistics::String { min, max, .. } => { - parts.push(format!("min={min}")); - parts.push(format!("max={max}")); + TypeStatistics::String { + lower_bound, + upper_bound, + sum: _, + is_exact_min, + is_exact_max, + } => { + parts.push(format!("min={lower_bound}")); + parts.push(format!("max={upper_bound}")); + parts.push(format!("is_exact_min={is_exact_min}")); + parts.push(format!("is_exact_max={is_exact_max}")); } TypeStatistics::Bucket { true_count } => { parts.push(format!("true_count={true_count}")); diff --git a/src/bin/orc/stats.rs b/src/bin/orc/stats.rs index fac7de2..52602ca 100644 --- a/src/bin/orc/stats.rs +++ b/src/bin/orc/stats.rs @@ -53,11 +53,19 @@ fn print_column_stats(col_stats: &ColumnStatistics) { println!("* Sum: {sum}"); } } - orc_rust::statistics::TypeStatistics::String { min, max, sum } => { + orc_rust::statistics::TypeStatistics::String { + lower_bound, + upper_bound, + sum, + is_exact_min, + is_exact_max, + } => { println!("* Data type String"); - println!("* Minimum: {min}"); - println!("* Maximum: {max}"); + println!("* Minimum: {lower_bound}"); + println!("* Maximum: {upper_bound}"); println!("* Sum: {sum}"); + println!("* IsExactMin: {is_exact_min}"); + println!("* IsExactMax: {is_exact_max}"); } orc_rust::statistics::TypeStatistics::Bucket { true_count } => { println!("* Data type Bucket"); diff --git a/src/row_group_filter.rs b/src/row_group_filter.rs index 28b1531..a834d87 100644 --- a/src/row_group_filter.rs +++ b/src/row_group_filter.rs @@ -249,8 +249,21 @@ fn evaluate_comparison_with_stats( } // String comparisons - TypeStatistics::String { min, max, .. 
} => match value { - PredicateValue::Utf8(Some(v)) => evaluate_string_comparison(min, max, op, v), + TypeStatistics::String { + lower_bound, + upper_bound, + is_exact_min, + is_exact_max, + .. + } => match value { + PredicateValue::Utf8(Some(v)) => evaluate_string_comparison( + lower_bound, + upper_bound, + op, + v, + *is_exact_min, + *is_exact_max, + ), _ => { return Err(UnexpectedSnafu { msg: "Type mismatch: expected string value".to_string(), @@ -295,7 +308,7 @@ fn evaluate_comparison_with_stats( PredicateValue::Utf8(Some(v)) => { // For decimal, we need to compare strings // This is a simplified implementation - evaluate_string_comparison(min, max, op, v) + evaluate_string_comparison(min, max, op, v, true, true) } _ => { return Err(UnexpectedSnafu { @@ -461,31 +474,41 @@ fn evaluate_float_comparison(min: f64, max: f64, op: ComparisonOp, value: f64) - } } -fn evaluate_string_comparison(min: &str, max: &str, op: ComparisonOp, value: &str) -> bool { +fn evaluate_string_comparison( + lower_bound: &str, + upper_bound: &str, + op: ComparisonOp, + value: &str, + is_exact_min: bool, + is_exact_max: bool, +) -> bool { + // Check if the column's minimum is <= value + let min_le_value = lower_bound < value || (lower_bound == value && is_exact_min); + + // Check if the column's maximum is >= value + let max_ge_value = upper_bound > value || (upper_bound == value && is_exact_max); + match op { - ComparisonOp::Equal => { - // col = value: keep if value is within [min, max] lexicographically - min <= value && value <= max - } + // Range intersection: The value must be reachable from both sides. + ComparisonOp::Equal => min_le_value && max_ge_value, + + // One-sided inclusive checks reuse the logic above. + ComparisonOp::LessThanOrEqual => min_le_value, + ComparisonOp::GreaterThanOrEqual => max_ge_value, + + // Strict checks are simple. + // Note: We don't need to check exactness here. 
+ // e.g., for LessThan: if lower_bound == value, then actual_min >= value, + // so NO row can be strictly less than value. + ComparisonOp::LessThan => lower_bound < value, + ComparisonOp::GreaterThan => upper_bound > value, + + // Special case: Only prune != if we are certain the column contains ONLY `value`. ComparisonOp::NotEqual => { - // col != value: keep if value is not the only value - !(min == value && max == value) - } - ComparisonOp::LessThan => { - // col < value: keep if min < value - min < value - } - ComparisonOp::LessThanOrEqual => { - // col <= value: keep if min <= value - min <= value - } - ComparisonOp::GreaterThan => { - // col > value: keep if max > value - max > value - } - ComparisonOp::GreaterThanOrEqual => { - // col >= value: keep if max >= value - max >= value + let is_single_value_col = lower_bound == upper_bound && is_exact_min && is_exact_max; + + // Keep unless it's a single-value column exactly matching the target + !(is_single_value_col && lower_bound == value) } } } @@ -1324,4 +1347,145 @@ mod tests { assert_eq!(result.len(), 1); assert!(result[0]); } + + #[test] + fn test_evaluate_string_comparison() { + use crate::predicate::ComparisonOp; + + // Helper to make the call shorter + let eval = |lower: &str, + upper: &str, + op: ComparisonOp, + val: &str, + exact_min: bool, + exact_max: bool| { + super::evaluate_string_comparison(lower, upper, op, val, exact_min, exact_max) + }; + + // 1. 
EQUAL + // Range ["a", "c"], value "b" -> Keep + assert!(eval("a", "c", ComparisonOp::Equal, "b", true, true)); + // Range ["a", "c"], value "d" -> Skip + assert!(!eval("a", "c", ComparisonOp::Equal, "d", true, true)); + // Range ["a", "c"], value "a" -> Keep + assert!(eval("a", "c", ComparisonOp::Equal, "a", true, true)); + // Range ["a", "c"], value "c" -> Keep + assert!(eval("a", "c", ComparisonOp::Equal, "c", true, true)); + + // Truncated stats (inexact) + // Range ["a", "c"] (min inexact), value "a" -> Skip (actual min > "a") + assert!(!eval("a", "c", ComparisonOp::Equal, "a", false, true)); + // Range ["a", "c"] (max inexact), value "c" -> Skip (actual max < "c" because the upper bound is rounded up) + assert!(!eval("a", "c", ComparisonOp::Equal, "c", true, false)); + + // 2. LESS THAN (< value) + // Range ["a", "c"], value "b". "a" < "b" -> Keep. + assert!(eval("a", "c", ComparisonOp::LessThan, "b", true, true)); + // Range ["d", "e"], value "b". "d" >= "b" -> Skip. + assert!(!eval("d", "e", ComparisonOp::LessThan, "b", true, true)); + // Range ["a", "c"], value "a". "a" < "a" is false. + // If exact, min="a", so no value < "a". Skip. + assert!(!eval("a", "c", ComparisonOp::LessThan, "a", true, true)); + // If not exact, min="a" (truncated). Actual min >= "a". + // So actual min >= value. No value < "a". Skip. + assert!(!eval("a", "c", ComparisonOp::LessThan, "a", false, true)); + + // 3. GREATER THAN (> value) + // Range ["a", "c"], value "b". "c" > "b" -> Keep. + assert!(eval("a", "c", ComparisonOp::GreaterThan, "b", true, true)); + // Range ["a", "b"], value "c". "b" <= "c" -> Skip. + assert!(!eval("a", "b", ComparisonOp::GreaterThan, "c", true, true)); + // Range ["a", "c"], value "c". "c" > "c" is false. + // If exact, max="c", so no value > "c". Skip. + assert!(!eval("a", "c", ComparisonOp::GreaterThan, "c", true, true)); + // If not exact, actual max < "c". Skip. + assert!(!eval("a", "c", ComparisonOp::GreaterThan, "c", true, false)); + + // 4. 
NOT EQUAL + // Range ["a", "c"], value "b". Keep. + assert!(eval("a", "c", ComparisonOp::NotEqual, "b", true, true)); + // Range ["a", "a"], value "a". + // Exact: Skip. + assert!(!eval("a", "a", ComparisonOp::NotEqual, "a", true, true)); + + // 5. LESS THAN OR EQUAL (<= value) + // Range ["a", "c"], value "b". Keep. + assert!(eval( + "a", + "c", + ComparisonOp::LessThanOrEqual, + "b", + true, + true + )); + // Range ["a", "c"], value "a". Keep. + assert!(eval( + "a", + "c", + ComparisonOp::LessThanOrEqual, + "a", + true, + true + )); + // Range ["b", "c"], value "a". "b" > "a". Skip. + assert!(!eval( + "b", + "c", + ComparisonOp::LessThanOrEqual, + "a", + true, + true + )); + // Inexact min: + // Range ["a", "c"] (min inexact), value "a". + // Actual_min > "a". Skip. + assert!(!eval( + "a", + "c", + ComparisonOp::LessThanOrEqual, + "a", + false, + true + )); + + // 6. GREATER THAN OR EQUAL (>= value) + // Range ["a", "c"], value "b". Keep. + assert!(eval( + "a", + "c", + ComparisonOp::GreaterThanOrEqual, + "b", + true, + true + )); + // Range ["a", "c"], value "c". Keep. + assert!(eval( + "a", + "c", + ComparisonOp::GreaterThanOrEqual, + "c", + true, + true + )); + // Range ["a", "b"], value "c". "b" < "c". Skip. + assert!(!eval( + "a", + "b", + ComparisonOp::GreaterThanOrEqual, + "c", + true, + true + )); + // Inexact max: + // Range ["a", "b"] (max inexact), value "b". + // Actual_max < "b". Skip. + assert!(!eval( + "a", + "b", + ComparisonOp::GreaterThanOrEqual, + "b", + true, + false + )); + } } diff --git a/src/statistics.rs b/src/statistics.rs index 6891e91..cb12b15 100644 --- a/src/statistics.rs +++ b/src/statistics.rs @@ -58,10 +58,14 @@ pub enum TypeStatistics { sum: Option, }, String { - min: String, - max: String, + lower_bound: String, + upper_bound: String, /// Total length of all strings sum: i64, + /// If true, `lower_bound` is the exact minimum. If false, it is only a lower bound. + is_exact_min: bool, + /// If true, `upper_bound` is the exact maximum. 
If false, it is an upper bound. + is_exact_max: bool, }, /// For Boolean Bucket { true_count: u64 }, @@ -101,7 +105,9 @@ impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics { type Error = error::OrcError; fn try_from(value: &proto::ColumnStatistics) -> Result { - let type_statistics = if let Some(stats) = &value.int_statistics { + let type_statistics = if value.number_of_values() == 0 { + None + } else if let Some(stats) = &value.int_statistics { Some(TypeStatistics::Integer { min: stats.minimum(), max: stats.maximum(), @@ -114,10 +120,22 @@ impl TryFrom<&proto::ColumnStatistics> for ColumnStatistics { sum: stats.sum, }) } else if let Some(stats) = &value.string_statistics { + let (lower_bound, is_exact_min) = stats + .minimum + .as_deref() + .map(|s| (s, true)) + .unwrap_or_else(|| (stats.lower_bound(), false)); + let (upper_bound, is_exact_max) = stats + .maximum + .as_deref() + .map(|s| (s, true)) + .unwrap_or_else(|| (stats.upper_bound(), false)); Some(TypeStatistics::String { - min: stats.minimum().to_owned(), - max: stats.maximum().to_owned(), + lower_bound: lower_bound.to_owned(), + upper_bound: upper_bound.to_owned(), sum: stats.sum(), + is_exact_min, + is_exact_max, }) } else if let Some(stats) = &value.bucket_statistics { // TODO: false count? 
diff --git a/tests/bin/expected/stats.out b/tests/bin/expected/stats.out index 916c414..3597db2 100644 --- a/tests/bin/expected/stats.out +++ b/tests/bin/expected/stats.out @@ -24,6 +24,8 @@ File "tests/basic/data/test.orc" has 21 columns * Minimum: a * Maximum: ee * Sum: 12 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -32,6 +34,8 @@ File "tests/basic/data/test.orc" has 21 columns * Minimum: a * Maximum: ddd * Sum: 9 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -40,6 +44,8 @@ File "tests/basic/data/test.orc" has 21 columns * Minimum: a * Maximum: ddd * Sum: 8 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -48,6 +54,8 @@ File "tests/basic/data/test.orc" has 21 columns * Minimum: aaaaa * Maximum: ddddd * Sum: 20 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -128,6 +136,8 @@ File "tests/basic/data/test.orc" has 21 columns * Minimum: a * Maximum: eeeee * Sum: 15 +* IsExactMin: true +* IsExactMax: true * Num values: 5 * Has nulls: false @@ -136,6 +146,8 @@ File "tests/basic/data/test.orc" has 21 columns * Minimum: a * Maximum: eeeee * Sum: 15 +* IsExactMin: true +* IsExactMax: true * Num values: 5 * Has nulls: false @@ -191,6 +203,8 @@ File "tests/basic/data/test.orc" has 1 stripes * Minimum: a * Maximum: ee * Sum: 12 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -199,6 +213,8 @@ File "tests/basic/data/test.orc" has 1 stripes * Minimum: a * Maximum: ddd * Sum: 9 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -207,6 +223,8 @@ File "tests/basic/data/test.orc" has 1 stripes * Minimum: a * Maximum: ddd * Sum: 8 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -215,6 +233,8 @@ File "tests/basic/data/test.orc" has 1 stripes * Minimum: aaaaa * Maximum: ddddd * Sum: 20 +* IsExactMin: true +* IsExactMax: true * Num values: 4 * Has nulls: true @@ -295,6 +315,8 @@ File 
"tests/basic/data/test.orc" has 1 stripes * Minimum: a * Maximum: eeeee * Sum: 15 +* IsExactMin: true +* IsExactMax: true * Num values: 5 * Has nulls: false @@ -303,6 +325,8 @@ File "tests/basic/data/test.orc" has 1 stripes * Minimum: a * Maximum: eeeee * Sum: 15 +* IsExactMin: true +* IsExactMax: true * Num values: 5 * Has nulls: false