diff --git a/paimon-common/src/main/java/org/apache/paimon/format/SimpleColStats.java b/paimon-common/src/main/java/org/apache/paimon/format/SimpleColStats.java index 0b0062b7568f..368e14d13e46 100644 --- a/paimon-common/src/main/java/org/apache/paimon/format/SimpleColStats.java +++ b/paimon-common/src/main/java/org/apache/paimon/format/SimpleColStats.java @@ -29,6 +29,7 @@ *
  • min: the minimum value of the column *
  • max: the maximum value of the column *
  • nullCount: the number of nulls + *
  • nanCount: the number of NaN values for FLOAT/DOUBLE columns, null if unknown * */ public class SimpleColStats { @@ -38,11 +39,21 @@ public class SimpleColStats { @Nullable private final Object min; @Nullable private final Object max; private final Long nullCount; + @Nullable private final Long nanCount; public SimpleColStats(@Nullable Object min, @Nullable Object max, @Nullable Long nullCount) { + this(min, max, nullCount, null); + } + + public SimpleColStats( + @Nullable Object min, + @Nullable Object max, + @Nullable Long nullCount, + @Nullable Long nanCount) { this.min = min; this.max = max; this.nullCount = nullCount; + this.nanCount = nanCount; } @Nullable @@ -60,6 +71,11 @@ public Long nullCount() { return nullCount; } + @Nullable + public Long nanCount() { + return nanCount; + } + public boolean isNone() { return min == null && max == null && nullCount == null; } @@ -72,16 +88,17 @@ public boolean equals(Object o) { SimpleColStats that = (SimpleColStats) o; return Objects.equals(min, that.min) && Objects.equals(max, that.max) - && Objects.equals(nullCount, that.nullCount); + && Objects.equals(nullCount, that.nullCount) + && Objects.equals(nanCount, that.nanCount); } @Override public int hashCode() { - return Objects.hash(min, max, nullCount); + return Objects.hash(min, max, nullCount, nanCount); } @Override public String toString() { - return String.format("{%s, %s, %d}", min, max, nullCount); + return String.format("{%s, %s, %d, %s}", min, max, nullCount, nanCount); } } diff --git a/paimon-common/src/main/java/org/apache/paimon/statistics/AbstractSimpleColStatsCollector.java b/paimon-common/src/main/java/org/apache/paimon/statistics/AbstractSimpleColStatsCollector.java index ed2d75efa06b..f4a50768b0b0 100644 --- a/paimon-common/src/main/java/org/apache/paimon/statistics/AbstractSimpleColStatsCollector.java +++ b/paimon-common/src/main/java/org/apache/paimon/statistics/AbstractSimpleColStatsCollector.java @@ -29,8 +29,10 @@ public abstract class AbstractSimpleColStatsCollector implements SimpleColStatsC protected long nullCount; + protected long nanCount; + @Override public SimpleColStats result() { - return new SimpleColStats(minValue, maxValue, nullCount); + return new SimpleColStats(minValue, maxValue, nullCount, nanCount); } } diff --git a/paimon-common/src/main/java/org/apache/paimon/statistics/CountsSimpleColStatsCollector.java b/paimon-common/src/main/java/org/apache/paimon/statistics/CountsSimpleColStatsCollector.java index 7933cef46475..c80d25dceeb0 100644 --- a/paimon-common/src/main/java/org/apache/paimon/statistics/CountsSimpleColStatsCollector.java +++ b/paimon-common/src/main/java/org/apache/paimon/statistics/CountsSimpleColStatsCollector.java @@ -28,11 +28,17 @@ public class CountsSimpleColStatsCollector extends AbstractSimpleColStatsCollect public void collect(Object field, Serializer serializer) { if (field == null) { nullCount++; + return; + } + if (field instanceof Double && Double.isNaN((Double) field)) { + nanCount++; + } else if (field instanceof Float && Float.isNaN((Float) field)) { + nanCount++; } } @Override public SimpleColStats convert(SimpleColStats source) { - return new SimpleColStats(null, null, source.nullCount()); + return new SimpleColStats(null, null, source.nullCount(), source.nanCount()); } } diff --git a/paimon-common/src/main/java/org/apache/paimon/statistics/FullSimpleColStatsCollector.java b/paimon-common/src/main/java/org/apache/paimon/statistics/FullSimpleColStatsCollector.java index a1e05eb2dc74..4bd6672f50bb 100644 --- a/paimon-common/src/main/java/org/apache/paimon/statistics/FullSimpleColStatsCollector.java +++ b/paimon-common/src/main/java/org/apache/paimon/statistics/FullSimpleColStatsCollector.java @@ -31,6 +31,15 @@ public void collect(Object field, Serializer fieldSerializer) { return; } + if (field instanceof Double && Double.isNaN((Double) field)) { + nanCount++; + return; + } + if (field instanceof Float && Float.isNaN((Float) field)) { + nanCount++; + return; + } + // TODO use comparator for not comparable types and extract this logic to a util class if (!(field instanceof Comparable)) { return; diff --git a/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateSimpleColStatsCollector.java b/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateSimpleColStatsCollector.java index cba942c7905c..6da980b366ef 100644 --- a/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateSimpleColStatsCollector.java +++ b/paimon-common/src/main/java/org/apache/paimon/statistics/TruncateSimpleColStatsCollector.java @@ -53,6 +53,15 @@ public void collect(Object field, Serializer fieldSerializer) { return; } + if (field instanceof Double && Double.isNaN((Double) field)) { + nanCount++; + return; + } + if (field instanceof Float && Float.isNaN((Float) field)) { + nanCount++; + return; + } + // fast fail since the result is not correct if (failed) { return; @@ -86,17 +95,17 @@ public SimpleColStats convert(SimpleColStats source) { Object min = truncateMin(source.min()); Object max = truncateMax(source.max()); if (max == null) { - return new SimpleColStats(null, null, source.nullCount()); + return new SimpleColStats(null, null, source.nullCount(), source.nanCount()); } - return new SimpleColStats(min, max, source.nullCount()); + return new SimpleColStats(min, max, source.nullCount(), source.nanCount()); } @Override public SimpleColStats result() { if (failed) { - return new SimpleColStats(null, null, nullCount); + return new SimpleColStats(null, null, nullCount, nanCount); } - return new SimpleColStats(minValue, maxValue, nullCount); + return new SimpleColStats(minValue, maxValue, nullCount, nanCount); } /** @return a truncated value less or equal than the old value. */ diff --git a/paimon-common/src/test/java/org/apache/paimon/statistics/SimpleColStatsCollectorTest.java b/paimon-common/src/test/java/org/apache/paimon/statistics/SimpleColStatsCollectorTest.java index 4511ae26ee7a..ccad73f0b873 100644 --- a/paimon-common/src/test/java/org/apache/paimon/statistics/SimpleColStatsCollectorTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/statistics/SimpleColStatsCollectorTest.java @@ -25,6 +25,8 @@ import org.apache.paimon.data.serializer.Serializer; import org.apache.paimon.format.SimpleColStats; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; import org.apache.paimon.types.IntType; import org.apache.paimon.types.RowType; import org.apache.paimon.types.VarCharType; @@ -112,14 +114,14 @@ public void testCounts() { check( rows, 0, - new SimpleColStats(null, null, 0L), - new SimpleColStats(1, 4, 0L), + new SimpleColStats(null, null, 0L, 0L), + new SimpleColStats(1, 4, 0L, 0L), new CountsSimpleColStatsCollector()); check( rows, 1, - new SimpleColStats(null, null, 1L), - new SimpleColStats(s1, s3, 1L), + new SimpleColStats(null, null, 1L, 0L), + new SimpleColStats(s1, s3, 1L, 0L), new CountsSimpleColStatsCollector()); } @@ -130,14 +132,16 @@ public void testFull() { check( rows, 0, - new SimpleColStats(1, 4, 0L), - new SimpleColStats(1, 4, 0L), + new SimpleColStats(1, 4, 0L, 0L), + new SimpleColStats(1, 4, 0L, 0L), new FullSimpleColStatsCollector()); check( rows, 1, - new SimpleColStats(BinaryString.fromString(s1), BinaryString.fromString(s3), 1L), - new SimpleColStats(BinaryString.fromString(s1), BinaryString.fromString(s3), 1L), + new SimpleColStats( + BinaryString.fromString(s1), BinaryString.fromString(s3), 1L, 0L), + new SimpleColStats( + BinaryString.fromString(s1), BinaryString.fromString(s3), 1L, 0L), new FullSimpleColStatsCollector()); } @@ -148,18 +152,110 @@ public void testTruncate() { check( rows, 0, - new SimpleColStats(1, 4, 0L), - new SimpleColStats(1, 4, 0L), + new SimpleColStats(1, 4, 0L, 0L), + new SimpleColStats(1, 4, 0L, 0L), new TruncateSimpleColStatsCollector(1)); check( rows, 1, new SimpleColStats( - BinaryString.fromString(s1_t), BinaryString.fromString(s3_t), 1L), - new SimpleColStats(BinaryString.fromString(s1), BinaryString.fromString(s3), 1L), + BinaryString.fromString(s1_t), BinaryString.fromString(s3_t), 1L, 0L), + new SimpleColStats( + BinaryString.fromString(s1), BinaryString.fromString(s3), 1L, 0L), new TruncateSimpleColStatsCollector(2)); } + @Test + public void testFullCountsNaNAndExcludesFromBounds() { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "d", new DoubleType()), + new DataField(1, "f", new FloatType()))); + Serializer[] floatSerializers = new Serializer[2]; + for (int i = 0; i < rowType.getFieldCount(); i++) { + floatSerializers[i] = InternalSerializers.create(rowType.getTypeAt(i)); + } + + FullSimpleColStatsCollector doubleCollector = new FullSimpleColStatsCollector(); + doubleCollector.collect(1.0d, floatSerializers[0]); + doubleCollector.collect(Double.NaN, floatSerializers[0]); + doubleCollector.collect(5.0d, floatSerializers[0]); + doubleCollector.collect(Double.NaN, floatSerializers[0]); + doubleCollector.collect(null, floatSerializers[0]); + assertThat(doubleCollector.result()).isEqualTo(new SimpleColStats(1.0d, 5.0d, 1L, 2L)); + + FullSimpleColStatsCollector floatCollector = new FullSimpleColStatsCollector(); + floatCollector.collect(2.0f, floatSerializers[1]); + floatCollector.collect(Float.NaN, floatSerializers[1]); + floatCollector.collect(7.0f, floatSerializers[1]); + assertThat(floatCollector.result()).isEqualTo(new SimpleColStats(2.0f, 7.0f, 0L, 1L)); + } + + @Test + public void testCountsNaN() { + Serializer doubleSerializer = InternalSerializers.create(new DoubleType()); + CountsSimpleColStatsCollector collector = new CountsSimpleColStatsCollector(); + collector.collect(1.0d, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + collector.collect(null, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + assertThat(collector.result()).isEqualTo(new SimpleColStats(null, null, 1L, 2L)); + } + + @Test + public void testFullAllNaN() { + Serializer doubleSerializer = InternalSerializers.create(new DoubleType()); + FullSimpleColStatsCollector collector = new FullSimpleColStatsCollector(); + collector.collect(Double.NaN, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + assertThat(collector.result()).isEqualTo(new SimpleColStats(null, null, 0L, 3L)); + } + + @Test + public void testFullOnlyNaNAndNull() { + Serializer doubleSerializer = InternalSerializers.create(new DoubleType()); + FullSimpleColStatsCollector collector = new FullSimpleColStatsCollector(); + collector.collect(null, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + collector.collect(null, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + collector.collect(null, doubleSerializer); + assertThat(collector.result()).isEqualTo(new SimpleColStats(null, null, 3L, 2L)); + } + + @Test + public void testNoneIgnoresNaN() { + Serializer doubleSerializer = InternalSerializers.create(new DoubleType()); + NoneSimpleColStatsCollector collector = new NoneSimpleColStatsCollector(); + collector.collect(Double.NaN, doubleSerializer); + collector.collect(1.0d, doubleSerializer); + collector.collect(Double.NaN, doubleSerializer); + assertThat(collector.result()).isEqualTo(SimpleColStats.NONE); + assertThat(collector.result().nanCount()).isNull(); + } + + @Test + public void testConvertPreservesNanCount() { + SimpleColStats source = new SimpleColStats(1.0d, 5.0d, 2L, 7L); + assertThat(new FullSimpleColStatsCollector().convert(source).nanCount()).isEqualTo(7L); + assertThat(new CountsSimpleColStatsCollector().convert(source).nanCount()).isEqualTo(7L); + assertThat(new TruncateSimpleColStatsCollector(16).convert(source).nanCount()) + .isEqualTo(7L); + assertThat(new NoneSimpleColStatsCollector().convert(source).nanCount()).isNull(); + } + + @Test + public void testSimpleColStatsEqualityIncludesNanCount() { + assertThat(new SimpleColStats(1.0d, 5.0d, 0L, 0L)) + .isNotEqualTo(new SimpleColStats(1.0d, 5.0d, 0L, 1L)); + assertThat(new SimpleColStats(1.0d, 5.0d, 0L, 0L)) + .isNotEqualTo(new SimpleColStats(1.0d, 5.0d, 0L, null)); + assertThat(new SimpleColStats(1.0d, 5.0d, 0L, 7L)) + .isEqualTo(new SimpleColStats(1.0d, 5.0d, 0L, 7L)); + } + @Test public void testTruncateTwoChar() { TruncateSimpleColStatsCollector t1 = new TruncateSimpleColStatsCollector(1); diff --git a/paimon-core/src/main/java/org/apache/paimon/iceberg/manifest/IcebergManifestFile.java b/paimon-core/src/main/java/org/apache/paimon/iceberg/manifest/IcebergManifestFile.java index 46fd14390672..d9b3bdfca1ca 100644 --- a/paimon-core/src/main/java/org/apache/paimon/iceberg/manifest/IcebergManifestFile.java +++ b/paimon-core/src/main/java/org/apache/paimon/iceberg/manifest/IcebergManifestFile.java @@ -259,7 +259,8 @@ public IcebergManifestFileMeta result() throws IOException { switch (type.getTypeRoot()) { case FLOAT: case DOUBLE: - containsNan = isNaN(fieldStats.min()) || isNaN(fieldStats.max()); + Long nanCount = fieldStats.nanCount(); + containsNan = nanCount != null && nanCount > 0; break; default: // contains_nan is only meaningful for FLOAT/DOUBLE per the Iceberg spec @@ -287,15 +288,5 @@ public IcebergManifestFileMeta result() throws IOException { deletedRowsCount, partitionSummaries); } - - private boolean isNaN(@Nullable Object value) { - if (value instanceof Float) { - return Float.isNaN((Float) value); - } - if (value instanceof Double) { - return Double.isNaN((Double) value); - } - return false; - } } } diff --git a/paimon-core/src/test/java/org/apache/paimon/stats/SimpleStatsCollectorTest.java b/paimon-core/src/test/java/org/apache/paimon/stats/SimpleStatsCollectorTest.java index 18536906089d..14013cdfa930 100644 --- a/paimon-core/src/test/java/org/apache/paimon/stats/SimpleStatsCollectorTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/stats/SimpleStatsCollectorTest.java @@ -26,6 +26,8 @@ import org.apache.paimon.statistics.FullSimpleColStatsCollector; import org.apache.paimon.statistics.SimpleColStatsCollector; import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; import org.apache.paimon.types.IntType; import org.apache.paimon.types.RowType; import org.apache.paimon.types.VarCharType; @@ -59,24 +61,26 @@ public void testCollect() { assertThat(collector.extract()) .isEqualTo( new SimpleColStats[] { - new SimpleColStats(1, 1, 0L), + new SimpleColStats(1, 1, 0L, 0L), new SimpleColStats( BinaryString.fromString("Paimon"), BinaryString.fromString("Paimon"), + 0L, 0L), - new SimpleColStats(null, null, 0L) + new SimpleColStats(null, null, 0L, 0L) }); collector.collect(GenericRow.of(3, null, new GenericArray(new int[] {3, 30}))); assertThat(collector.extract()) .isEqualTo( new SimpleColStats[] { - new SimpleColStats(1, 3, 0L), + new SimpleColStats(1, 3, 0L, 0L), new SimpleColStats( BinaryString.fromString("Paimon"), BinaryString.fromString("Paimon"), - 1L), - new SimpleColStats(null, null, 0L) + 1L, + 0L), + new SimpleColStats(null, null, 0L, 0L) }); collector.collect( @@ -87,24 +91,59 @@ public void testCollect() { assertThat(collector.extract()) .isEqualTo( new SimpleColStats[] { - new SimpleColStats(1, 3, 1L), + new SimpleColStats(1, 3, 1L, 0L), new SimpleColStats( BinaryString.fromString("Apache"), BinaryString.fromString("Paimon"), - 1L), - new SimpleColStats(null, null, 0L) + 1L, + 0L), + new SimpleColStats(null, null, 0L, 0L) }); collector.collect(GenericRow.of(2, BinaryString.fromString("Batch"), null)); assertThat(collector.extract()) .isEqualTo( new SimpleColStats[] { - new SimpleColStats(1, 3, 1L), + new SimpleColStats(1, 3, 1L, 0L), new SimpleColStats( BinaryString.fromString("Apache"), BinaryString.fromString("Paimon"), - 1L), - new SimpleColStats(null, null, 1L) + 1L, + 0L), + new SimpleColStats(null, null, 1L, 0L) + }); + } + + @Test + public void testCollectNaN() { + RowType rowType = + RowType.of(new DoubleType(), new FloatType(), new IntType(), new VarCharType(10)); + SimpleStatsCollector collector = + new SimpleStatsCollector( + rowType, + IntStream.range(0, rowType.getFieldCount()) + .mapToObj( + i -> + (SimpleColStatsCollector.Factory) + FullSimpleColStatsCollector::new) + .toArray(SimpleColStatsCollector.Factory[]::new)); + + collector.collect(GenericRow.of(1.0d, 1.0f, 1, BinaryString.fromString("a"))); + collector.collect(GenericRow.of(Double.NaN, 2.0f, 2, BinaryString.fromString("b"))); + collector.collect(GenericRow.of(5.0d, Float.NaN, 3, null)); + collector.collect(GenericRow.of(Double.NaN, Float.NaN, null, BinaryString.fromString("c"))); + + assertThat(collector.extract()) + .isEqualTo( + new SimpleColStats[] { + new SimpleColStats(1.0d, 5.0d, 0L, 2L), + new SimpleColStats(1.0f, 2.0f, 0L, 2L), + new SimpleColStats(1, 3, 1L, 0L), + new SimpleColStats( + BinaryString.fromString("a"), + BinaryString.fromString("c"), + 1L, + 0L) }); } }