Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
* <li>min: the minimum value of the column
* <li>max: the maximum value of the column
* <li>nullCount: the number of nulls
* <li>nanCount: the number of NaN values for FLOAT/DOUBLE columns, null if unknown
* </ul>
*/
public class SimpleColStats {
Expand All @@ -38,11 +39,21 @@ public class SimpleColStats {
@Nullable private final Object min;
@Nullable private final Object max;
private final Long nullCount;
@Nullable private final Long nanCount;

public SimpleColStats(@Nullable Object min, @Nullable Object max, @Nullable Long nullCount) {
this(min, max, nullCount, null);
}

public SimpleColStats(
@Nullable Object min,
@Nullable Object max,
@Nullable Long nullCount,
@Nullable Long nanCount) {
this.min = min;
this.max = max;
this.nullCount = nullCount;
this.nanCount = nanCount;
}

@Nullable
Expand All @@ -60,6 +71,11 @@ public Long nullCount() {
return nullCount;
}

@Nullable
public Long nanCount() {
return nanCount;
}

public boolean isNone() {
return min == null && max == null && nullCount == null;
}
Expand All @@ -72,16 +88,17 @@ public boolean equals(Object o) {
SimpleColStats that = (SimpleColStats) o;
return Objects.equals(min, that.min)
&& Objects.equals(max, that.max)
&& Objects.equals(nullCount, that.nullCount);
&& Objects.equals(nullCount, that.nullCount)
&& Objects.equals(nanCount, that.nanCount);
}

@Override
public int hashCode() {
return Objects.hash(min, max, nullCount);
return Objects.hash(min, max, nullCount, nanCount);
}

@Override
public String toString() {
return String.format("{%s, %s, %d}", min, max, nullCount);
return String.format("{%s, %s, %d, %s}", min, max, nullCount, nanCount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@ public abstract class AbstractSimpleColStatsCollector implements SimpleColStatsC

protected long nullCount;

protected long nanCount;

@Override
public SimpleColStats result() {
return new SimpleColStats(minValue, maxValue, nullCount);
return new SimpleColStats(minValue, maxValue, nullCount, nanCount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,17 @@ public class CountsSimpleColStatsCollector extends AbstractSimpleColStatsCollect
public void collect(Object field, Serializer<Object> serializer) {
if (field == null) {
nullCount++;
return;
}
if (field instanceof Double && Double.isNaN((Double) field)) {
nanCount++;
} else if (field instanceof Float && Float.isNaN((Float) field)) {
nanCount++;
}
}

@Override
public SimpleColStats convert(SimpleColStats source) {
return new SimpleColStats(null, null, source.nullCount());
return new SimpleColStats(null, null, source.nullCount(), source.nanCount());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ public void collect(Object field, Serializer<Object> fieldSerializer) {
return;
}

if (field instanceof Double && Double.isNaN((Double) field)) {
nanCount++;
return;
}
if (field instanceof Float && Float.isNaN((Float) field)) {
nanCount++;
return;
}

// TODO use comparator for not comparable types and extract this logic to a util class
if (!(field instanceof Comparable)) {
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ public void collect(Object field, Serializer<Object> fieldSerializer) {
return;
}

if (field instanceof Double && Double.isNaN((Double) field)) {
nanCount++;
return;
}
if (field instanceof Float && Float.isNaN((Float) field)) {
nanCount++;
return;
}

// fast fail since the result is not correct
if (failed) {
return;
Expand Down Expand Up @@ -86,17 +95,17 @@ public SimpleColStats convert(SimpleColStats source) {
Object min = truncateMin(source.min());
Object max = truncateMax(source.max());
if (max == null) {
return new SimpleColStats(null, null, source.nullCount());
return new SimpleColStats(null, null, source.nullCount(), source.nanCount());
}
return new SimpleColStats(min, max, source.nullCount());
return new SimpleColStats(min, max, source.nullCount(), source.nanCount());
}

@Override
public SimpleColStats result() {
if (failed) {
return new SimpleColStats(null, null, nullCount);
return new SimpleColStats(null, null, nullCount, nanCount);
}
return new SimpleColStats(minValue, maxValue, nullCount);
return new SimpleColStats(minValue, maxValue, nullCount, nanCount);
}

/** @return a truncated value less or equal than the old value. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import org.apache.paimon.data.serializer.Serializer;
import org.apache.paimon.format.SimpleColStats;
import org.apache.paimon.types.DataField;
import org.apache.paimon.types.DoubleType;
import org.apache.paimon.types.FloatType;
import org.apache.paimon.types.IntType;
import org.apache.paimon.types.RowType;
import org.apache.paimon.types.VarCharType;
Expand Down Expand Up @@ -112,14 +114,14 @@ public void testCounts() {
check(
rows,
0,
new SimpleColStats(null, null, 0L),
new SimpleColStats(1, 4, 0L),
new SimpleColStats(null, null, 0L, 0L),
new SimpleColStats(1, 4, 0L, 0L),
new CountsSimpleColStatsCollector());
check(
rows,
1,
new SimpleColStats(null, null, 1L),
new SimpleColStats(s1, s3, 1L),
new SimpleColStats(null, null, 1L, 0L),
new SimpleColStats(s1, s3, 1L, 0L),
new CountsSimpleColStatsCollector());
}

Expand All @@ -130,14 +132,16 @@ public void testFull() {
check(
rows,
0,
new SimpleColStats(1, 4, 0L),
new SimpleColStats(1, 4, 0L),
new SimpleColStats(1, 4, 0L, 0L),
new SimpleColStats(1, 4, 0L, 0L),
new FullSimpleColStatsCollector());
check(
rows,
1,
new SimpleColStats(BinaryString.fromString(s1), BinaryString.fromString(s3), 1L),
new SimpleColStats(BinaryString.fromString(s1), BinaryString.fromString(s3), 1L),
new SimpleColStats(
BinaryString.fromString(s1), BinaryString.fromString(s3), 1L, 0L),
new SimpleColStats(
BinaryString.fromString(s1), BinaryString.fromString(s3), 1L, 0L),
new FullSimpleColStatsCollector());
}

Expand All @@ -148,18 +152,110 @@ public void testTruncate() {
check(
rows,
0,
new SimpleColStats(1, 4, 0L),
new SimpleColStats(1, 4, 0L),
new SimpleColStats(1, 4, 0L, 0L),
new SimpleColStats(1, 4, 0L, 0L),
new TruncateSimpleColStatsCollector(1));
check(
rows,
1,
new SimpleColStats(
BinaryString.fromString(s1_t), BinaryString.fromString(s3_t), 1L),
new SimpleColStats(BinaryString.fromString(s1), BinaryString.fromString(s3), 1L),
BinaryString.fromString(s1_t), BinaryString.fromString(s3_t), 1L, 0L),
new SimpleColStats(
BinaryString.fromString(s1), BinaryString.fromString(s3), 1L, 0L),
new TruncateSimpleColStatsCollector(2));
}

@Test
public void testFullCountsNaNAndExcludesFromBounds() {
RowType rowType =
new RowType(
Arrays.asList(
new DataField(0, "d", new DoubleType()),
new DataField(1, "f", new FloatType())));
Serializer<Object>[] floatSerializers = new Serializer[2];
for (int i = 0; i < rowType.getFieldCount(); i++) {
floatSerializers[i] = InternalSerializers.create(rowType.getTypeAt(i));
}

FullSimpleColStatsCollector doubleCollector = new FullSimpleColStatsCollector();
doubleCollector.collect(1.0d, floatSerializers[0]);
doubleCollector.collect(Double.NaN, floatSerializers[0]);
doubleCollector.collect(5.0d, floatSerializers[0]);
doubleCollector.collect(Double.NaN, floatSerializers[0]);
doubleCollector.collect(null, floatSerializers[0]);
assertThat(doubleCollector.result()).isEqualTo(new SimpleColStats(1.0d, 5.0d, 1L, 2L));

FullSimpleColStatsCollector floatCollector = new FullSimpleColStatsCollector();
floatCollector.collect(2.0f, floatSerializers[1]);
floatCollector.collect(Float.NaN, floatSerializers[1]);
floatCollector.collect(7.0f, floatSerializers[1]);
assertThat(floatCollector.result()).isEqualTo(new SimpleColStats(2.0f, 7.0f, 0L, 1L));
}

@Test
public void testCountsNaN() {
Serializer<Object> doubleSerializer = InternalSerializers.create(new DoubleType());
CountsSimpleColStatsCollector collector = new CountsSimpleColStatsCollector();
collector.collect(1.0d, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
collector.collect(null, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
assertThat(collector.result()).isEqualTo(new SimpleColStats(null, null, 1L, 2L));
}

@Test
public void testFullAllNaN() {
Serializer<Object> doubleSerializer = InternalSerializers.create(new DoubleType());
FullSimpleColStatsCollector collector = new FullSimpleColStatsCollector();
collector.collect(Double.NaN, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
assertThat(collector.result()).isEqualTo(new SimpleColStats(null, null, 0L, 3L));
}

@Test
public void testFullOnlyNaNAndNull() {
Serializer<Object> doubleSerializer = InternalSerializers.create(new DoubleType());
FullSimpleColStatsCollector collector = new FullSimpleColStatsCollector();
collector.collect(null, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
collector.collect(null, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
collector.collect(null, doubleSerializer);
assertThat(collector.result()).isEqualTo(new SimpleColStats(null, null, 3L, 2L));
}

@Test
public void testNoneIgnoresNaN() {
Serializer<Object> doubleSerializer = InternalSerializers.create(new DoubleType());
NoneSimpleColStatsCollector collector = new NoneSimpleColStatsCollector();
collector.collect(Double.NaN, doubleSerializer);
collector.collect(1.0d, doubleSerializer);
collector.collect(Double.NaN, doubleSerializer);
assertThat(collector.result()).isEqualTo(SimpleColStats.NONE);
assertThat(collector.result().nanCount()).isNull();
}

@Test
public void testConvertPreservesNanCount() {
SimpleColStats source = new SimpleColStats(1.0d, 5.0d, 2L, 7L);
assertThat(new FullSimpleColStatsCollector().convert(source).nanCount()).isEqualTo(7L);
assertThat(new CountsSimpleColStatsCollector().convert(source).nanCount()).isEqualTo(7L);
assertThat(new TruncateSimpleColStatsCollector(16).convert(source).nanCount())
.isEqualTo(7L);
assertThat(new NoneSimpleColStatsCollector().convert(source).nanCount()).isNull();
}

@Test
public void testSimpleColStatsEqualityIncludesNanCount() {
assertThat(new SimpleColStats(1.0d, 5.0d, 0L, 0L))
.isNotEqualTo(new SimpleColStats(1.0d, 5.0d, 0L, 1L));
assertThat(new SimpleColStats(1.0d, 5.0d, 0L, 0L))
.isNotEqualTo(new SimpleColStats(1.0d, 5.0d, 0L, null));
assertThat(new SimpleColStats(1.0d, 5.0d, 0L, 7L))
.isEqualTo(new SimpleColStats(1.0d, 5.0d, 0L, 7L));
}

@Test
public void testTruncateTwoChar() {
TruncateSimpleColStatsCollector t1 = new TruncateSimpleColStatsCollector(1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ public IcebergManifestFileMeta result() throws IOException {
switch (type.getTypeRoot()) {
case FLOAT:
case DOUBLE:
containsNan = isNaN(fieldStats.min()) || isNaN(fieldStats.max());
Long nanCount = fieldStats.nanCount();
containsNan = nanCount != null && nanCount > 0;
break;
default:
// contains_nan is only meaningful for FLOAT/DOUBLE per the Iceberg spec
Expand Down Expand Up @@ -287,15 +288,5 @@ public IcebergManifestFileMeta result() throws IOException {
deletedRowsCount,
partitionSummaries);
}

private boolean isNaN(@Nullable Object value) {
if (value instanceof Float) {
return Float.isNaN((Float) value);
}
if (value instanceof Double) {
return Double.isNaN((Double) value);
}
return false;
}
}
}
Loading
Loading