Skip to content

Commit bd70924

Browse files
committed
Merge branch 'main' into iceberg-split-serialization-dpp
2 parents 9cc541a + a2f8e54 commit bd70924

69 files changed

Lines changed: 1832 additions & 853 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/actions/java-test/action.yaml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ inputs:
3232
scan_impl:
3333
description: 'The default Parquet scan implementation'
3434
required: false
35-
default: 'native_comet'
35+
default: 'auto'
3636
upload-test-reports:
3737
description: 'Whether to upload test results including coverage to GitHub'
3838
required: false
@@ -146,7 +146,3 @@ runs:
146146
path: "**/target/surefire-reports/*.txt"
147147
retention-days: 7 # 1 week for test reports
148148
overwrite: true
149-
150-
- name: Upload coverage results
151-
if: ${{ inputs.upload-test-reports == 'true' }}
152-
uses: codecov/codecov-action@v5

.github/workflows/docs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ jobs:
4747
python-version: "3.10"
4848

4949
- name: Setup Java
50-
uses: actions/setup-java@v4
50+
uses: actions/setup-java@v5
5151
with:
5252
distribution: 'temurin'
5353
java-version: '17'

.github/workflows/pr_build_linux.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ jobs:
164164
- name: "Spark 3.4, JDK 11, Scala 2.12"
165165
java_version: "11"
166166
maven_opts: "-Pspark-3.4 -Pscala-2.12"
167-
scan_impl: "native_comet"
167+
scan_impl: "auto"
168168

169169
- name: "Spark 3.5.5, JDK 17, Scala 2.13"
170170
java_version: "17"
@@ -174,7 +174,7 @@ jobs:
174174
- name: "Spark 3.5.6, JDK 17, Scala 2.13"
175175
java_version: "17"
176176
maven_opts: "-Pspark-3.5 -Dspark.version=3.5.6 -Pscala-2.13"
177-
scan_impl: "native_comet"
177+
scan_impl: "auto"
178178

179179
- name: "Spark 3.5, JDK 17, Scala 2.12"
180180
java_version: "17"

.github/workflows/spark_sql_test.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,15 +116,12 @@ jobs:
116116
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
117117
# Test combinations:
118118
# - auto scan: all Spark versions (3.4, 3.5, 4.0)
119-
# - native_comet: Spark 3.4, 3.5
120119
# - native_iceberg_compat: Spark 3.5 only
121120
config:
122121
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'auto', scan-env: ''}
123122
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'auto', scan-env: ''}
124-
- {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}
125-
- {spark-short: '3.4', spark-full: '3.4.3', java: 11, scan-impl: 'native_comet', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_comet'}
126-
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'native_comet', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_comet'}
127123
- {spark-short: '3.5', spark-full: '3.5.8', java: 11, scan-impl: 'native_iceberg_compat', scan-env: 'COMET_PARQUET_SCAN_IMPL=native_iceberg_compat'}
124+
- {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}
128125
# Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
129126
exclude:
130127
- config: {spark-short: '4.0', spark-full: '4.0.1', java: 17, scan-impl: 'auto', scan-env: ''}

Makefile

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,26 +51,26 @@ format:
5151

5252
# build native libs for amd64 architecture Linux/MacOS on a Linux/amd64 machine/container
5353
core-amd64-libs:
54-
cd native && cargo build -j 2 --release $(FEATURES_ARG)
54+
cd native && RUSTFLAGS="-Ctarget-cpu=x86-64-v3" cargo build -j 2 --release $(FEATURES_ARG)
5555
ifdef HAS_OSXCROSS
5656
rustup target add x86_64-apple-darwin
5757
cd native && cargo build -j 2 --target x86_64-apple-darwin --release $(FEATURES_ARG)
5858
endif
5959

6060
# build native libs for arm64 architecture Linux/MacOS on a Linux/arm64 machine/container
6161
core-arm64-libs:
62-
cd native && cargo build -j 2 --release $(FEATURES_ARG)
62+
cd native && RUSTFLAGS="-Ctarget-cpu=neoverse-n1" cargo build -j 2 --release $(FEATURES_ARG)
6363
ifdef HAS_OSXCROSS
6464
rustup target add aarch64-apple-darwin
6565
cd native && cargo build -j 2 --target aarch64-apple-darwin --release $(FEATURES_ARG)
6666
endif
6767

6868
core-amd64:
6969
rustup target add x86_64-apple-darwin
70-
cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
70+
cd native && RUSTFLAGS="-Ctarget-cpu=skylake" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
7171
mkdir -p common/target/classes/org/apache/comet/darwin/x86_64
7272
cp native/target/x86_64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/x86_64
73-
cd native && RUSTFLAGS="-Ctarget-cpu=haswell -Ctarget-feature=-prefer-256-bit" cargo build --release $(FEATURES_ARG)
73+
cd native && RUSTFLAGS="-Ctarget-cpu=x86-64-v3" cargo build --release $(FEATURES_ARG)
7474
mkdir -p common/target/classes/org/apache/comet/linux/amd64
7575
cp native/target/release/libcomet.so common/target/classes/org/apache/comet/linux/amd64
7676
jar -cf common/target/comet-native-x86_64.jar \
@@ -83,7 +83,7 @@ core-arm64:
8383
cd native && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release $(FEATURES_ARG)
8484
mkdir -p common/target/classes/org/apache/comet/darwin/aarch64
8585
cp native/target/aarch64-apple-darwin/release/libcomet.dylib common/target/classes/org/apache/comet/darwin/aarch64
86-
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)
86+
cd native && RUSTFLAGS="-Ctarget-cpu=neoverse-n1" cargo build --release $(FEATURES_ARG)
8787
mkdir -p common/target/classes/org/apache/comet/linux/aarch64
8888
cp native/target/release/libcomet.so common/target/classes/org/apache/comet/linux/aarch64
8989
jar -cf common/target/comet-native-aarch64.jar \
@@ -94,8 +94,8 @@ core-arm64:
9494
release-linux: clean
9595
rustup target add aarch64-apple-darwin x86_64-apple-darwin
9696
cd native && RUSTFLAGS="-Ctarget-cpu=apple-m1" CC=arm64-apple-darwin21.4-clang CXX=arm64-apple-darwin21.4-clang++ CARGO_FEATURE_NEON=1 cargo build --target aarch64-apple-darwin --release $(FEATURES_ARG)
97-
cd native && RUSTFLAGS="-Ctarget-cpu=skylake -Ctarget-feature=-prefer-256-bit" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
98-
cd native && RUSTFLAGS="-Ctarget-cpu=native -Ctarget-feature=-prefer-256-bit" cargo build --release $(FEATURES_ARG)
97+
cd native && RUSTFLAGS="-Ctarget-cpu=skylake" CC=o64-clang CXX=o64-clang++ cargo build --target x86_64-apple-darwin --release $(FEATURES_ARG)
98+
cd native && RUSTFLAGS="-Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)
9999
./mvnw install -Prelease -DskipTests $(PROFILES)
100100
release:
101101
cd native && RUSTFLAGS="$(RUSTFLAGS) -Ctarget-cpu=native" cargo build --release $(FEATURES_ARG)

common/src/main/java/org/apache/arrow/c/AbstractCometSchemaImporter.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
import org.apache.arrow.vector.FieldVector;
2424
import org.apache.arrow.vector.types.pojo.Field;
2525

26+
import org.apache.comet.IcebergApi;
27+
2628
/** This is a simple wrapper around SchemaImporter to make it accessible from Java Arrow. */
2729
public abstract class AbstractCometSchemaImporter {
2830
private final BufferAllocator allocator;
@@ -67,6 +69,7 @@ public FieldVector importVector(ArrowArray array, ArrowSchema schema) {
6769
return vector;
6870
}
6971

72+
@IcebergApi
7073
public void close() {
7174
provider.close();
7275
}

common/src/main/java/org/apache/comet/CometSchemaImporter.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
import org.apache.arrow.memory.BufferAllocator;
2424

2525
/** This is a simple wrapper around SchemaImporter to make it accessible from Java Arrow. */
26+
@IcebergApi
2627
public class CometSchemaImporter extends AbstractCometSchemaImporter {
28+
@IcebergApi
2729
public CometSchemaImporter(BufferAllocator allocator) {
2830
super(allocator);
2931
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.comet;
21+
22+
import java.lang.annotation.Documented;
23+
import java.lang.annotation.ElementType;
24+
import java.lang.annotation.Retention;
25+
import java.lang.annotation.RetentionPolicy;
26+
import java.lang.annotation.Target;
27+
28+
/**
29+
* Indicates that the annotated element is part of the public API used by Apache Iceberg.
30+
*
31+
* <p>This annotation marks classes, methods, constructors, and fields that form the contract
32+
* between Comet and Iceberg. Changes to these APIs may break Iceberg's Comet integration, so
33+
* contributors should exercise caution and consider backward compatibility when modifying annotated
34+
* elements.
35+
*
36+
* <p>The Iceberg integration uses Comet's native Parquet reader for accelerated vectorized reads.
37+
* See the contributor guide documentation for details on how Iceberg uses these APIs.
38+
*
39+
* @see <a href="https://iceberg.apache.org/">Apache Iceberg</a>
40+
*/
41+
@Documented
42+
@Retention(RetentionPolicy.RUNTIME)
43+
@Target({ElementType.TYPE, ElementType.METHOD, ElementType.CONSTRUCTOR, ElementType.FIELD})
44+
public @interface IcebergApi {}

common/src/main/java/org/apache/comet/parquet/AbstractColumnReader.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,11 @@
2828
import org.apache.spark.sql.types.TimestampNTZType$;
2929

3030
import org.apache.comet.CometConf;
31+
import org.apache.comet.IcebergApi;
3132
import org.apache.comet.vector.CometVector;
3233

3334
/** Base class for Comet Parquet column reader implementations. */
35+
@IcebergApi
3436
public abstract class AbstractColumnReader implements AutoCloseable {
3537
protected static final Logger LOG = LoggerFactory.getLogger(AbstractColumnReader.class);
3638

@@ -61,7 +63,7 @@ public abstract class AbstractColumnReader implements AutoCloseable {
6163
protected int batchSize;
6264

6365
/** A pointer to the native implementation of ColumnReader. */
64-
protected long nativeHandle;
66+
@IcebergApi protected long nativeHandle;
6567

6668
AbstractColumnReader(
6769
DataType type,
@@ -96,6 +98,7 @@ String getPath() {
9698
/**
9799
* Set the batch size of this reader to be 'batchSize'. Also initializes the native column reader.
98100
*/
101+
@IcebergApi
99102
public void setBatchSize(int batchSize) {
100103
assert nativeHandle == 0
101104
: "Native column reader shouldn't be initialized before " + "'setBatchSize' is called";
@@ -113,6 +116,7 @@ public void setBatchSize(int batchSize) {
113116
/** Returns the {@link CometVector} read by this reader. */
114117
public abstract CometVector currentBatch();
115118

119+
@IcebergApi
116120
@Override
117121
public void close() {
118122
if (nativeHandle != 0) {

common/src/main/java/org/apache/comet/parquet/BatchReader.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565

6666
import org.apache.comet.CometConf;
6767
import org.apache.comet.CometSchemaImporter;
68+
import org.apache.comet.IcebergApi;
6869
import org.apache.comet.shims.ShimBatchReader;
6970
import org.apache.comet.shims.ShimFileFormat;
7071
import org.apache.comet.vector.CometVector;
@@ -87,6 +88,7 @@
8788
* }
8889
* </pre>
8990
*/
91+
@IcebergApi
9092
public class BatchReader extends RecordReader<Void, ColumnarBatch> implements Closeable {
9193
private static final Logger LOG = LoggerFactory.getLogger(FileReader.class);
9294
protected static final BufferAllocator ALLOCATOR = new RootAllocator();
@@ -186,9 +188,9 @@ public BatchReader(
186188
}
187189

188190
/**
189-
* @deprecated since 0.10.0, will be removed in 0.11.0.
190191
* @see <a href="https://github.com/apache/datafusion-comet/issues/2079">Comet Issue #2079</a>
191192
*/
193+
@IcebergApi
192194
public BatchReader(AbstractColumnReader[] columnReaders) {
193195
// Todo: set useDecimal128 and useLazyMaterialization
194196
int numColumns = columnReaders.length;
@@ -384,17 +386,17 @@ public void init() throws URISyntaxException, IOException {
384386
}
385387

386388
/**
387-
* @deprecated since 0.10.0, will be removed in 0.11.0.
388389
* @see <a href="https://github.com/apache/datafusion-comet/issues/2079">Comet Issue #2079</a>
389390
*/
391+
@IcebergApi
390392
public void setSparkSchema(StructType schema) {
391393
this.sparkSchema = schema;
392394
}
393395

394396
/**
395-
* @deprecated since 0.10.0, will be removed in 0.11.0.
396397
* @see <a href="https://github.com/apache/datafusion-comet/issues/2079">Comet Issue #2079</a>
397398
*/
399+
@IcebergApi
398400
public AbstractColumnReader[] getColumnReaders() {
399401
return columnReaders;
400402
}
@@ -498,6 +500,7 @@ public boolean nextBatch() throws IOException {
498500
return nextBatch(batchSize);
499501
}
500502

503+
@IcebergApi
501504
public boolean nextBatch(int batchSize) {
502505
long totalDecodeTime = 0, totalLoadTime = 0;
503506
for (int i = 0; i < columnReaders.length; i++) {
@@ -524,6 +527,7 @@ public boolean nextBatch(int batchSize) {
524527
return true;
525528
}
526529

530+
@IcebergApi
527531
@Override
528532
public void close() throws IOException {
529533
if (columnReaders != null) {

0 commit comments

Comments
 (0)