Skip to content

Commit 9b5323b

Browse files
authored
fix: update sort order before columns to handle dropped columns in PK. Drop sort order when not-dedupe. (#75546)
1 parent 2621219 commit 9b5323b

File tree

4 files changed

+392
-1
lines changed

4 files changed

+392
-1
lines changed

airbyte-cdk/bulk/core/load/changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ The Load CDK provides functionality for destination connectors including stream-
99

1010
| Version | Date | Pull Request | Subject |
1111
|---------|------------|--------------|-------------------------------------------------------------------------------------------------|
12+
| 1.0.7 | 2026-03-27 | [#75546](https://github.com/airbytehq/airbyte/pull/75546) | Fix: update Iceberg sort order before schema evolution to prevent ValidationException when deleting columns referenced by the sort order. Handles Dedupe-to-Append mode switches and PK changes. |
1213
| 1.0.6 | 2026-03-12 | [#74715](https://github.com/airbytehq/airbyte/pull/74715) | Fix: drop temp table after successful upsert to prevent duplicate records across syncs. |
1314
| 1.0.5 | 2026-03-10 | [#74723](https://github.com/airbytehq/airbyte/pull/74723) | Fix schema evolution: defer identifier field update when replacing columns to avoid Iceberg conflict. |
1415
| 1.0.4 | 2026-03-05 | [#74328](https://github.com/airbytehq/airbyte/pull/74328) | Fix iceberg dedup: map PK NumberType to StringType instead of DecimalType for identifier field compatibility. |
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
version=1.0.6
1+
version=1.0.7

airbyte-cdk/bulk/toolkits/load-iceberg-parquet/src/main/kotlin/io/airbyte/cdk/load/toolkits/iceberg/parquet/IcebergTableSynchronizer.kt

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@ package io.airbyte.cdk.load.toolkits.iceberg.parquet
77
import io.airbyte.cdk.ConfigErrorException
88
import io.airbyte.cdk.load.toolkits.iceberg.parquet.IcebergTypesComparator.Companion.PARENT_CHILD_SEPARATOR
99
import io.airbyte.cdk.load.toolkits.iceberg.parquet.IcebergTypesComparator.Companion.splitIntoParentAndLeaf
10+
import io.github.oshai.kotlinlogging.KotlinLogging
1011
import jakarta.inject.Singleton
1112
import org.apache.iceberg.Schema
13+
import org.apache.iceberg.SortDirection
1214
import org.apache.iceberg.Table
1315
import org.apache.iceberg.UpdateSchema
1416
import org.apache.iceberg.types.Type
1517
import org.apache.iceberg.types.Type.PrimitiveType
1618

19+
private val logger = KotlinLogging.logger {}
20+
1721
/** Describes how the [IcebergTableSynchronizer] handles column type changes. */
1822
enum class ColumnTypeChangeBehavior {
1923
/**
@@ -86,6 +90,27 @@ class IcebergTableSynchronizer(
8690
return SchemaUpdateResult(existingSchema, pendingUpdates = emptyList())
8791
}
8892

93+
// Update the sort order before creating the UpdateSchema, because:
94+
// 1. Deleting a column referenced by the sort order will cause
95+
// SortOrder.checkCompatibility to throw ValidationException on commit.
96+
// 2. UpdateSchema captures the table's metadata version at creation time.
97+
// If we replace the sort order after creating it, the commit would fail
98+
// with a stale metadata error.
99+
val columnsBeingDeleted = buildList {
100+
addAll(diff.removedColumns)
101+
if (columnTypeChangeBehavior == ColumnTypeChangeBehavior.OVERWRITE) {
102+
// In OVERWRITE mode, type-changed columns are deleted and re-added
103+
// with new field IDs. The old sort field references become invalid.
104+
addAll(diff.updatedDataTypes)
105+
}
106+
}
107+
replaceSortOrderIfNeeded(
108+
table = table,
109+
columnsBeingDeleted = columnsBeingDeleted,
110+
identifierFieldsChanged = diff.identifierFieldsChanged,
111+
incomingIdentifierFieldNames = incomingSchema.identifierFieldNames(),
112+
)
113+
89114
val update: UpdateSchema = table.updateSchema().allowIncompatibleChanges()
90115

91116
// 1) Remove columns that no longer exist in the incoming schema
@@ -267,6 +292,90 @@ class IcebergTableSynchronizer(
267292
return SchemaUpdateResult(newSchema, pendingUpdates = listOf(update))
268293
}
269294
}
295+
296+
/**
 * Update the table's sort order if it would conflict with pending schema changes.
 *
 * Sort orders are set at table creation from identifier fields (PKs) and never updated. This
 * causes [org.apache.iceberg.exceptions.ValidationException] when schema evolution deletes a
 * column referenced by the sort order.
 *
 * This method handles three cases:
 * 1. Identifier fields changed: rebuild the sort order from the new identifiers (covers
 *    Dedupe→Append and PK changes within Dedupe). Identifiers that do not exist in the current
 *    schema yet, or that are about to be deleted, are left out of the new sort order.
 * 2. Columns being deleted conflict with the sort order: remove those fields.
 * 3. Neither: no-op.
 *
 * Must be called BEFORE creating the [UpdateSchema], since this commits a metadata change and
 * the subsequent UpdateSchema needs the refreshed metadata version.
 *
 * @param table the table whose sort order is inspected and, if needed, replaced and refreshed.
 * @param columnsBeingDeleted names of columns the pending schema update will delete (including
 * columns that are deleted and re-added with a new field ID).
 * @param identifierFieldsChanged whether the identifier fields differ from the existing schema.
 * @param incomingIdentifierFieldNames identifier field names of the incoming schema; empty when
 * the stream no longer has identifiers.
 */
private fun replaceSortOrderIfNeeded(
    table: Table,
    columnsBeingDeleted: List<String>,
    identifierFieldsChanged: Boolean,
    incomingIdentifierFieldNames: Set<String>,
) {
    val currentSortOrder = table.sortOrder()

    // If the table has no sort order, there's nothing to conflict and nothing to update.
    // (Append→Dedupe would need a sort order added, but that case requires a reset.)
    if (currentSortOrder.isUnsorted) {
        return
    }

    val schema = table.schema()

    if (identifierFieldsChanged) {
        // Rebuild sort order from the new identifier fields.
        // For Dedupe→Append: incoming identifiers are empty → unsorted.
        // For PK changes within Dedupe: new identifiers → new sort order.
        val namesBeingDeleted = columnsBeingDeleted.toSet()
        val newSortFields =
            incomingIdentifierFieldNames.filter { fieldName ->
                // Skip fields that are about to be deleted (e.g. type-changed columns that are
                // dropped and re-added with a new field ID): referencing them here would make
                // the subsequent schema commit fail the sort-order compatibility check.
                // Also skip fields missing from the current schema: fields being added in the
                // same schema change can't be referenced yet.
                fieldName !in namesBeingDeleted && schema.findField(fieldName) != null
            }
        val builder = table.replaceSortOrder()
        newSortFields.forEach { builder.asc(it) }
        logger.info {
            "Replacing sort order due to identifier field change. " +
                "New sort fields: ${newSortFields.ifEmpty { listOf("(unsorted)") }}"
        }
        builder.commit()
        table.refresh()
        return
    }

    // No identifier change — check if any deleted columns conflict with the sort order.
    if (columnsBeingDeleted.isEmpty()) {
        return
    }

    val fieldIdsBeingDeleted =
        columnsBeingDeleted.mapNotNull { schema.findField(it)?.fieldId() }.toSet()

    val survivingSortFields =
        currentSortOrder.fields().filter { it.sourceId() !in fieldIdsBeingDeleted }
    if (survivingSortFields.size == currentSortOrder.fields().size) {
        // None of the sort fields reference a column being deleted.
        return
    }

    // Rebuild the sort order, keeping only fields that aren't being deleted.
    // NOTE(review): fields are re-added by column name, which presumably assumes identity
    // transforms on the existing sort fields — confirm no other transforms are in use.
    val builder = table.replaceSortOrder()
    for (sortField in survivingSortFields) {
        val fieldName = schema.findColumnName(sortField.sourceId())
        if (sortField.direction() == SortDirection.DESC) {
            builder.desc(fieldName, sortField.nullOrder())
        } else {
            builder.asc(fieldName, sortField.nullOrder())
        }
    }
    logger.info { "Replacing sort order to remove fields being deleted: $columnsBeingDeleted" }
    builder.commit()
    table.refresh()
}
270379
}
271380

272381
/**
 * Pairs a [schema] with the [pendingUpdates] that accompany it.
 *
 * NOTE(review): presumably [pendingUpdates] holds [UpdateSchema] operations that have been
 * prepared but not yet committed, and is empty when no schema change is needed — confirm
 * against the callers.
 */
data class SchemaUpdateResult(val schema: Schema, val pendingUpdates: List<UpdateSchema>)

0 commit comments

Comments
 (0)