fix(iceberg): defer identifier field update when replacing columns in schema evolution (#74723)

benmoriceau · devin-ai-integration[bot] · web-flow · commit 00f61e3d6fe2 · 2026-03-12T07:45:14.000-07:00
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
diff --git a/airbyte-cdk/bulk/core/load/changelog.md b/airbyte-cdk/bulk/core/load/changelog.md
@@ -9,6 +9,7 @@ The Load CDK provides functionality for destination connectors including stream-
 
 | Version | Date       | Pull Request | Subject                                                                                         |
 |---------|------------|--------------|-------------------------------------------------------------------------------------------------|
+| 1.0.5   | 2026-03-10 | [#74723](https://github.com/airbytehq/airbyte/pull/74723) | Fix schema evolution: defer identifier field update when replacing columns to avoid Iceberg conflict. |
 | 1.0.4   | 2026-03-05 | [#74328](https://github.com/airbytehq/airbyte/pull/74328) | Fix iceberg dedup: map PK NumberType to StringType instead of DecimalType for identifier field compatibility. |
 | 1.0.3   | 2026-03-05 | [#74272](https://github.com/airbytehq/airbyte/pull/74272) | Fix iceberg dedup.                                                                              |
 | 1.0.2   | 2026-02-24 | | Bump bulk-cdk-core-base to 1.0.1 to pick up CVE fixes (CVE-2021-47621, CVE-2022-36944).         |
diff --git a/airbyte-cdk/bulk/core/load/version.properties b/airbyte-cdk/bulk/core/load/version.properties
@@ -1 +1 @@
-version=1.0.4
+version=1.0.5
diff --git a/airbyte-cdk/bulk/toolkits/load-iceberg-parquet/src/main/kotlin/io/airbyte/cdk/load/toolkits/iceberg/parquet/IcebergTableSynchronizer.kt b/airbyte-cdk/bulk/toolkits/load-iceberg-parquet/src/main/kotlin/io/airbyte/cdk/load/toolkits/iceberg/parquet/IcebergTableSynchronizer.kt
@@ -94,6 +94,7 @@ class IcebergTableSynchronizer(
         // 2) Update types => find a supertype for each changed column
         val columnsToReplaceInSecondCommit =
             mutableMapOf<String, org.apache.iceberg.types.Types.NestedField>()
+        val replacedColumns = mutableSetOf<String>()
 
         diff.updatedDataTypes.forEach { columnName ->
             val existingField =
@@ -134,6 +135,7 @@ class IcebergTableSynchronizer(
                         update.deleteColumn(columnName)
                         update.addColumn(columnName, incomingField.type())
                     }
+                    replacedColumns.add(columnName)
                 }
             }
         }
@@ -188,8 +190,17 @@ class IcebergTableSynchronizer(
         }
 
         // 5) Update identifier fields
-        if (diff.identifierFieldsChanged) {
-            val updatedIdentifierFields = incomingSchema.identifierFieldNames().toList()
+        // Iceberg's requireColumn() fails for columns pending deletion (even if they're
+        // being re-added in the same update). When replaced columns are also identifier
+        // fields, we must defer the identifier field update to a follow-up commit.
+        val updatedIdentifierFields =
+            if (diff.identifierFieldsChanged) incomingSchema.identifierFieldNames().toList()
+            else emptyList()
+        val hasReplacedIdentifierFields =
+            replacedColumns.any { it in updatedIdentifierFields.toSet() }
+
+        if (diff.identifierFieldsChanged && !hasReplacedIdentifierFields) {
+            // No conflict: can update identifier fields in the same update
             updatedIdentifierFields.forEach { update.requireColumn(it) }
             update.setIdentifierFields(updatedIdentifierFields)
         }
@@ -211,6 +222,12 @@ class IcebergTableSynchronizer(
                 addUpdate.addColumn(null, columnName, field.type())
             }
 
+            // If identifier fields were deferred, handle them now (columns have been re-added)
+            if (hasReplacedIdentifierFields) {
+                updatedIdentifierFields.forEach { addUpdate.requireColumn(it) }
+                addUpdate.setIdentifierFields(updatedIdentifierFields)
+            }
+
             // Commit or defer the add operation based on columnTypeChangeBehavior
             val finalSchema = addUpdate.apply()
             return if (columnTypeChangeBehavior.commitImmediately) {
@@ -221,6 +238,25 @@ class IcebergTableSynchronizer(
             }
         }
 
+        // If replaced columns are also identifier fields, commit column replacements first,
+        // then handle identifier fields in a follow-up update.
+        if (hasReplacedIdentifierFields) {
+            update.commit()
+            table.refresh()
+
+            val identifierUpdate = table.updateSchema().allowIncompatibleChanges()
+            updatedIdentifierFields.forEach { identifierUpdate.requireColumn(it) }
+            identifierUpdate.setIdentifierFields(updatedIdentifierFields)
+
+            val newSchema = identifierUpdate.apply()
+            return if (columnTypeChangeBehavior.commitImmediately) {
+                identifierUpdate.commit()
+                SchemaUpdateResult(newSchema, pendingUpdates = emptyList())
+            } else {
+                SchemaUpdateResult(newSchema, pendingUpdates = listOf(identifierUpdate))
+            }
+        }
+
         // `apply` just validates that the schema change is valid, it doesn't actually commit().
         // It returns the schema that the table _would_ have after committing.
         val newSchema: Schema = update.apply()
diff --git a/airbyte-cdk/bulk/toolkits/load-iceberg-parquet/src/test/kotlin/io/airbyte/cdk/load/toolkits/iceberg/parquet/IcebergTableSynchronizerTest.kt b/airbyte-cdk/bulk/toolkits/load-iceberg-parquet/src/test/kotlin/io/airbyte/cdk/load/toolkits/iceberg/parquet/IcebergTableSynchronizerTest.kt
@@ -421,4 +421,56 @@ class IcebergTableSynchronizerTest {
         assertThat(schema).isSameAs(mockNewSchema)
         assertThat(pendingUpdates).hasSize(1)
     }
+
+    @Test
+    fun `test overwrite with replaced column as identifier field defers identifier update`() {
+        // Scenario: pk_col changes type (Double -> String) and is an identifier field in the
+        // incoming schema. Iceberg's requireColumn() fails for a column that is pending
+        // deletion, so the synchronizer must commit the column replacement first and then
+        // set the identifier fields in a separate follow-up update.
+        val existingSchema =
+            buildSchema(Types.NestedField.required(1, "pk_col", Types.DoubleType.get()))
+        val incomingSchema =
+            buildSchema(
+                Types.NestedField.required(1, "pk_col", Types.StringType.get()),
+                identifierFields = setOf(1)
+            )
+
+        every { mockTable.schema() } returns existingSchema
+
+        // Dedicated mock for the follow-up identifier update requested after the first commit
+        val mockIdentifierUpdateSchema = mockk<UpdateSchema>(relaxed = true)
+        val mockIdentifierNewSchema = mockk<Schema>(relaxed = true)
+        every { mockIdentifierUpdateSchema.apply() } returns mockIdentifierNewSchema
+
+        // returnsMany hands the mocks out in call order: the column-replacement update first,
+        // then the identifier update (requested after commit + refresh).
+        every { mockTable.updateSchema().allowIncompatibleChanges() } returnsMany
+            listOf(mockUpdateSchema, mockIdentifierUpdateSchema)
+
+        val (schema, pendingUpdates) =
+            synchronizer.maybeApplySchemaChanges(
+                mockTable,
+                incomingSchema,
+                ColumnTypeChangeBehavior.OVERWRITE
+            )
+
+        // First update: delete + re-add pk_col, committed right away despite OVERWRITE mode
+        verify { mockUpdateSchema.deleteColumn("pk_col") }
+        verify { mockUpdateSchema.addColumn("pk_col", Types.StringType.get()) }
+        verify { mockUpdateSchema.commit() }
+
+        // The table is refreshed as part of the two-step flow (call order is not asserted)
+        verify { mockTable.refresh() }
+
+        // Second update: requires pk_col and sets it as the only identifier field
+        verify { mockIdentifierUpdateSchema.requireColumn("pk_col") }
+        verify { mockIdentifierUpdateSchema.setIdentifierFields(listOf("pk_col")) }
+        // In OVERWRITE mode the identifier update is not committed here; it is returned pending
+        verify(exactly = 0) { mockIdentifierUpdateSchema.commit() }
+
+        assertThat(schema).isSameAs(mockIdentifierNewSchema)
+        assertThat(pendingUpdates).hasSize(1)
+        assertThat(pendingUpdates.first()).isSameAs(mockIdentifierUpdateSchema)
+    }
 }