Skip to content

Commit 94f4527

Browse files
authored
fix: audit array_insert expression for correctness and test coverage (#3890)
1 parent ad838ad commit 94f4527

5 files changed

Lines changed: 328 additions & 3 deletions

File tree

.claude/skills/audit-comet-expression/SKILL.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,20 @@ After implementing tests, tell the user how to run them:
311311

312312
---
313313

314+
## Step 8: Update the Expression Audit Log
315+
316+
After completing the audit (whether or not tests were added), append a row to the audit log at
317+
`docs/source/contributor-guide/expression-audit-log.md`.
318+
319+
The row should include:
320+
321+
- Expression name
322+
- Spark versions checked (e.g. 3.4.3, 3.5.8, 4.0.1)
323+
- Today's date
324+
- A brief summary of findings (behavioral differences, bugs found/fixed, tests added, known incompatibilities)
325+
326+
---
327+
314328
## Output Format
315329

316330
Present the audit as:
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one
3+
or more contributor license agreements. See the NOTICE file
4+
distributed with this work for additional information
5+
regarding copyright ownership. The ASF licenses this file
6+
to you under the Apache License, Version 2.0 (the
7+
"License"); you may not use this file except in compliance
8+
with the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing,
13+
software distributed under the License is distributed on an
14+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
KIND, either express or implied. See the License for the
16+
specific language governing permissions and limitations
17+
under the License.
18+
-->
19+
20+
# Expression Audit Log
21+
22+
This document tracks which Comet expressions have been audited against their Spark
23+
implementations for correctness and test coverage.
24+
25+
Each audit compares the Comet implementation against the Spark source code for the listed
26+
versions, reviews existing test coverage, identifies gaps, and adds missing tests where needed.
27+
28+
## Audited Expressions
29+
30+
| Expression | Spark Versions Checked | Date | Findings |
31+
| -------------- | ---------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
32+
| `array_insert` | 3.4.3, 3.5.8, 4.0.1 | 2026-04-02 | No behavioral differences across Spark versions. Fixed `nullable()` metadata bug (did not account for `pos_expr`). Added SQL tests for multiple types (string, boolean, double, float, long, short, byte), literal arguments, null handling, negative indices, out-of-bounds padding, special float values (NaN, Infinity), multibyte UTF-8, and legacy negative index mode. Known incompatibility: pos=0 error message differs from Spark's `INVALID_INDEX_OF_ZERO`. |

native/spark-expr/src/array_funcs/array_insert.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ impl PhysicalExpr for ArrayInsert {
105105
}
106106

107107
fn nullable(&self, input_schema: &Schema) -> DataFusionResult<bool> {
108-
self.src_array_expr.nullable(input_schema)
108+
Ok(self.src_array_expr.nullable(input_schema)? || self.pos_expr.nullable(input_schema)?)
109109
}
110110

111111
fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult<ColumnarValue> {

spark/src/test/resources/sql-tests/expressions/array/array_insert.sql

Lines changed: 224 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,234 @@
1616
-- under the License.
1717

1818
-- ConfigMatrix: parquet.enable.dictionary=false,true
19+
-- Config: spark.comet.expression.ArrayInsert.allowIncompatible=true
20+
21+
-- ============================================================
22+
-- Integer arrays with column arguments
23+
-- ============================================================
1924

2025
statement
2126
CREATE TABLE test_array_insert(arr array<int>, pos int, val int) USING parquet
2227

2328
statement
24-
INSERT INTO test_array_insert VALUES (array(1, 2, 3), 2, 10), (array(1, 2, 3), 1, 10), (array(1, 2, 3), 4, 10), (array(), 1, 10), (NULL, 1, 10)
29+
INSERT INTO test_array_insert VALUES
30+
(array(1, 2, 3), 2, 10),
31+
(array(1, 2, 3), 1, 10),
32+
(array(1, 2, 3), 4, 10),
33+
(array(), 1, 10),
34+
(NULL, 1, 10),
35+
(array(1, 2, 3), NULL, 10),
36+
(array(1, 2, 3), 3, NULL)
2537

26-
query spark_answer_only
38+
-- basic column arguments
39+
query
2740
SELECT array_insert(arr, pos, val) FROM test_array_insert
41+
42+
-- ============================================================
43+
-- Literal arguments (all-literal queries test native eval
44+
-- because CometSqlFileTestSuite disables constant folding)
45+
-- ============================================================
46+
47+
-- basic insert at various positions
48+
query
49+
SELECT array_insert(array(1, 2, 3), 2, 10)
50+
51+
-- prepend (pos=1)
52+
query
53+
SELECT array_insert(array(1, 2, 3), 1, 10)
54+
55+
-- append (pos = len+1)
56+
query
57+
SELECT array_insert(array(1, 2, 3), 4, 10)
58+
59+
-- insert into empty array
60+
query
61+
SELECT array_insert(array(), 1, 10)
62+
63+
-- ============================================================
64+
-- NULL handling
65+
-- ============================================================
66+
67+
-- null array
68+
query
69+
SELECT array_insert(CAST(NULL AS ARRAY<INT>), 1, 10)
70+
71+
-- null position
72+
query
73+
SELECT array_insert(array(1, 2, 3), CAST(NULL AS INT), 10)
74+
75+
-- null value (insert null into array)
76+
query
77+
SELECT array_insert(array(1, 2, 3), 2, CAST(NULL AS INT))
78+
79+
-- null value appended beyond end
80+
query
81+
SELECT array_insert(array(1, 2, 3, NULL), 4, CAST(NULL AS INT))
82+
83+
-- array with existing nulls
84+
query
85+
SELECT array_insert(array(1, NULL, 3), 2, 10)
86+
87+
-- ============================================================
88+
-- Positive out-of-bounds (null padding)
89+
-- ============================================================
90+
91+
-- one beyond end
92+
query
93+
SELECT array_insert(array(1, 2, 3), 5, 99)
94+
95+
-- far beyond end
96+
query
97+
SELECT array_insert(array(1, 2, 3), 10, 99)
98+
99+
-- ============================================================
100+
-- Negative indices (non-legacy mode, which is the default)
101+
-- ============================================================
102+
103+
-- -1 appends after last element
104+
query
105+
SELECT array_insert(array(1, 2, 3), -1, 10)
106+
107+
-- -2 inserts before last element
108+
query
109+
SELECT array_insert(array(1, 2, 3), -2, 10)
110+
111+
-- -3 inserts before second-to-last
112+
query
113+
SELECT array_insert(array(1, 2, 3), -3, 10)
114+
115+
-- -4 inserts before first element (len = 3, so -4 = before start)
116+
query
117+
SELECT array_insert(array(1, 2, 3), -4, 10)
118+
119+
-- negative beyond start (null padding)
120+
query
121+
SELECT array_insert(array(1, 2, 3), -5, 10)
122+
123+
-- far negative beyond start
124+
query
125+
SELECT array_insert(array(1, 2, 3), -10, 10)
126+
127+
-- ============================================================
128+
-- pos=0 error
129+
-- Note: Spark throws INVALID_INDEX_OF_ZERO, Comet throws a different message.
130+
-- Cannot use expect_error because patterns differ. This is a known incompatibility.
131+
-- ============================================================
132+
133+
-- ============================================================
134+
-- String arrays
135+
-- ============================================================
136+
137+
statement
138+
CREATE TABLE test_array_insert_str(arr array<string>, pos int, val string) USING parquet
139+
140+
statement
141+
INSERT INTO test_array_insert_str VALUES
142+
(array('a', 'b', 'c'), 2, 'd'),
143+
(array('a', 'b', 'c'), 1, 'd'),
144+
(array('a', 'b', 'c'), 4, 'd'),
145+
(array(), 1, 'x'),
146+
(NULL, 1, 'x'),
147+
(array('a', NULL, 'c'), 2, 'z'),
148+
(array('a', 'b', 'c'), 3, NULL)
149+
150+
-- column arguments with strings
151+
query
152+
SELECT array_insert(arr, pos, val) FROM test_array_insert_str
153+
154+
-- literal string arrays
155+
query
156+
SELECT array_insert(array('hello', 'world'), 1, 'first')
157+
158+
query
159+
SELECT array_insert(array('hello', 'world'), 3, 'last')
160+
161+
-- empty strings
162+
query
163+
SELECT array_insert(array('', 'a', ''), 2, '')
164+
165+
-- multibyte UTF-8 characters
166+
query
167+
SELECT array_insert(array('hello', 'world'), 2, 'cafe\u0301')
168+
169+
query
170+
SELECT array_insert(array('abc', 'def'), 1, '中文')
171+
172+
-- ============================================================
173+
-- Boolean arrays
174+
-- ============================================================
175+
176+
statement
177+
CREATE TABLE test_array_insert_bool(arr array<boolean>, pos int, val boolean) USING parquet
178+
179+
statement
180+
INSERT INTO test_array_insert_bool VALUES
181+
(array(true, false, true), 2, false),
182+
(array(true, false), 3, true),
183+
(NULL, 1, true),
184+
(array(true), 1, NULL)
185+
186+
query
187+
SELECT array_insert(arr, pos, val) FROM test_array_insert_bool
188+
189+
query
190+
SELECT array_insert(array(true, false, true), 3, true)
191+
192+
-- ============================================================
193+
-- Double arrays
194+
-- ============================================================
195+
196+
statement
197+
CREATE TABLE test_array_insert_double(arr array<double>, pos int, val double) USING parquet
198+
199+
statement
200+
INSERT INTO test_array_insert_double VALUES
201+
(array(1.1, 2.2, 3.3), 2, 4.4),
202+
(array(1.1, 2.2), 3, 0.0),
203+
(NULL, 1, 1.0),
204+
(array(1.1, 2.2), 1, NULL)
205+
206+
query
207+
SELECT array_insert(arr, pos, val) FROM test_array_insert_double
208+
209+
-- special float values
210+
query
211+
SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 2, CAST('NaN' AS DOUBLE))
212+
213+
query
214+
SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 2, CAST('Infinity' AS DOUBLE))
215+
216+
query
217+
SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 2, CAST('-Infinity' AS DOUBLE))
218+
219+
-- negative zero
220+
query
221+
SELECT array_insert(array(CAST(1.0 AS DOUBLE), CAST(2.0 AS DOUBLE)), 1, CAST(-0.0 AS DOUBLE))
222+
223+
-- ============================================================
224+
-- Long arrays
225+
-- ============================================================
226+
227+
query
228+
SELECT array_insert(array(CAST(1 AS BIGINT), CAST(2 AS BIGINT)), 2, CAST(3 AS BIGINT))
229+
230+
-- ============================================================
231+
-- Short arrays
232+
-- ============================================================
233+
234+
query
235+
SELECT array_insert(array(CAST(1 AS SMALLINT), CAST(2 AS SMALLINT)), 2, CAST(3 AS SMALLINT))
236+
237+
-- ============================================================
238+
-- Byte arrays
239+
-- ============================================================
240+
241+
query
242+
SELECT array_insert(array(CAST(1 AS TINYINT), CAST(2 AS TINYINT)), 2, CAST(3 AS TINYINT))
243+
244+
-- ============================================================
245+
-- Float arrays
246+
-- ============================================================
247+
248+
query
249+
SELECT array_insert(array(CAST(1.1 AS FLOAT), CAST(2.2 AS FLOAT)), 2, CAST(3.3 AS FLOAT))
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
-- Licensed to the Apache Software Foundation (ASF) under one
2+
-- or more contributor license agreements. See the NOTICE file
3+
-- distributed with this work for additional information
4+
-- regarding copyright ownership. The ASF licenses this file
5+
-- to you under the Apache License, Version 2.0 (the
6+
-- "License"); you may not use this file except in compliance
7+
-- with the License. You may obtain a copy of the License at
8+
--
9+
-- http://www.apache.org/licenses/LICENSE-2.0
10+
--
11+
-- Unless required by applicable law or agreed to in writing,
12+
-- software distributed under the License is distributed on an
13+
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
-- KIND, either express or implied. See the License for the
15+
-- specific language governing permissions and limitations
16+
-- under the License.
17+
18+
-- Tests for array_insert with legacy negative index mode enabled.
19+
-- In legacy mode, -1 inserts BEFORE the last element (not after).
20+
21+
-- ConfigMatrix: parquet.enable.dictionary=false,true
22+
-- Config: spark.comet.expression.ArrayInsert.allowIncompatible=true
23+
-- Config: spark.sql.legacy.negativeIndexInArrayInsert=true
24+
25+
-- -1 inserts before last element in legacy mode
26+
query
27+
SELECT array_insert(array(1, 2, 3), -1, 10)
28+
29+
-- -2 inserts before second-to-last
30+
query
31+
SELECT array_insert(array(1, 2, 3), -2, 10)
32+
33+
-- -3 inserts before first element
34+
query
35+
SELECT array_insert(array(1, 2, 3), -3, 10)
36+
37+
-- negative beyond start with null padding (legacy mode pads differently)
38+
query
39+
SELECT array_insert(array(1, 2, 3), -5, 10)
40+
41+
-- far negative beyond start
42+
query
43+
SELECT array_insert(array(1, 3, 4), -2, 2)
44+
45+
-- column-based test
46+
statement
47+
CREATE TABLE test_ai_legacy(arr array<int>, pos int, val int) USING parquet
48+
49+
statement
50+
INSERT INTO test_ai_legacy VALUES
51+
(array(1, 2, 3), -1, 10),
52+
(array(4, 5), -1, 20),
53+
(array(1, 2, 3), -4, 10),
54+
(NULL, -1, 10)
55+
56+
query
57+
SELECT array_insert(arr, pos, val) FROM test_ai_legacy

0 commit comments

Comments
 (0)