Skip to content

Commit a0a5802

Browse files
committed
add comments and additional test
1 parent d4ed023 commit a0a5802

4 files changed

Lines changed: 33 additions & 2 deletions

File tree

docs/source/user-guide/latest/compatibility.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,27 @@ suffixes (e.g. `Europe/Moscow`), and the full Spark timestamp year range
154154
(-290308 to 294247). Note that `CAST(string AS DATE)` is only compatible for years between
155155
262143 BC and 262142 AD due to an underlying library limitation.
156156

157+
### TimestampNTZ (Timestamp Without Timezone)
158+
159+
Comet supports casting to and from `TimestampNTZType` with the following compatibility:
160+
161+
| Cast Direction | Compatible? | Timezone Dependent? | Notes |
162+
|---|---|---|---|
163+
| TimestampNTZ -> String | C | No | Formats local time as-is |
164+
| TimestampNTZ -> Date | C | No | Extracts date from local datetime |
165+
| TimestampNTZ -> Timestamp | C | Yes | Interprets NTZ value as local time in session timezone, converts to UTC |
166+
| Date -> TimestampNTZ | C | No | Midnight on the given date |
167+
| Timestamp -> TimestampNTZ | C | Yes | Converts UTC epoch to local time in session timezone |
168+
| String -> TimestampNTZ | I | - | Not yet implemented ([#378](https://github.com/apache/datafusion-comet/issues/378)) |
169+
170+
**Timezone-independent casts** (NTZ to String, and NTZ to/from Date) use pure arithmetic on the stored
171+
microsecond value and are not affected by the session timezone setting.
172+
173+
**Timezone-dependent casts** (NTZ to/from Timestamp) use the session timezone
174+
(`spark.sql.session.timeZone`) to convert between local time and UTC. DST transitions are handled
175+
correctly: ambiguous times (fall-back) resolve to the earlier occurrence, and gap times
176+
(spring-forward) are adjusted forward.
177+
157178
### Legacy Mode
158179

159180
<!--BEGIN:CAST_LEGACY_TABLE-->

native/spark-expr/src/conversion_funcs/cast.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,9 @@ pub(crate) fn cast_array(
453453
(Timestamp(_, Some(_)), Timestamp(_, None)) => {
454454
Ok(cast_timestamp_to_ntz(array, &cast_options.timezone)?)
455455
}
456+
// NTZ → Date32 and NTZ → Utf8 are handled by the DataFusion fall-through below
457+
// (is_datafusion_spark_compatible returns true for Date32 and Utf8).
458+
// These casts are timezone-independent and DataFusion's implementation matches Spark.
456459
_ if cast_options.is_adapting_schema
457460
|| is_datafusion_spark_compatible(&from_type, to_type) =>
458461
{

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,6 +1516,11 @@ fn extract_offset_suffix(value: &str) -> Option<(&str, timezone::Tz)> {
15161516

15171517
type TimestampParsePattern<T> = (&'static Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
15181518

1519+
// RE_YEAR allows only 4-6 digits (not 7) because a bare 7-digit string like "0119704"
1520+
// is ambiguous and Spark rejects it. The other patterns (RE_MONTH, RE_DAY, etc.) keep
1521+
// \d{4,7} because the `-` separator disambiguates the year portion, so "0002020-01-01"
1522+
// is validly year 2020 with leading zeros. date_parser's is_valid_digits also allows up
1523+
// to 7 year digits for the same reason.
15191524
static RE_YEAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,6}$").unwrap());
15201525
static RE_MONTH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}$").unwrap());
15211526
static RE_DAY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}$").unwrap());

spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
-- under the License.
1717

1818
-- Run once per session timezone to exercise TZ-sensitive casts (NTZ↔Timestamp)
19-
-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles,Asia/Kolkata
19+
-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles,America/New_York,Asia/Kolkata
2020

2121
statement
2222
CREATE TABLE test_ts_ntz(ts_ntz timestamp_ntz, ts timestamp, d date, id int) USING parquet
@@ -27,13 +27,15 @@ INSERT INTO test_ts_ntz VALUES
2727
(TIMESTAMP_NTZ'2023-06-15 12:30:45.123456', TIMESTAMP'2023-06-15 12:30:45.123456 UTC', DATE'2023-06-15', 2),
2828
(TIMESTAMP_NTZ'1970-01-01 00:00:00', TIMESTAMP'1970-01-01 00:00:00 UTC', DATE'1970-01-01', 3),
2929
(TIMESTAMP_NTZ'2024-03-10 02:30:00', TIMESTAMP'2024-03-10 10:00:00 UTC', DATE'2024-03-10', 4),
30-
(NULL, NULL, NULL, 5)
30+
(TIMESTAMP_NTZ'2020-06-15 23:00:00', TIMESTAMP'2020-06-15 23:00:00 UTC', DATE'2020-06-15', 5),
31+
(NULL, NULL, NULL, 6)
3132

3233
-- NTZ → String (timezone-independent: formats local time as-is)
3334
query
3435
SELECT cast(ts_ntz as string), id FROM test_ts_ntz ORDER BY id
3536

3637
-- NTZ → Date (timezone-independent: extracts date treating NTZ value as UTC)
38+
-- Row 5 (23:00) would produce 2020-06-16 in Kolkata (+5:30) if TZ were wrongly applied
3739
query
3840
SELECT cast(ts_ntz as date), id FROM test_ts_ntz ORDER BY id
3941

0 commit comments

Comments
 (0)