Skip to content

Commit a0a5802

Browse files
committed
add comments and additional test
1 parent d4ed023 commit a0a5802

4 files changed

Lines changed: 33 additions & 2 deletions

File tree

docs/source/user-guide/latest/compatibility.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,27 @@ suffixes (e.g. `Europe/Moscow`), and the full Spark timestamp year range
154154
(-290308 to 294247). Note that `CAST(string AS DATE)` is only compatible for years between
155155
262143 BC and 262142 AD due to an underlying library limitation.
156156

157+
### TimestampNTZ (Timestamp Without Timezone)
158+
159+
Comet supports casting to and from `TimestampNTZType` with the following compatibility:
160+
161+
| Cast Direction | Compatible? | Timezone Dependent? | Notes |
162+
|---|---|---|---|
163+
| TimestampNTZ -> String | C | No | Formats local time as-is |
164+
| TimestampNTZ -> Date | C | No | Extracts date from local datetime |
165+
| TimestampNTZ -> Timestamp | C | Yes | Interprets NTZ value as local time in session timezone, converts to UTC |
166+
| Date -> TimestampNTZ | C | No | Midnight on the given date |
167+
| Timestamp -> TimestampNTZ | C | Yes | Converts UTC epoch to local time in session timezone |
168+
| String -> TimestampNTZ | I | - | Not yet implemented ([#378](https://github.com/apache/datafusion-comet/issues/378)) |
169+
170+
**Timezone-independent casts** (NTZ to String, and NTZ to/from Date) use pure arithmetic on the stored
171+
microsecond value and are not affected by the session timezone setting.
172+
173+
**Timezone-dependent casts** (NTZ to/from Timestamp) use the session timezone
174+
(`spark.sql.session.timeZone`) to convert between local time and UTC. DST transitions are handled
175+
correctly: ambiguous times (fall-back) resolve to the earlier occurrence, and gap times
176+
(spring-forward) are adjusted forward.
177+
157178
### Legacy Mode
158179

159180
<!--BEGIN:CAST_LEGACY_TABLE-->

native/spark-expr/src/conversion_funcs/cast.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,9 @@ pub(crate) fn cast_array(
453453
(Timestamp(_, Some(_)), Timestamp(_, None)) => {
454454
Ok(cast_timestamp_to_ntz(array, &cast_options.timezone)?)
455455
}
456+
// NTZ → Date32 and NTZ → Utf8 are handled by the DataFusion fall-through below
457+
// (is_datafusion_spark_compatible returns true for Date32 and Utf8).
458+
// These casts are timezone-independent and DataFusion's implementation matches Spark.
456459
_ if cast_options.is_adapting_schema
457460
|| is_datafusion_spark_compatible(&from_type, to_type) =>
458461
{

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1516,6 +1516,11 @@ fn extract_offset_suffix(value: &str) -> Option<(&str, timezone::Tz)> {
15161516

15171517
type TimestampParsePattern<T> = (&'static Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
15181518

1519+
// RE_YEAR allows only 4-6 digits (not 7) because a bare 7-digit string like "0119704"
1520+
// is ambiguous and Spark rejects it. The other patterns (RE_MONTH, RE_DAY, etc.) keep
1521+
// \d{4,7} because the `-` separator disambiguates the year portion, so "0002020-01-01"
1522+
// is validly year 2020 with leading zeros. date_parser's is_valid_digits also allows up
1523+
// to 7 year digits for the same reason.
15191524
static RE_YEAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,6}$").unwrap());
15201525
static RE_MONTH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}$").unwrap());
15211526
static RE_DAY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}$").unwrap());

spark/src/test/resources/sql-tests/expressions/cast/cast_timestamp_ntz.sql

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
-- under the License.
1717

1818
-- Run once per session timezone to exercise TZ-sensitive casts (NTZ↔Timestamp)
19-
-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles,Asia/Kolkata
19+
-- ConfigMatrix: spark.sql.session.timeZone=UTC,America/Los_Angeles,America/New_York,Asia/Kolkata
2020

2121
statement
2222
CREATE TABLE test_ts_ntz(ts_ntz timestamp_ntz, ts timestamp, d date, id int) USING parquet
@@ -27,13 +27,15 @@ INSERT INTO test_ts_ntz VALUES
2727
(TIMESTAMP_NTZ'2023-06-15 12:30:45.123456', TIMESTAMP'2023-06-15 12:30:45.123456 UTC', DATE'2023-06-15', 2),
2828
(TIMESTAMP_NTZ'1970-01-01 00:00:00', TIMESTAMP'1970-01-01 00:00:00 UTC', DATE'1970-01-01', 3),
2929
(TIMESTAMP_NTZ'2024-03-10 02:30:00', TIMESTAMP'2024-03-10 10:00:00 UTC', DATE'2024-03-10', 4),
30-
(NULL, NULL, NULL, 5)
30+
(TIMESTAMP_NTZ'2020-06-15 23:00:00', TIMESTAMP'2020-06-15 23:00:00 UTC', DATE'2020-06-15', 5),
31+
(NULL, NULL, NULL, 6)
3132

3233
-- NTZ → String (timezone-independent: formats local time as-is)
3334
query
3435
SELECT cast(ts_ntz as string), id FROM test_ts_ntz ORDER BY id
3536

3637
-- NTZ → Date (timezone-independent: extracts date treating NTZ value as UTC)
38+
-- Row 5 (23:00) would produce 2020-06-16 in Kolkata (+5:30) if TZ were wrongly applied
3739
query
3840
SELECT cast(ts_ntz as date), id FROM test_ts_ntz ORDER BY id
3941

0 commit comments

Comments
 (0)