-
Notifications
You must be signed in to change notification settings - Fork 9
fix(core): backend-independent V10 leaf canon — xsd:dateTime/time (OT-RFC-57, Tactical) #1399
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
3b2c37c
a1953dd
0df137d
e1946cd
609b0a1
3d23242
2fcf691
de5fd97
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -159,10 +159,6 @@ function decodeIriEscapes(iri: string): string { | |
| }); | ||
| } | ||
|
|
||
| // oxigraph normalizes the "negative zero" year -0000 to 0000. (Only -0000 reaches | ||
| // here: a leading-zero 5+-digit negative year fails the YEAR pattern → verbatim.) | ||
| const normYear = (yy: string) => (yy === '-0000' ? '0000' : yy); | ||
|
|
||
| // oxigraph stores temporal values as seconds-since-0001-01-01 in the same i128/1e18 | ||
| // Decimal as xsd:decimal/duration. A date/time whose scaled seconds overflow i128 | ||
| // fails to parse and is kept VERBATIM, so a foldable timezone / T24 roll / fraction | ||
|
|
@@ -177,6 +173,37 @@ function daysFromCivil(y: bigint, m: bigint, d: bigint): bigint { | |
| const doe = yoe * 365n + yoe / 4n - yoe / 100n + doy; | ||
| return era * 146097n + doe - 719468n; | ||
| } | ||
| // Inverse of daysFromCivil: proleptic-Gregorian (y,m,d) from a signed day count | ||
| // (days since 1970-01-01). Standard Howard Hinnant algorithm. Used to roll the | ||
| // DATE when a timezone offset pushes a dateTime across midnight during the | ||
| // backend-independent UTC normalization (OT-RFC-57). | ||
| function civilFromDays(zIn: bigint): { y: bigint; m: bigint; d: bigint } { | ||
| const z = zIn + 719468n; | ||
| const era = (z >= 0n ? z : z - 146096n) / 146097n; | ||
| const doe = z - era * 146097n; // [0, 146096] | ||
| const yoe = (doe - doe / 1460n + doe / 36524n - doe / 146096n) / 365n; // [0, 399] | ||
| const y = yoe + era * 400n; | ||
| const doy = doe - (365n * yoe + yoe / 4n - yoe / 100n); // [0, 365] | ||
| const mp = (5n * doy + 2n) / 153n; // [0, 11] | ||
| const d = doy - (153n * mp + 2n) / 5n + 1n; // [1, 31] | ||
| const m = mp < 10n ? mp + 3n : mp - 9n; // [1, 12] | ||
| return { y: m <= 2n ? y + 1n : y, m, d }; | ||
| } | ||
|
|
||
| // OT-RFC-57: the UTC date of "midnight in the given tz" — the backend-independent | ||
| // form for xsd:date / gYear / gYearMonth. Blazegraph interprets the value at 00:00 | ||
| // in its tz, converts to UTC, and takes the UTC date; a positive offset rolls the | ||
| // date back a day. offsetMin=0 (Z / no-tz) ⇒ the date is unchanged. | ||
| function utcDateFromMidnight( | ||
| y: bigint, | ||
| mo: bigint, | ||
| d: bigint, | ||
| offsetMin: number, | ||
| ): { y: bigint; m: bigint; d: bigint } { | ||
| const days = daysFromCivil(y, mo, d) + BigInt(Math.floor((0 - offsetMin) / 1440)); | ||
| return civilFromDays(days); | ||
| } | ||
|
|
||
| function temporalInRange(yearStr: string, mo: number, dd: number, hh = 0, mi = 0, ss = 0): boolean { | ||
| const seconds = | ||
| (daysFromCivil(BigInt(yearStr), BigInt(mo), BigInt(dd)) + 719162n) * 86400n + | ||
|
|
@@ -225,7 +252,11 @@ function canonDouble(lex: string, isFloat: boolean): string { | |
| if (Number.isNaN(n)) return 'NaN'; | ||
| if (n === Infinity) return 'INF'; | ||
| if (n === -Infinity) return '-INF'; | ||
| if (n === 0) return Object.is(n, -0) ? '-0' : '0'; | ||
| // OT-RFC-57: negative zero folds to "0". Blazegraph drops the sign on write | ||
| // ("-0.0"^^double → stored "0.0" → value 0), while oxigraph keeps "-0"; emitting | ||
| // "0" for both signed zeros makes canon(input) == canon(store-readback) on either | ||
| // backend. (The IEEE-754 -0/+0 distinction is not consensus-observable here.) | ||
| if (n === 0) return '0'; | ||
| const neg = n < 0; | ||
| const a = Math.abs(n); | ||
| // double: V8's a.toString() IS the shortest round-trip; only ties need the | ||
|
|
@@ -329,27 +360,31 @@ function stripTrailingZeros(s: string): string { | |
| } | ||
|
|
||
| // ── date/time family ─────────────────────────────────────────────────────────── | ||
| // Split + validate the trailing timezone, folding +00:00/-00:00 to Z. oxigraph | ||
| // accepts Z or ±HH:MM with |offset| ≤ 14:00 (HH≤14, MM≤59, total ≤ 840 min); | ||
| // anything else (incl. a malformed +0:00) leaves the timezone in `body`, where the | ||
| // per-type grammar then rejects it → the whole literal is kept verbatim. | ||
| function splitTz(s: string): { body: string; tz: string } { | ||
| // Returns the SIGNED offset in minutes (negative for a "-HH:MM" zone) for the | |
| // backend-independent UTC normalization of xsd:dateTime/xsd:time (OT-RFC-57). | ||
| // hadTz=false ⇒ no timezone present (a bare dateTime is normalized to UTC and | ||
| // gains a Z, matching Blazegraph/Neptune). Malformed/out-of-range tz → throw | ||
| // (→ the literal is kept verbatim, as oxigraph does). | ||
| function splitTzToOffset(s: string): { body: string; offsetMin: number; hadTz: boolean } { | ||
| const m = /(Z|[+-]\d{2}:\d{2})$/.exec(s); | ||
| if (!m) return { body: s, tz: '' }; | ||
| if (!m) return { body: s, offsetMin: 0, hadTz: false }; | ||
| const tz = m[1]; | ||
| const body = s.slice(0, s.length - tz.length); | ||
| if (tz === 'Z') return { body, tz: 'Z' }; | ||
| if (tz === 'Z') return { body, offsetMin: 0, hadTz: true }; | ||
| const h = parseInt(tz.slice(1, 3), 10); | ||
| const mi = parseInt(tz.slice(4, 6), 10); | ||
| if (mi > 59 || h * 60 + mi > 840) throw new Error(`invalid tz: ${tz}`); | ||
| return { body, tz: tz === '+00:00' || tz === '-00:00' ? 'Z' : tz }; | ||
| const mag = h * 60 + mi; | ||
| return { body, offsetMin: tz[0] === '-' ? -mag : mag, hadTz: true }; | ||
| } | ||
|
|
||
| // Normalize a fractional-seconds group ('.ddd' or undefined): strip trailing | ||
| // zeros, drop entirely if it becomes empty. | ||
| // Normalize a fractional-seconds group ('.ddd' or undefined): TRUNCATE to at most | ||
| // 3 digits (milliseconds — the backend-independent precision floor; a lossy store | ||
| // such as Blazegraph keeps only ms), then strip trailing zeros; drop entirely if | ||
| // empty. Truncate, NOT round (matches Blazegraph). (OT-RFC-57) | ||
| function normFrac(frac: string | undefined): string { | ||
| if (frac === undefined) return ''; | ||
| const d = frac.slice(1).replace(/0+$/, ''); | ||
| const d = frac.slice(1, 4).replace(/0+$/, ''); // at most 3 digits, then strip trailing zeros | ||
| return d === '' ? '' : `.${d}`; | ||
| } | ||
|
|
||
|
|
@@ -396,14 +431,21 @@ function validateClock(hh: number, mi: number, ss: number, fracNorm: string): { | |
| return { rolls: false }; | ||
| } | ||
|
|
||
| // A valid XSD year is EXACTLY 4 digits (leading zeros allowed) OR 5+ digits with | ||
| // NO leading zero. oxigraph rejects a leading-zero 5+-digit year (e.g. 09508) and | ||
| // keeps the whole literal verbatim — so we must too, or we'd normalize tz/fraction | ||
| // on a literal oxigraph leaves untouched. | ||
| const YEAR = '-?(?:\\d{4}|[1-9]\\d{4,})'; | ||
|
|
||
| // OT-RFC-57: the backend-independent value canon accepts any 4+-digit year (any | ||
| // number of leading zeros) and normalizes it via BigInt+fmtYear (min-4-digit, no | ||
| // leading zero). This matches Blazegraph, which on write STRIPS a leading-zero | ||
| // year to its value ("02026"^^gYear → "2026") — oxigraph instead keeps the invalid | ||
| // literal verbatim, but the CONVERGENCE oracle holds either way since canon(input) | ||
| // and canon(store-readback) both fold to the same value form (OT-RFC-57 §7.5). | ||
| const YEAR = '-?\\d{4,}'; | ||
|
|
||
| // OT-RFC-57 backend-independent form: normalize to UTC (subtract the tz offset, | ||
| // rolling the DATE across midnight), truncate fraction to ms, always emit Z. A | ||
| // no-timezone dateTime is treated as UTC and gains a Z (matching Blazegraph / | ||
| // Neptune). This is the value-space form the publisher's input AND every | ||
| // backend's read-back converge to. | ||
| function canonDateTime(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body, offsetMin } = splitTzToOffset(lex); | ||
| const m = new RegExp(`^(${YEAR})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})(\\.\\d+)?$`).exec(body); | ||
| if (!m) throw new Error('invalid xsd:dateTime'); | ||
| const [, yy, mo, dd, hh, mi, ss, frac] = m; | ||
|
|
@@ -413,90 +455,95 @@ function canonDateTime(lex: string): string { | |
| if (ddN < 1 || ddN > daysInMonth(yy, moN)) throw new Error('day'); | ||
| if (!temporalInRange(yy, moN, ddN, +hh, +mi, +ss)) throw new Error('year overflows i128 seconds'); | ||
| const fracNorm = normFrac(frac); | ||
| if (fracNorm.length - 1 > 18) throw new Error('sub-1e-18 seconds'); // oxigraph stores ≤18 frac digits | ||
| const { rolls } = validateClock(+hh, +mi, +ss, fracNorm); | ||
| if (rolls) { | ||
| return `${rollNextDay(yy, moN, ddN)}T00:${mi}:${ss}${fracNorm}${tz}`; | ||
| } | ||
| // KNOWN oxigraph 0.5.5 DEFECT (documented, NOT mirrored): a BEFORE-EPOCH dateTime | ||
| // (before 0001-01-01T00:00:00, i.e. year ≤ 0000) with seconds == 59 AND a non-zero | ||
| // fraction has its minute bumped by +1 on every load→serialize round-trip. Far | ||
| // before the epoch it never stabilises (-1711-…T15:19:59.6 → :20:59.6 → :21:59.6 → | ||
| // …); near it the bump just crosses into year 0001 once. Either way the store has | ||
| // no stable form for these, so no canonicalization can make them consensus-safe. | ||
| // We deliberately do NOT replicate the bump: canon stays DETERMINISTIC + IDEMPOTENT | ||
| // (the best achievable), normalising tz/fraction like any other dateTime and | ||
| // leaving the wall-clock untouched. Residual exposure = a pre-existing oxigraph | ||
| // storage defect for an essentially-nonexistent input class (BCE / year-0 timestamps | ||
| // at :59 with sub-second precision) — escalated to the store layer, off this canon. | ||
| return `${normYear(yy)}-${mo}-${dd}T${hh}:${mi}:${ss}${fracNorm}${tz}`; | ||
| // Base date as a day count; a T24:00 clock rolls one day and resets the hour to 0. | ||
| let days = daysFromCivil(BigInt(yy), BigInt(moN), BigInt(ddN)); | ||
| const hourN = rolls ? 0 : +hh; | ||
| if (rolls) days += 1n; | ||
| // UTC: subtract the offset (whole minutes); roll the date across midnight. | ||
| const totalMin = hourN * 60 + +mi - offsetMin; | ||
| days += BigInt(Math.floor(totalMin / 1440)); | ||
| const minInDay = ((totalMin % 1440) + 1440) % 1440; | ||
| const { y, m: mm, d } = civilFromDays(days); | ||
| return `${fmtYear(y)}-${pad2(Number(mm))}-${pad2(Number(d))}T${pad2(Math.floor(minInDay / 60))}:${pad2(minInDay % 60)}:${ss}${fracNorm}Z`; | ||
| } | ||
|
|
||
| // OT-RFC-57: time has no date, so a tz offset just wraps the wall clock mod 24h; | ||
| // normalize to UTC + Z, ms-truncated. | ||
| function canonTime(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body, offsetMin } = splitTzToOffset(lex); | ||
| const m = /^(\d{2}):(\d{2}):(\d{2})(\.\d+)?$/.exec(body); | ||
| if (!m) throw new Error('invalid xsd:time'); | ||
| const [, hh, mi, ss, frac] = m; | ||
| const fracNorm = normFrac(frac); | ||
| if (fracNorm.length - 1 > 18) throw new Error('sub-1e-18 seconds'); | ||
| const { rolls } = validateClock(+hh, +mi, +ss, fracNorm); | ||
| // time has no date to roll; hour 24 → 00 of the same wall clock. | ||
| return `${rolls ? '00' : hh}:${mi}:${ss}${fracNorm}${tz}`; | ||
| const hourN = rolls ? 0 : +hh; | ||
| const minInDay = (((hourN * 60 + +mi - offsetMin) % 1440) + 1440) % 1440; | ||
| return `${pad2(Math.floor(minInDay / 60))}:${pad2(minInDay % 60)}:${ss}${fracNorm}Z`; | ||
| } | ||
|
|
||
| // OT-RFC-57: xsd:date / gYear / gYearMonth normalize to the UTC date of | ||
| // midnight-in-tz, with NO timezone emitted (Blazegraph's value form). | ||
| function canonDate(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body, offsetMin } = splitTzToOffset(lex); | ||
| const m = new RegExp(`^(${YEAR})-(\\d{2})-(\\d{2})$`).exec(body); | ||
| if (!m) throw new Error('invalid xsd:date'); | ||
| const moN = +m[2]; | ||
| const ddN = +m[3]; | ||
| if (moN < 1 || moN > 12) throw new Error('month'); | ||
| if (ddN < 1 || ddN > daysInMonth(m[1], moN)) throw new Error('day'); | ||
| if (!temporalInRange(m[1], moN, ddN)) throw new Error('year overflows i128 seconds'); | ||
| return `${normYear(m[1])}-${m[2]}-${m[3]}${tz}`; | ||
| const { y, m: mm, d } = utcDateFromMidnight(BigInt(m[1]), BigInt(moN), BigInt(ddN), offsetMin); | ||
| return `${fmtYear(y)}-${pad2(Number(mm))}-${pad2(Number(d))}`; | ||
| } | ||
|
|
||
| function canonGYear(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body, offsetMin } = splitTzToOffset(lex); | ||
| if (!new RegExp(`^${YEAR}$`).test(body)) throw new Error('invalid xsd:gYear'); | ||
| if (!temporalInRange(body, 1, 1)) throw new Error('year overflows i128 seconds'); | ||
| return `${normYear(body)}${tz}`; | ||
| const { y } = utcDateFromMidnight(BigInt(body), 1n, 1n, offsetMin); | ||
| return fmtYear(y); | ||
| } | ||
|
|
||
| function canonGYearMonth(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body, offsetMin } = splitTzToOffset(lex); | ||
| const m = new RegExp(`^(${YEAR})-(\\d{2})$`).exec(body); | ||
| if (!m || +m[2] < 1 || +m[2] > 12) throw new Error('invalid xsd:gYearMonth'); | ||
| if (!temporalInRange(m[1], +m[2], 1)) throw new Error('year overflows i128 seconds'); | ||
| return `${normYear(m[1])}-${m[2]}${tz}`; | ||
| const { y, m: mm } = utcDateFromMidnight(BigInt(m[1]), BigInt(+m[2]), 1n, offsetMin); | ||
| return `${fmtYear(y)}-${pad2(Number(mm))}`; | ||
| } | ||
|
|
||
| // gMonthDay day bounds. oxigraph 0.5.5 validates --MM-DD against a NON-leap | ||
| // reference year, so --02-29 is rejected (kept verbatim) — February's max is 28 | ||
| // here, unlike a real leap date which needs the year context of xsd:date. | ||
| const MONTH_MAX_DAY = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]; | ||
| // OT-RFC-57: gMonthDay / gMonth / gDay have no year to convert, so a timezone is | ||
| // just STRIPPED (Blazegraph's value form). NB the oracle battery only exercises | ||
| // Z/+00:00 here; a non-UTC offset on these bare types is undefined across backends | ||
| // and not consensus-verified — see OT-RFC-57 §7.8. | ||
| function canonGMonthDay(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body } = splitTzToOffset(lex); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 Bug: Non-UTC offsets are silently stripped from bare Gregorian types What's wrong Example Suggested direction For Agents Separate the bare-gregorian timezone policy from UTC normalization What's wrong Example Suggested direction For Agents There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 Issue: Timezone normalization is split across several hand-rolled paths What's wrong Example Suggested direction For Agents |
||
| const m = /^--(\d{2})-(\d{2})$/.exec(body); | ||
| if (!m) throw new Error('invalid xsd:gMonthDay'); | ||
| const moN = +m[1]; | ||
| const ddN = +m[2]; | ||
| if (moN < 1 || moN > 12 || ddN < 1 || ddN > MONTH_MAX_DAY[moN - 1]) throw new Error('range'); | ||
| return `${body}${tz}`; | ||
| return body; | ||
| } | ||
|
|
||
| function canonGMonth(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body } = splitTzToOffset(lex); | ||
| const m = /^--(\d{2})$/.exec(body); | ||
| if (!m || +m[1] < 1 || +m[1] > 12) throw new Error('invalid xsd:gMonth'); | ||
| return `${body}${tz}`; | ||
| return body; | ||
| } | ||
|
|
||
| function canonGDay(lex: string): string { | ||
| const { body, tz } = splitTz(lex); | ||
| const { body } = splitTzToOffset(lex); | ||
| const m = /^---(\d{2})$/.exec(body); | ||
| if (!m || +m[1] < 1 || +m[1] > 31) throw new Error('invalid xsd:gDay'); | ||
| return `${body}${tz}`; | ||
| return body; | ||
| } | ||
|
|
||
| // ── xsd:duration / dayTimeDuration / yearMonthDuration ───────────────────────── | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 Bug: Timezone folding happens after the overflow guard
What's wrong
The overflow guard is meant to prevent canonicalizing temporal values that the store cannot parse stably. Because the new UTC conversion runs after that guard, boundary literals can pass validation and then be shifted outside the supported range, producing a protocol leaf for a value the storage backend may reject or preserve differently.
Example
`"5391559471919-03-30T14:00:00-14:00"^^<http://www.w3.org/2001/XMLSchema#dateTime>` is still within the checked local i128 seconds range, but subtracting `-14:00` emits `5391559471919-03-31T04:00:00Z`, which is past the max representable second. Expected behavior is to leave an overflowed temporal literal verbatim rather than normalize it into an unrepresentable UTC value.
Suggested direction
Apply timezone/T24 normalization before the i128 range decision, or include the offset and roll in the range calculation; if the normalized value is outside the supported store range, fall back to verbatim.
Confidence note
This follows from the code's own i128 range invariant; the exact store behavior at the far boundary should be confirmed, but the canonicalizer now emits a UTC value outside the range it just validated against.
For Agents
In `packages/core/src/crypto/term-canon.ts`, update `canonDateTime` and the date/gYear/gYearMonth paths to validate the normalized UTC instant/date, not only the original lexical components. Preserve normal timezone folding, and add max/min boundary tests where the offset or T24 roll crosses the i128 limit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 Bug: Sub-millisecond fractions can make invalid hour-24 times roll into valid leaves
What's wrong
The new millisecond truncation runs before `validateClock`. For hour 24, validity depends on whether the original seconds value including fraction is zero. A non-zero sub-millisecond fraction is truncated away, so invalid literals are treated as valid and normalized into a UTC leaf, collapsing distinct invalid inputs into the same hashable value instead of preserving them verbatim.
Example
`"2026-06-29T24:12:00.0005"^^<http://www.w3.org/2001/XMLSchema#dateTime>` has a non-zero seconds fraction with a non-zero minute, so the hour-24 form should be kept verbatim/rejected by the temporal validator. With the new code, `.0005` is truncated to an empty millisecond fraction before `validateClock`, so it rolls and hashes as `"2026-06-30T00:12:00Z"^^<...#dateTime>`. The same applies to `"24:12:00.0005"^^<...#time>`.
Suggested direction
Separate lexical validity from output precision: decide whether hour 24 is rollable using the original fractional seconds, then truncate only after the value has passed validation.
For Agents
In `packages/core/src/crypto/term-canon.ts`, validate the hour-24 rule against the raw fractional seconds value, or preserve a boolean for whether the original fraction was numerically non-zero, before applying millisecond truncation. Add dateTime and time cases with `24:MM:00.0005` where `MM != 00`, proving they stay verbatim while valid millisecond truncation still works for ordinary times.