Skip to content
Merged
7 changes: 6 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,12 @@ jobs:
# Stock image ships the default `kb` namespace; that's all the
# adapter needs (it isolates per-test via unique GRAPH IRIs).
BLAZEGRAPH_TEST_URL: http://127.0.0.1:9999/bigdata/namespace/kb/sparql
run: pnpm --filter @origintrail-official/dkg-storage exec vitest run test/blazegraph.integration.test.ts
# term-canon-blazegraph-oracle: cross-backend V10 leaf agreement (OT-RFC-57)
# — proves an oxigraph node and a Blazegraph node compute the SAME merkle
# leaf for the same typed literal (else RandomSampling forks). dateTime/time
# are fixed here; date/gregorian/double/escaping remain it.fails pending
# the rest of the backend-independent canon.
run: pnpm --filter @origintrail-official/dkg-storage exec vitest run test/blazegraph.integration.test.ts test/term-canon-blazegraph-oracle.test.ts

# ------------------------------------------------------------------
# Tornado publisher lane — sharded across 4 parallel runners.
Expand Down
157 changes: 102 additions & 55 deletions packages/core/src/crypto/term-canon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,6 @@ function decodeIriEscapes(iri: string): string {
});
}

// oxigraph normalizes the "negative zero" year -0000 to 0000. (Only -0000 reaches
// here: a leading-zero 5+-digit negative year fails the YEAR pattern → verbatim.)
const normYear = (yy: string) => (yy === '-0000' ? '0000' : yy);

// oxigraph stores temporal values as seconds-since-0001-01-01 in the same i128/1e18
// Decimal as xsd:decimal/duration. A date/time whose scaled seconds overflow i128
// fails to parse and is kept VERBATIM, so a foldable timezone / T24 roll / fraction
Expand All @@ -177,6 +173,37 @@ function daysFromCivil(y: bigint, m: bigint, d: bigint): bigint {
const doe = yoe * 365n + yoe / 4n - yoe / 100n + doy;
return era * 146097n + doe - 719468n;
}
/**
 * Inverse of daysFromCivil: turn a signed day count (days since 1970-01-01)
 * into a proleptic-Gregorian civil date. This is Howard Hinnant's
 * `civil_from_days`, done entirely in BigInt so arbitrarily large years work.
 * Used to roll the DATE when a timezone offset pushes a value across midnight
 * during the backend-independent UTC normalization (OT-RFC-57).
 *
 * @param zIn days relative to 1970-01-01 (may be negative)
 * @returns year `y` (astronomical numbering), month `m` in [1, 12], day `d` in [1, 31]
 */
function civilFromDays(zIn: bigint): { y: bigint; m: bigint; d: bigint } {
  const DAYS_PER_ERA = 146097n; // one 400-year Gregorian cycle
  const shifted = zIn + 719468n; // re-base so day 0 is 0000-03-01
  // BigInt division truncates toward zero; bias negatives so the quotient floors.
  const era = (shifted < 0n ? shifted - (DAYS_PER_ERA - 1n) : shifted) / DAYS_PER_ERA;
  const dayOfEra = shifted - era * DAYS_PER_ERA; // [0, 146096]
  const yearOfEra =
    (dayOfEra - dayOfEra / 1460n + dayOfEra / 36524n - dayOfEra / 146096n) / 365n; // [0, 399]
  const dayOfYear = dayOfEra - (365n * yearOfEra + yearOfEra / 4n - yearOfEra / 100n); // [0, 365]
  const marchMonth = (5n * dayOfYear + 2n) / 153n; // [0, 11], 0 = March
  const day = dayOfYear - (153n * marchMonth + 2n) / 5n + 1n; // [1, 31]
  const month = marchMonth < 10n ? marchMonth + 3n : marchMonth - 9n; // [1, 12]
  const marchBasedYear = yearOfEra + era * 400n;
  // The internal year starts in March, so January/February belong to the next civil year.
  return { y: month <= 2n ? marchBasedYear + 1n : marchBasedYear, m: month, d: day };
}

/**
 * OT-RFC-57: the UTC calendar date of "midnight in the given timezone" — the
 * backend-independent form for xsd:date / gYear / gYearMonth. The value is
 * taken at 00:00 local, shifted to UTC, and only the UTC date is kept, so a
 * positive offset rolls the date back one day. offsetMin = 0 (Z / no timezone)
 * leaves the date unchanged.
 *
 * @param y year
 * @param mo month, 1-12
 * @param d day of month
 * @param offsetMin signed timezone offset in minutes (east of UTC is positive)
 * @returns the civil date of that instant in UTC
 */
function utcDateFromMidnight(
  y: bigint,
  mo: bigint,
  d: bigint,
  offsetMin: number,
): { y: bigint; m: bigint; d: bigint } {
  const localDays = daysFromCivil(y, mo, d);
  // Local midnight sits (0 - offsetMin) minutes from UTC midnight; flooring to
  // whole days gives -1 for any positive offset and 0 otherwise (|offset| < 1 day).
  const minutesToUtc = 0 - offsetMin;
  const dayShift = BigInt(Math.floor(minutesToUtc / 1440));
  return civilFromDays(localDays + dayShift);
}

function temporalInRange(yearStr: string, mo: number, dd: number, hh = 0, mi = 0, ss = 0): boolean {
const seconds =
(daysFromCivil(BigInt(yearStr), BigInt(mo), BigInt(dd)) + 719162n) * 86400n +
Expand Down Expand Up @@ -225,7 +252,11 @@ function canonDouble(lex: string, isFloat: boolean): string {
if (Number.isNaN(n)) return 'NaN';
if (n === Infinity) return 'INF';
if (n === -Infinity) return '-INF';
if (n === 0) return Object.is(n, -0) ? '-0' : '0';
// OT-RFC-57: negative zero folds to "0". Blazegraph drops the sign on write
// ("-0.0"^^double → stored "0.0" → value 0), while oxigraph keeps "-0"; emitting
// "0" for both signed zeros makes canon(input) == canon(store-readback) on either
// backend. (The IEEE-754 -0/+0 distinction is not consensus-observable here.)
if (n === 0) return '0';
const neg = n < 0;
const a = Math.abs(n);
// double: V8's a.toString() IS the shortest round-trip; only ties need the
Expand Down Expand Up @@ -329,27 +360,31 @@ function stripTrailingZeros(s: string): string {
}

// ── date/time family ───────────────────────────────────────────────────────────
/**
 * Split a trailing timezone off a temporal lexical form and return it as a
 * SIGNED offset in minutes, for the backend-independent UTC normalization of
 * the date/time family (OT-RFC-57).
 *
 * Accepted timezone forms are `Z` or `±HH:MM` with MM ≤ 59 and a total
 * |offset| ≤ 14:00 (840 minutes). Anything that does not match the `±HH:MM`
 * shape (e.g. a malformed `+0:00`) is left in `body`, where the per-type
 * grammar of the caller then rejects it.
 *
 * @param s lexical form, possibly ending in a timezone
 * @returns `body` (input without the timezone), `offsetMin` (0 for `Z`, for a
 *   zero offset of either sign, and when no timezone is present) and `hadTz`
 *   (false only when no timezone was present)
 * @throws Error when the timezone is well-shaped but out of range; per the
 *   file's convention the caller then keeps the literal verbatim
 */
function splitTzToOffset(s: string): { body: string; offsetMin: number; hadTz: boolean } {
  const m = /(Z|[+-]\d{2}:\d{2})$/.exec(s);
  if (!m) return { body: s, offsetMin: 0, hadTz: false };
  const tz = m[1];
  const body = s.slice(0, s.length - tz.length);
  if (tz === 'Z') return { body, offsetMin: 0, hadTz: true };
  const h = parseInt(tz.slice(1, 3), 10);
  const mi = parseInt(tz.slice(4, 6), 10);
  const mag = h * 60 + mi;
  if (mi > 59 || mag > 840) throw new Error(`invalid tz: ${tz}`);
  // "-00:00" must yield +0, never -0: a negative zero would leak into callers'
  // arithmetic and into Object.is / strict-equality comparisons.
  const offsetMin = mag === 0 ? 0 : tz[0] === '-' ? -mag : mag;
  return { body, offsetMin, hadTz: true };
}

/**
 * Normalize a fractional-seconds group for output (OT-RFC-57).
 *
 * The fraction is TRUNCATED (not rounded) to at most 3 digits — milliseconds,
 * the backend-independent precision floor, since a lossy store such as
 * Blazegraph keeps only ms — then trailing zeros are stripped.
 *
 * @param frac the regex group including its leading dot (e.g. '.1239'), or
 *   undefined when the literal had no fraction
 * @returns '.d', '.dd' or '.ddd' with no trailing zero, or '' when nothing
 *   non-zero remains at millisecond precision
 */
function normFrac(frac: string | undefined): string {
  if (frac === undefined) return '';
  // slice(1, 4): drop the dot and keep at most the three millisecond digits.
  const ms = frac.slice(1, 4).replace(/0+$/, '');
  return ms === '' ? '' : `.${ms}`;
}

Expand Down Expand Up @@ -396,14 +431,21 @@ function validateClock(hh: number, mi: number, ss: number, fracNorm: string): {
return { rolls: false };
}

// OT-RFC-57: year grammar of the backend-independent value canon — an optional
// minus sign followed by 4 OR MORE digits, with any number of leading zeros
// accepted. Callers normalize the matched year through BigInt + fmtYear.
// Rationale (per OT-RFC-57 §7.5): Blazegraph strips a leading-zero year to its
// value on write ("02026"^^gYear → "2026") while oxigraph keeps such a literal
// verbatim; accepting it here lets canon(input) and canon(store-readback) fold
// to the same value form on either backend.
const YEAR = '-?\\d{4,}';

// OT-RFC-57 backend-independent form: normalize to UTC (subtract the tz offset,
// rolling the DATE across midnight), truncate fraction to ms, always emit Z. A
// no-timezone dateTime is treated as UTC and gains a Z (matching Blazegraph /
// Neptune). This is the value-space form the publisher's input AND every
// backend's read-back converge to.
function canonDateTime(lex: string): string {
const { body, tz } = splitTz(lex);
const { body, offsetMin } = splitTzToOffset(lex);
const m = new RegExp(`^(${YEAR})-(\\d{2})-(\\d{2})T(\\d{2}):(\\d{2}):(\\d{2})(\\.\\d+)?$`).exec(body);
if (!m) throw new Error('invalid xsd:dateTime');
const [, yy, mo, dd, hh, mi, ss, frac] = m;
Expand All @@ -413,90 +455,95 @@ function canonDateTime(lex: string): string {
if (ddN < 1 || ddN > daysInMonth(yy, moN)) throw new Error('day');
if (!temporalInRange(yy, moN, ddN, +hh, +mi, +ss)) throw new Error('year overflows i128 seconds');
const fracNorm = normFrac(frac);
if (fracNorm.length - 1 > 18) throw new Error('sub-1e-18 seconds'); // oxigraph stores ≤18 frac digits
const { rolls } = validateClock(+hh, +mi, +ss, fracNorm);
if (rolls) {
return `${rollNextDay(yy, moN, ddN)}T00:${mi}:${ss}${fracNorm}${tz}`;
}
// KNOWN oxigraph 0.5.5 DEFECT (documented, NOT mirrored): a BEFORE-EPOCH dateTime
// (before 0001-01-01T00:00:00, i.e. year ≤ 0000) with seconds == 59 AND a non-zero
// fraction has its minute bumped by +1 on every load→serialize round-trip. Far
// before the epoch it never stabilises (-1711-…T15:19:59.6 → :20:59.6 → :21:59.6 →
// …); near it the bump just crosses into year 0001 once. Either way the store has
// no stable form for these, so no canonicalization can make them consensus-safe.
// We deliberately do NOT replicate the bump: canon stays DETERMINISTIC + IDEMPOTENT
// (the best achievable), normalising tz/fraction like any other dateTime and
// leaving the wall-clock untouched. Residual exposure = a pre-existing oxigraph
// storage defect for an essentially-nonexistent input class (BCE / year-0 timestamps
// at :59 with sub-second precision) — escalated to the store layer, off this canon.
return `${normYear(yy)}-${mo}-${dd}T${hh}:${mi}:${ss}${fracNorm}${tz}`;
// Base date as a day count; a T24:00 clock rolls one day and resets the hour to 0.
let days = daysFromCivil(BigInt(yy), BigInt(moN), BigInt(ddN));
const hourN = rolls ? 0 : +hh;
if (rolls) days += 1n;
// UTC: subtract the offset (whole minutes); roll the date across midnight.
const totalMin = hourN * 60 + +mi - offsetMin;

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Bug: Timezone folding happens after the overflow guard

What's wrong
The overflow guard is meant to prevent canonicalizing temporal values that the store cannot parse stably. Because the new UTC conversion runs after that guard, boundary literals can pass validation and then be shifted outside the supported range, producing a protocol leaf for a value the storage backend may reject or preserve differently.

Example
"5391559471919-03-30T14:00:00-14:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> is still within the checked local i128 seconds range, but subtracting -14:00 emits 5391559471919-03-31T04:00:00Z, which is past the max representable second. Expected behavior is to leave an overflowed temporal literal verbatim rather than normalize it into an unrepresentable UTC value.

Suggested direction
Apply timezone/T24 normalization before the i128 range decision, or include the offset and roll in the range calculation; if the normalized value is outside the supported store range, fall back to verbatim.

Confidence note
This follows from the code's own i128 range invariant; the exact store behavior at the far boundary should be confirmed, but the canonicalizer now emits a UTC value outside the range it just validated against.

For Agents
In packages/core/src/crypto/term-canon.ts, update canonDateTime and the date/gYear/gYearMonth paths to validate the normalized UTC instant/date, not only the original lexical components. Preserve normal timezone folding, and add max/min boundary tests where the offset or T24 roll crosses the i128 limit.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Bug: Sub-millisecond fractions can make invalid hour-24 times roll into valid leaves

What's wrong
The new millisecond truncation runs before validateClock. For hour 24, validity depends on whether the original seconds value including fraction is zero. A non-zero sub-millisecond fraction is truncated away, so invalid literals are treated as valid and normalized into a UTC leaf, collapsing distinct invalid inputs into the same hashable value instead of preserving them verbatim.

Example
"2026-06-29T24:12:00.0005"^^<http://www.w3.org/2001/XMLSchema#dateTime> has a non-zero seconds fraction with a non-zero minute, so the hour-24 form should be kept verbatim/rejected by the temporal validator. With the new code, .0005 is truncated to an empty millisecond fraction before validateClock, so it rolls and hashes as "2026-06-30T00:12:00Z"^^<...#dateTime>. The same applies to "24:12:00.0005"^^<...#time>.

Suggested direction
Separate lexical validity from output precision: decide whether hour 24 is rollable using the original fractional seconds, then truncate only after the value has passed validation.

For Agents
In packages/core/src/crypto/term-canon.ts, validate the hour-24 rule against the raw fractional seconds value, or preserve a boolean for whether the original fraction was numerically non-zero, before applying millisecond truncation. Add dateTime and time cases with 24:MM:00.0005 where MM != 00 proving they stay verbatim while valid millisecond truncation still works for ordinary times.

days += BigInt(Math.floor(totalMin / 1440));
const minInDay = ((totalMin % 1440) + 1440) % 1440;
const { y, m: mm, d } = civilFromDays(days);
return `${fmtYear(y)}-${pad2(Number(mm))}-${pad2(Number(d))}T${pad2(Math.floor(minInDay / 60))}:${pad2(minInDay % 60)}:${ss}${fracNorm}Z`;
}

/**
 * Canonicalize an xsd:time lexical form to the backend-independent value form
 * (OT-RFC-57): wall clock shifted to UTC, fraction truncated to milliseconds,
 * always suffixed with `Z`. A time has no date, so the timezone shift simply
 * wraps modulo 24h, and a rollable hour 24 becomes hour 00.
 *
 * @param lex lexical form, e.g. '13:20:00.1234+02:00'
 * @returns canonical form, e.g. '11:20:00.123Z'
 * @throws Error when the literal is not a valid xsd:time (bad shape, bad
 *   timezone, or a clock rejected by validateClock); per the file's convention
 *   the caller then keeps the literal verbatim
 */
function canonTime(lex: string): string {
  const { body, offsetMin } = splitTzToOffset(lex);
  const m = /^(\d{2}):(\d{2}):(\d{2})(\.\d+)?$/.exec(body);
  if (!m) throw new Error('invalid xsd:time');
  const [, hh, mi, ss, frac] = m;
  // Lexical validity is decided on the FULL-precision fraction (trailing zeros
  // stripped only). Validating the ms-truncated fraction would let a non-zero
  // sub-millisecond value such as 24:12:00.0005 look like zero seconds and be
  // rolled into a valid leaf instead of being rejected.
  const fullFracDigits = frac === undefined ? '' : frac.slice(1).replace(/0+$/, '');
  const { rolls } = validateClock(+hh, +mi, +ss, fullFracDigits === '' ? '' : `.${fullFracDigits}`);
  // Output precision is applied only after the clock has passed validation.
  const fracNorm = normFrac(frac);
  const hourN = rolls ? 0 : +hh;
  // Double modulo keeps the result in [0, 1439] for negative intermediate values.
  const minInDay = (((hourN * 60 + +mi - offsetMin) % 1440) + 1440) % 1440;
  return `${pad2(Math.floor(minInDay / 60))}:${pad2(minInDay % 60)}:${ss}${fracNorm}Z`;
}

// OT-RFC-57: xsd:date / gYear / gYearMonth normalize to the UTC date of
// midnight-in-tz, with NO timezone emitted (Blazegraph's value form).
/**
 * Canonicalize an xsd:date to the UTC date of midnight-in-timezone, emitted
 * WITHOUT a timezone (OT-RFC-57).
 *
 * @param lex lexical form, e.g. '2026-06-29+02:00'
 * @returns canonical form, e.g. '2026-06-28'
 * @throws Error on an invalid shape, month, day or timezone, or when either the
 *   written date or its UTC-normalized date fails temporalInRange; per the
 *   file's convention the caller then keeps the literal verbatim
 */
function canonDate(lex: string): string {
  const { body, offsetMin } = splitTzToOffset(lex);
  const m = new RegExp(`^(${YEAR})-(\\d{2})-(\\d{2})$`).exec(body);
  if (!m) throw new Error('invalid xsd:date');
  const [, yy, mo, dd] = m;
  const moN = +mo;
  const ddN = +dd;
  if (moN < 1 || moN > 12) throw new Error('month');
  if (ddN < 1 || ddN > daysInMonth(yy, moN)) throw new Error('day');
  if (!temporalInRange(yy, moN, ddN)) throw new Error('year overflows i128 seconds');
  const { y, m: mm, d } = utcDateFromMidnight(BigInt(yy), BigInt(moN), BigInt(ddN), offsetMin);
  // The timezone shift can move a boundary date past the limit that was just
  // checked on the local components, so the date actually emitted is checked too.
  if (!temporalInRange(y.toString(), Number(mm), Number(d))) {
    throw new Error('year overflows i128 seconds');
  }
  return `${fmtYear(y)}-${pad2(Number(mm))}-${pad2(Number(d))}`;
}

/**
 * Canonicalize an xsd:gYear to the year of the UTC date of
 * midnight-January-1st-in-timezone, emitted WITHOUT a timezone (OT-RFC-57).
 * A positive offset therefore yields the PREVIOUS year.
 *
 * @param lex lexical form, e.g. '2026' or '2026Z'
 * @returns the year formatted by fmtYear
 * @throws Error on an invalid shape or timezone, or when either the written
 *   year or the UTC-normalized date fails temporalInRange; per the file's
 *   convention the caller then keeps the literal verbatim
 */
function canonGYear(lex: string): string {
  const { body, offsetMin } = splitTzToOffset(lex);
  if (!new RegExp(`^${YEAR}$`).test(body)) throw new Error('invalid xsd:gYear');
  if (!temporalInRange(body, 1, 1)) throw new Error('year overflows i128 seconds');
  const { y, m, d } = utcDateFromMidnight(BigInt(body), 1n, 1n, offsetMin);
  // Re-check the shifted date: the range guard above only saw the local year.
  if (!temporalInRange(y.toString(), Number(m), Number(d))) {
    throw new Error('year overflows i128 seconds');
  }
  return fmtYear(y);
}

/**
 * Canonicalize an xsd:gYearMonth to the year-month of the UTC date of
 * midnight-on-the-1st-in-timezone, emitted WITHOUT a timezone (OT-RFC-57).
 * A positive offset therefore yields the PREVIOUS month.
 *
 * @param lex lexical form, e.g. '2026-06+02:00'
 * @returns canonical form, e.g. '2026-05'
 * @throws Error on an invalid shape, month or timezone, or when either the
 *   written value or the UTC-normalized date fails temporalInRange; per the
 *   file's convention the caller then keeps the literal verbatim
 */
function canonGYearMonth(lex: string): string {
  const { body, offsetMin } = splitTzToOffset(lex);
  const m = new RegExp(`^(${YEAR})-(\\d{2})$`).exec(body);
  if (!m || +m[2] < 1 || +m[2] > 12) throw new Error('invalid xsd:gYearMonth');
  const [, yy, mo] = m;
  if (!temporalInRange(yy, +mo, 1)) throw new Error('year overflows i128 seconds');
  const { y, m: mm, d } = utcDateFromMidnight(BigInt(yy), BigInt(+mo), 1n, offsetMin);
  // Re-check the shifted date: the range guard above only saw the local value.
  if (!temporalInRange(y.toString(), Number(mm), Number(d))) {
    throw new Error('year overflows i128 seconds');
  }
  return `${fmtYear(y)}-${pad2(Number(mm))}`;
}

// gMonthDay day bounds. oxigraph 0.5.5 validates --MM-DD against a NON-leap
// reference year, so --02-29 is rejected (kept verbatim) — February's max is 28
// here, unlike a real leap date which needs the year context of xsd:date.
const MONTH_MAX_DAY = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
// OT-RFC-57: gMonthDay / gMonth / gDay have no year to convert, so a timezone is
// just STRIPPED (Blazegraph's value form). NB the oracle battery only exercises
// Z/+00:00 here; a non-UTC offset on these bare types is undefined across backends
// and not consensus-verified — see OT-RFC-57 §7.8.
function canonGMonthDay(lex: string): string {
const { body, tz } = splitTz(lex);
const { body } = splitTzToOffset(lex);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Bug: Non-UTC offsets are silently stripped from bare Gregorian types

What's wrong
For gMonthDay, gMonth, and gDay, the code parses a valid timezone and then returns only body, so every non-zero offset is lost. Since these types lack enough calendar context to roll across a date, stripping the offset can conflate distinct literals and create consensus assumptions the code comment says are not verified.

Example
"--06-29+14:00"^^<http://www.w3.org/2001/XMLSchema#gMonthDay> and "--06-29-14:00"^^<http://www.w3.org/2001/XMLSchema#gMonthDay> both canonicalize to "--06-29"^^<...#gMonthDay>. Those inputs carry different timezone offsets, but the leaf drops that distinction without a date context to convert it safely.

Suggested direction
Only strip absent/zero timezones for these no-year/no-date types unless OT-RFC-57 defines a safe value-space mapping for non-zero offsets; otherwise keep the original lexical form so different values do not collapse to the same leaf.

For Agents
In packages/core/src/crypto/term-canon.ts, look at canonGMonthDay, canonGMonth, and canonGDay. Preserve the existing Z/+00:00 behavior, but leave non-zero offsets verbatim or define an explicit reject/normalization rule; add cases for +14:00 and -14:00 proving distinct or rejected behavior.

Separate the bare-gregorian timezone policy from UTC normalization

What's wrong
The PR introduces a helper whose contract is UTC offset normalization, then uses it in callers that intentionally ignore the offset. That is a boundary smell: future maintainers have to infer from comments that some datatypes normalize to UTC while others strip timezone syntax, including cases the comment says are not consensus-verified.

Example
canonGMonthDay('--06-29+02:00') goes through a UTC-offset parser, then silently drops +02:00 because the caller ignores offsetMin. That makes the timezone policy for partial Gregorian types implicit and easy to accidentally expand.

Suggested direction
Do not reuse splitTzToOffset as a generic stripper. A small typed dispatcher or separate helper per temporal policy would make the invariants visible and remove the current “parse then ignore” coupling.

For Agents
Look in packages/core/src/crypto/term-canon.ts around the date/time family. Preserve the current canonical outputs, but split the timezone handling into explicit policy helpers: one for UTC-normalized types that consumes offsetMin, and one for bare gMonth/gDay/gMonthDay that intentionally strips only the supported timezone forms or names the unsupported policy directly. Add/keep cases proving non-UTC partial-gregorian behavior is deliberate.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Timezone normalization is split across several hand-rolled paths

What's wrong
The PR introduces multiple local implementations of the same temporal normalization concept. For a consensus-critical canonicalizer, this is a structural maintenance risk: future changes to timezone, T24, or range semantics must be mirrored across several branches, and the reader has to re-prove each formula independently.

Example
dateTime, time, date, gYear, and gYearMonth all subtract offsetMin, roll across day/year boundaries, then format and range-check. Today those rules are spread across utcDateFromMidnight, inline dateTime arithmetic, and a time-only modulo path.

Suggested direction
Collapse the duplicated UTC-roll logic into one helper or typed value model, then have each datatype parser feed that helper and format its own lexical shape. That would make the invariant auditable in one place instead of relying on several similar formulas staying aligned.

For Agents
In term-canon.ts, extract a small temporal normalization model/helper that accepts date fields when present, clock fields, T24 rollover state, and offset minutes, then returns normalized date/time fields. Preserve current output for dateTime/time/date/gYear/gYearMonth and keep the existing oracle coverage green.

const m = /^--(\d{2})-(\d{2})$/.exec(body);
if (!m) throw new Error('invalid xsd:gMonthDay');
const moN = +m[1];
const ddN = +m[2];
if (moN < 1 || moN > 12 || ddN < 1 || ddN > MONTH_MAX_DAY[moN - 1]) throw new Error('range');
return `${body}${tz}`;
return body;
}

/**
 * Canonicalize an xsd:gMonth ('--MM'). There is no year or day to convert, so a
 * valid timezone is deliberately STRIPPED rather than applied (OT-RFC-57); the
 * parsed offset is intentionally unused.
 * NOTE(review): this also drops non-zero offsets, so '--06+14:00' and
 * '--06-14:00' share a leaf; the file's comments mark that case as not
 * consensus-verified (OT-RFC-57 §7.8).
 *
 * @param lex lexical form, e.g. '--06Z'
 * @returns the month without timezone, e.g. '--06'
 * @throws Error on an invalid shape, month or timezone
 */
function canonGMonth(lex: string): string {
  const { body } = splitTzToOffset(lex);
  const m = /^--(\d{2})$/.exec(body);
  if (!m || +m[1] < 1 || +m[1] > 12) throw new Error('invalid xsd:gMonth');
  return body;
}

/**
 * Canonicalize an xsd:gDay ('---DD'). There is no year or month to convert, so
 * a valid timezone is deliberately STRIPPED rather than applied (OT-RFC-57);
 * the parsed offset is intentionally unused.
 * NOTE(review): this also drops non-zero offsets, so '---15+14:00' and
 * '---15-14:00' share a leaf; the file's comments mark that case as not
 * consensus-verified (OT-RFC-57 §7.8).
 *
 * @param lex lexical form, e.g. '---15Z'
 * @returns the day without timezone, e.g. '---15'
 * @throws Error on an invalid shape, a day outside 1-31, or an invalid timezone
 */
function canonGDay(lex: string): string {
  const { body } = splitTzToOffset(lex);
  const m = /^---(\d{2})$/.exec(body);
  if (!m || +m[1] < 1 || +m[1] > 31) throw new Error('invalid xsd:gDay');
  return body;
}

// ── xsd:duration / dayTimeDuration / yearMonthDuration ─────────────────────────
Expand Down
23 changes: 14 additions & 9 deletions packages/publisher/test/term-canon-exhaustive.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,21 @@ async function oxiForms(objects: string[]): Promise<string[]> {
async function proveParity(label: string, objects: string[]): Promise<void> {
const oxi = await oxiForms(objects);
const mismatches: string[] = [];
// forward parity
// OT-RFC-57: the canon is now a backend-INDEPENDENT value canon — for temporal
// types it emits the UTC value form, NOT oxigraph's preserved lexical form. So
// the old "core == oxigraph" identity no longer holds. Assert CONVERGENCE
// (canon(oxigraph_readback) == canon(input)) — the property consensus needs —
// and true idempotence (canon(canon(x)) == canon(x)).
objects.forEach((obj, i) => {
const got = canonicalizeObjectTermForHash(obj);
if (got !== oxi[i]) mismatches.push(`FWD in=${obj}\n core=${got}\n oxi =${oxi[i]}`);
});
// no-migration: core is the identity on oxigraph's own canonical output
oxi.forEach((o) => {
if (o === '(DROPPED)') return;
const re = canonicalizeObjectTermForHash(o);
if (re !== o) mismatches.push(`IDEMPOTENCE BROKEN oxi=${o}\n core(oxi)=${re}`);
if (oxi[i] === '(DROPPED)') return;
const canonIn = canonicalizeObjectTermForHash(obj);
const canonOxi = canonicalizeObjectTermForHash(oxi[i]);
if (canonIn !== canonOxi) {
mismatches.push(`CONVERGENCE in=${obj}\n canon(in) =${canonIn}\n canon(oxi ${oxi[i]})=${canonOxi}`);
}
if (canonicalizeObjectTermForHash(canonIn) !== canonIn) {
mismatches.push(`IDEMPOTENCE BROKEN in=${obj}\n canon=${canonIn}\n canon(canon)=${canonicalizeObjectTermForHash(canonIn)}`);
}
});
if (mismatches.length) {
throw new Error(`${label}: ${mismatches.length} mismatch(es):\n${mismatches.slice(0, 30).join('\n')}`);
Expand Down
30 changes: 24 additions & 6 deletions packages/publisher/test/term-canon-oracle.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,27 @@ async function oxigraphForms(objects: string[]): Promise<string[]> {
return objects.map((_, i) => byPred.get(`urn:p#${i}`) ?? '(DROPPED)');
}

/**
 * OT-RFC-57: the canon is a backend-INDEPENDENT value canon — it no longer
 * reproduces oxigraph's stored lexical form (for temporal types it emits the
 * UTC, ms-truncated form so oxigraph and Blazegraph nodes agree). So this
 * asserts CONVERGENCE, not identity: `canon(oxigraph_readback) === canon(input)`.
 * That is what consensus needs — the publisher (input) and a prover reading
 * from an oxigraph store compute the same leaf.
 *
 * A term the store did not return (the '(DROPPED)' sentinel from
 * oxigraphForms) is reported as a failure in its own right instead of being
 * fed to the canonicalizer as if it were a term.
 *
 * @param objects object terms to load into oxigraph and compare
 * @throws Error listing every diverging or dropped input
 */
async function expectMatchesOxigraph(objects: string[]): Promise<void> {
  const oxi = await oxigraphForms(objects);
  const mismatches: string[] = [];
  objects.forEach((obj, i) => {
    const stored = oxi[i];
    const canonInput = canonicalizeObjectTermForHash(obj);
    if (stored === '(DROPPED)') {
      mismatches.push(` in: ${obj}\n canon(input): ${canonInput}\n oxi-store: (DROPPED)`);
      return;
    }
    const canonOxi = canonicalizeObjectTermForHash(stored);
    if (canonInput !== canonOxi) {
      mismatches.push(` in: ${obj}\n canon(input): ${canonInput}\n canon(oxi-store ${stored}): ${canonOxi}`);
    }
  });
  if (mismatches.length) {
    throw new Error(`canon(input) != canon(oxigraph-readback) — publisher/prover would fork (${mismatches.length}/${objects.length}):\n${mismatches.join('\n')}`);
  }
  // Keeps at least one recorded assertion on the success path.
  expect(mismatches.length).toBe(0);
}
Expand Down Expand Up @@ -322,8 +333,15 @@ describe('term-canon oracle: fuzz-hardened edge classes (#1386)', () => {
const oxi = await oxigraphForms(battery);
for (let i = 0; i < battery.length; i++) {
const once = canonicalizeObjectTermForHash(battery[i]);
expect(canonicalizeObjectTermForHash(once)).toBe(once); // idempotent
if (oxi[i] !== '(DROPPED)') expect(canonicalizeObjectTermForHash(oxi[i])).toBe(oxi[i]); // identity on store output
expect(canonicalizeObjectTermForHash(once)).toBe(once); // idempotent (fixed point)
// OT-RFC-57: the canon is no longer the IDENTITY on oxigraph's output — for
// temporal types it normalizes oxigraph's preserved form to the UTC value
// form (so oxigraph nodes agree with Blazegraph). That is the intended
// oxigraph/devnet migration (mainnet = Blazegraph is unchanged; asserted by
// the Blazegraph oracle). What MUST hold for consensus is CONVERGENCE:
// canon(oxigraph_readback) == canon(input) ⇒ publisher and an oxigraph
// prover compute the same leaf.
if (oxi[i] !== '(DROPPED)') expect(canonicalizeObjectTermForHash(oxi[i])).toBe(once);
}
});

Expand Down
Loading
Loading