From bf2caaea6935b06565a7614863274cef43395e18 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 19:27:01 +0200 Subject: [PATCH 01/12] Chunk oversized schema text literals --- packages/agent/src/dkg-agent-publish.ts | 12 +- .../agent/test/publish-literal-size.test.ts | 58 +- packages/cli/src/daemon/http-utils.ts | 29 +- .../daemon/routes/knowledge-assets-import.ts | 18 +- .../cli/src/daemon/routes/knowledge-assets.ts | 28 +- packages/cli/src/daemon/routes/memory.ts | 36 +- .../daemon/routes/shared-assertion-helpers.ts | 6 - .../test/http-literal-size-validation.test.ts | 47 +- ...ssue-306-787-write-quad-validation.test.ts | 14 +- packages/core/src/index.ts | 27 + packages/core/src/rdf-literal-size.ts | 498 ++++++++++++++++++ packages/core/test/rdf-literal-size.test.ts | 213 ++++++++ packages/publisher/src/dkg-publisher.ts | 27 +- .../test/dkg-publisher-compat.test.ts | 126 ++++- packages/storage/test/blazegraph.unit.test.ts | 25 + packages/storage/test/sparql-http.test.ts | 23 + 16 files changed, 1113 insertions(+), 74 deletions(-) diff --git a/packages/agent/src/dkg-agent-publish.ts b/packages/agent/src/dkg-agent-publish.ts index 801994616..111a30cf3 100644 --- a/packages/agent/src/dkg-agent-publish.ts +++ b/packages/agent/src/dkg-agent-publish.ts @@ -98,6 +98,7 @@ import { sharedMemoryReadBothFilter, partitionCatalogQuads, assertQuadLiteralsMutf8Safe, + normalizeLargeRdfLiteralsForBlazegraph, } from '@origintrail-official/dkg-core'; import { GraphManager, PrivateContentStore, createTripleStore, type TripleStore, type TripleStoreConfig, type Quad, type LargeLiteralStorageConfig } from '@origintrail-official/dkg-storage'; import { EVMChainAdapter, NoChainAdapter, enrichEvmError, buildKnowledgeAssetUal, type EVMAdapterConfig, type ChainAdapter, type CreateContextGraphParams, type CreateOnChainContextGraphParams, type CreateOnChainContextGraphResult, type TxResult, type V10PublishingConvictionAccountInfo } from '@origintrail-official/dkg-chain'; @@ -426,6 +427,10 @@ function rejectOversizedRdfLiterals(quads: Quad[] | undefined, label: string): v assertQuadLiteralsMutf8Safe(quads, { label }); } +function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { + return normalizeLargeRdfLiteralsForBlazegraph(quads, { label }).quads as Quad[]; +} + export class PublishMethods extends DKGAgentBase { async publishWorkspaceGossip(this: DKGAgent, contextGraphId: string, @@ -995,7 +1000,7 @@ export class PublishMethods extends DKGAgentBase { if (publicQuads.length === 0 && privateQuads.length === 0) { throw new InvalidContentError('Content must include at least one public or private payload'); } - rejectOversizedRdfLiterals(publicQuads, 'publishAsync.publicQuads'); + publicQuads = normalizePublicRdfLiterals(publicQuads, 'publishAsync.publicQuads'); rejectOversizedRdfLiterals(privateQuads, 'publishAsync.privateQuads'); const partitioned = partitionPublishAsyncQuads(publicQuads, privateQuads); @@ -1292,8 +1297,8 @@ export class PublishMethods extends DKGAgentBase { ): Promise { const ctx = opts?.operationCtx ?? createOperationContext('publish'); const onPhase = opts?.onPhase; + quads = normalizePublicRdfLiterals(quads, 'agent.publish.quads'); this.log.info(ctx, `Starting publish to context graph "${contextGraphId}" with ${quads.length} triples`); - rejectOversizedRdfLiterals(quads, 'agent.publish.quads'); rejectOversizedRdfLiterals(privateQuads, 'agent.publish.privateQuads'); const isSystem = contextGraphId === SYSTEM_CONTEXT_GRAPHS.AGENTS || contextGraphId === SYSTEM_CONTEXT_GRAPHS.ONTOLOGY; @@ -1528,8 +1533,8 @@ export class PublishMethods extends DKGAgentBase { ): Promise { const ctx = opts?.operationCtx ?? createOperationContext('update'); const onPhase = opts?.onPhase; + quads = normalizePublicRdfLiterals(quads, 'agent.update.quads'); this.log.info(ctx, `Starting update of kaId=${kaId} in context graph "${contextGraphId}" with ${quads.length} triples`); - rejectOversizedRdfLiterals(quads, 'agent.update.quads'); rejectOversizedRdfLiterals(privateQuads, 'agent.update.privateQuads'); // GH #842: thread the on-chain cgId so the publisher can promote the update // payload into the per-cgId partition the RS prover reads. Without it, @@ -2640,6 +2645,7 @@ export class PublishMethods extends DKGAgentBase { privateQuads?: Quad[]; }, ): Promise { + quads = normalizePublicRdfLiterals(quads, '_buildPrecomputedAttestationForSelection.quads'); if ( opts?.authorAgentAddress != null && opts?.preSignedAuthorAttestation != null diff --git a/packages/agent/test/publish-literal-size.test.ts b/packages/agent/test/publish-literal-size.test.ts index bc626dee6..7a4368762 100644 --- a/packages/agent/test/publish-literal-size.test.ts +++ b/packages/agent/test/publish-literal-size.test.ts @@ -1,6 +1,10 @@ import { describe, expect, it, vi } from 'vitest'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, +} from '@origintrail-official/dkg-core'; import { PublishMethods } from '../src/dkg-agent-publish.js'; -import type { Quad } from '@origintrail-official/dkg-storage'; +import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; const OVERSIZED_TEXT_QUAD: Quad = { subject: 'http://example.org/root', @@ -9,6 +13,11 @@ const OVERSIZED_TEXT_QUAD: Quad = { graph: 'http://example.org/graph', }; +const OVERSIZED_NAME_QUAD: Quad = { + ...OVERSIZED_TEXT_QUAD, + predicate: 'http://schema.org/name', +}; + describe('agent publish literal size validation', () => { it('rejects publishAsync private quads before workspace staging', async () => { const agentStub = { @@ -32,7 +41,48 @@ describe('agent publish literal size validation', () => { }); }); - it('rejects direct publish quads before chain or publisher work', async () => { + it('chunks publishAsync public schema:text before workspace staging', async () => { + const writeToWorkspace = vi.fn(async () => ({ + shareOperationId: 'swm-test', + message: new Uint8Array([1, 2, 3]), + })); + const agentStub = { + contextGraphExists: vi.fn(async () => true), + publisher: { + writeToWorkspace, + }, + peerId: 'peer-test', + store: new OxigraphStore(), + log: { + warn: vi.fn(), + }, + buildAsyncLiftSeal: vi.fn(async () => undefined), + }; + + const result = await PublishMethods.prototype.publishAsync.call( + agentStub as never, + 'computer-history', + { + publicQuads: [OVERSIZED_TEXT_QUAD], + privateQuads: [], + }, + { localOnly: true }, + ); + + expect(result.captureID).toEqual(expect.any(String)); + const stagedQuads = writeToWorkspace.mock.calls[0]?.[1] as Quad[]; + expect(stagedQuads.some((quad) => + quad.subject === OVERSIZED_TEXT_QUAD.subject && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(stagedQuads.some((quad) => + quad.subject === OVERSIZED_TEXT_QUAD.subject && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(stagedQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + }); + + it('rejects direct publish non-text quads before chain or publisher work', async () => { const agentStub = { log: { info: vi.fn() }, }; @@ -41,13 +91,13 @@ describe('agent publish literal size validation', () => { PublishMethods.prototype._publish.call( agentStub as never, 'computer-history', - [OVERSIZED_TEXT_QUAD], + [OVERSIZED_NAME_QUAD], ), ).rejects.toMatchObject({ code: 'OVERSIZED_RDF_LITERAL', actualBytes: 60_002, maxBytes: 60_000, - predicate: 'http://schema.org/text', + predicate: 'http://schema.org/name', }); }); }); diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index d77338ad1..e09c30759 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -9,6 +9,8 @@ import { PayloadTooLargeError, assertQuadLiteralsMutf8Safe, isOversizedRdfLiteralError, + normalizeLargeRdfLiteralsForBlazegraph, + type RdfTextLiteralRewrite, validateContextGraphId, validateSubGraphName, isSafeIri, @@ -36,6 +38,7 @@ export interface PublishRequestBody { contextGraphId: string; quads: PublishQuad[]; privateQuads?: PublishQuad[]; + literalRewrites?: RdfTextLiteralRewrite[]; accessPolicy?: PublishAccessPolicy; allowedPeers?: string[]; subGraphName?: string; @@ -113,6 +116,8 @@ export function respondWithDaemonError(res: ServerResponse, err: any): void { (typeof err?.message === 'string' && err.message.includes('reserved namespace')) ) { jsonResponse(res, 400, { error: err.message }); + } else if (isOversizedRdfLiteralError(err)) { + jsonResponse(res, 400, oversizedRdfLiteralResponseBody(err)); } else if (isNoFundedPublisherWalletLike(err)) { // Funded-wallet selection found no operational wallet with gas + TRAC — a // user-actionable funding condition (4xx), not a server bug. @@ -217,6 +222,21 @@ export function validateWritableQuadLiteralSizes( } } +export function normalizeWritableQuadLiterals( + label: string, + quads: T[], +): { ok: true; quads: T[]; rewrites: RdfTextLiteralRewrite[] } | { ok: false; body: Record } { + try { + const result = normalizeLargeRdfLiteralsForBlazegraph(quads, { label }); + return { ok: true, quads: result.quads as T[], rewrites: result.rewrites }; + } catch (err) { + if (isOversizedRdfLiteralError(err)) { + return { ok: false, body: oversizedRdfLiteralResponseBody(err) }; + } + throw err; + } +} + /** * GH #306 / #787 (follow-up) — validate each quad's `object` term is either a * quoted RDF literal (`"…"`) or an absolute IRI. Shared by the publish path AND @@ -407,9 +427,9 @@ export function parsePublishRequestBody( } const quadObjectError = validateQuadObjectTerms("quads", quads); if (quadObjectError) return { ok: false, error: quadObjectError }; - const quadSize = validateWritableQuadLiteralSizes("quads", quads); - if (!quadSize.ok) { - return { ok: false, error: String(quadSize.body.error ?? 'Oversized RDF literal'), body: quadSize.body }; + const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + if (!normalizedQuads.ok) { + return { ok: false, error: String(normalizedQuads.body.error ?? 'Oversized RDF literal'), body: normalizedQuads.body }; } if ( @@ -501,8 +521,9 @@ export function parsePublishRequestBody( ok: true, value: { contextGraphId, - quads, + quads: normalizedQuads.quads, privateQuads, + ...(normalizedQuads.rewrites.length > 0 ? { literalRewrites: normalizedQuads.rewrites } : {}), accessPolicy, allowedPeers, subGraphName: subGraphName as string | undefined, diff --git a/packages/cli/src/daemon/routes/knowledge-assets-import.ts b/packages/cli/src/daemon/routes/knowledge-assets-import.ts index d31e4d03d..f6090571c 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets-import.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets-import.ts @@ -25,6 +25,7 @@ import { validateAssertionName, PayloadTooLargeError, assertQuadLiteralsMutf8Safe, + normalizeLargeRdfLiteralsForBlazegraph, IMPORTED_ARTIFACT_MAX_PAGE_BYTES, isDkgContentHash, verifyDkgContentHash, @@ -42,6 +43,7 @@ import { normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, oversizedRdfLiteralResponseBody, + normalizeWritableQuadLiterals, SMALL_BODY_BYTES, MAX_UPLOAD_BYTES, type ImportFileExtractionPayload, @@ -637,7 +639,11 @@ export async function handleKaSemanticEnrichmentWrite(ctx: RequestContext): Prom generationMethod, semanticQuads, }); - const quads = [...semanticQuads, ...provenanceQuads]; + const normalizedQuads = normalizeWritableQuadLiterals("semanticQuads", [...semanticQuads, ...provenanceQuads]); + if (!normalizedQuads.ok) { + return jsonResponse(res, 400, normalizedQuads.body); + } + const quads = normalizedQuads.quads; const targetAssertionUri = contextGraphAssertionUri( artifact.contextGraphId, artifact.assertionAgentAddress, @@ -1840,9 +1846,17 @@ export async function handleKaImportFile(ctx: RequestContext, name: string): Pro }); } try { - assertQuadLiteralsMutf8Safe([...dataGraphQuads, ...metaQuads], { + const normalizedImportQuads = normalizeLargeRdfLiteralsForBlazegraph([...dataGraphQuads, ...metaQuads], { label: 'import-file.quads', }); + const normalizedGraphQuads = normalizedImportQuads.quads as Array<{ + subject: string; + predicate: string; + object: string; + graph: string; + }>; + dataGraphQuads = normalizedGraphQuads.filter((q) => q.graph === assertionGraph); + metaQuads.splice(0, metaQuads.length, ...normalizedGraphQuads.filter((q) => q.graph === metaGraph)); } catch (err: any) { if (err?.code === "OVERSIZED_RDF_LITERAL") { const oversizedBody = oversizedRdfLiteralResponseBody(err); diff --git a/packages/cli/src/daemon/routes/knowledge-assets.ts b/packages/cli/src/daemon/routes/knowledge-assets.ts index 8962b2369..109875a13 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets.ts @@ -41,7 +41,7 @@ import { respondIfReconcileUnavailable, respondIfChainRpcTransportError, sanitizeRpcMessage, - validateWritableQuadLiteralSizes, + normalizeWritableQuadLiterals, normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, isNoFundedPublisherWalletLike, @@ -677,6 +677,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< contextGraphId, quads, privateQuads, + literalRewrites, accessPolicy, allowedPeers, subGraphName, @@ -736,6 +737,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< ...(chain?.blockNumber !== undefined ? { blockNumber: chain.blockNumber } : {}), ...(chain?.batchId !== undefined ? { batchId: String(chain.batchId) } : {}), ...(chain?.publisherAddress ? { publisherAddress: chain.publisherAddress } : {}), + ...(literalRewrites && literalRewrites.length > 0 ? { literalRewrites } : {}), ...(typeof pub?.contextGraphError === "string" ? { contextGraphError: pub.contextGraphError } : {}), ...(reason ? { error: reason } : {}), }); @@ -819,12 +821,16 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< // author attestation and reserves the on-chain identity, so it requires the // CG to be registered). OT-RFC-43 §10.5.5. const hasQuads = Array.isArray(quads) && quads.length > 0; + let quadsToWrite = quads; if (hasQuads) { if (!quads.every(isWritableQuad)) { return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const objErr = validateQuadObjectTerms("quads", quads); + if (objErr) return jsonResponse(res, 400, { error: objErr }); + const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quadsToWrite = normalizedQuads.quads; } const shouldFinalize = hasQuads && finalize !== false; // #1116 D5: the create ROUTE stays a primitive — create+write+seal, with @@ -897,9 +903,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< // explicit finalize:false leaves an editable WM draft and never touches // the chain (OT-RFC-43 §10.5.5). `also*` are opt-in transitions on top. if (hasQuads) { - await agent.assertion.write(resolvedContextGraphId, name, quads, { subGraphName, ...atomicAuthorLane }); - result.written = quads.length; - emitMemoryGraphChanged?.({ contextGraphId: resolvedContextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: quads.length } }); + await agent.assertion.write(resolvedContextGraphId, name, quadsToWrite, { subGraphName, ...atomicAuthorLane }); + result.written = quadsToWrite.length; + emitMemoryGraphChanged?.({ contextGraphId: resolvedContextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: quadsToWrite.length } }); } if (shouldFinalize) { const seal = await agent.assertion.finalize(resolvedContextGraphId, name, { @@ -1143,8 +1149,8 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< // literal nor an absolute IRI before they reach (and crash) the parser. const wmObjErr = validateQuadObjectTerms("quads", parsed.quads); if (wmObjErr) return jsonResponse(res, 400, { error: wmObjErr }); - const literalSize = validateWritableQuadLiteralSizes("quads", parsed.quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = normalizeWritableQuadLiterals("quads", parsed.quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); // A bare write to a name that was never created used to fall through to // the legacy `/assertion/{addr}/{name}` graph and produce a KA that is // permanently 404 in the descriptor API (no `_meta` lifecycle record, @@ -1170,9 +1176,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< ...writeAuthorLane, }); } - await agent.assertion.write(contextGraphId, name, parsed.quads, { subGraphName, ...writeAuthorLane }); - emitMemoryGraphChanged?.({ contextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: parsed.quads.length } }); - return jsonResponse(res, 200, { written: parsed.quads.length }); + await agent.assertion.write(contextGraphId, name, normalizedQuads.quads, { subGraphName, ...writeAuthorLane }); + emitMemoryGraphChanged?.({ contextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: normalizedQuads.quads.length } }); + return jsonResponse(res, 200, { written: normalizedQuads.quads.length }); } if (verb === "finalize") { const finalizeOptions = resolveFinalizeOptions(parsed, res, writePreflightCallerAgentAddress); diff --git a/packages/cli/src/daemon/routes/memory.ts b/packages/cli/src/daemon/routes/memory.ts index adeef15fc..3d8cdac2e 100644 --- a/packages/cli/src/daemon/routes/memory.ts +++ b/packages/cli/src/daemon/routes/memory.ts @@ -200,7 +200,7 @@ import { isPublishQuad, isWritableQuad, validateQuadObjectTerms, - validateWritableQuadLiteralSizes, + normalizeWritableQuadLiterals, oversizedRdfLiteralResponseBody, parsePublishRequestBody, jsonResponse, @@ -639,7 +639,7 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { const graph = `did:dkg:context-graph:${resolvedContextGraphId}/meta/query-catalog`; try { assertSafeIri(graph); - const normalized = quads.map((quad: unknown, index: number) => { + let normalized = quads.map((quad: unknown, index: number) => { if (!quad || typeof quad !== "object" || Array.isArray(quad)) { throw new Error(`quads[${index}] must be an object`); } @@ -670,8 +670,9 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { }; }); - const literalSize = validateWritableQuadLiteralSizes("quads", normalized); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = normalizeWritableQuadLiterals("quads", normalized); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + normalized = normalizedQuads.quads; await agent.store.insert(normalized); return jsonResponse(res, 200, { ok: true, @@ -1629,7 +1630,7 @@ WHERE { const body = await readBody(req); const parsed = safeParseJson(body, res); if (!parsed) return; - const { quads, subGraphName } = parsed; + let { quads, subGraphName } = parsed; const localOnly = parsed.localOnly === true; if ( parsed.localOnly !== undefined && @@ -1661,8 +1662,9 @@ WHERE { const objErr = validateQuadObjectTerms("quads", quads); if (objErr) return jsonResponse(res, 400, { error: objErr }); } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quads = normalizedQuads.quads; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2251,7 +2253,7 @@ WHERE { const body = await readBody(req); const parsed = safeParseJson(body, res); if (!parsed) return; - const { quads, conditions, subGraphName } = parsed; + let { quads, conditions, subGraphName } = parsed; const contextGraphId = parsed.contextGraphId; if (!quads?.length) return jsonResponse(res, 400, { error: 'Missing "quads"' }); @@ -2265,8 +2267,9 @@ WHERE { const objErr = validateQuadObjectTerms("quads", quads); if (objErr) return jsonResponse(res, 400, { error: objErr }); } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quads = normalizedQuads.quads; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2462,14 +2465,15 @@ WHERE { }); } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + const quadsToWrite = normalizedQuads.quads; // 5. Write to target layer try { if (targetLayer === 'swm') { // agent.share sets the graph field itself — pass quads with empty graph - const shareQuads = quads.map(({ subject, predicate, object }) => ({ subject, predicate, object, graph: '' })); + const shareQuads = quadsToWrite.map(({ subject, predicate, object }) => ({ subject, predicate, object, graph: '' })); const ctx = createOperationContext('share'); tracker.start(ctx, { contextGraphId: resolvedContextGraphId, details: { tripleCount: shareQuads.length, source: 'memory-turn', subGraphName } }); try { @@ -2487,7 +2491,7 @@ WHERE { throw err; } } else { - await agent.store.insert(quads); + await agent.store.insert(quadsToWrite); } } catch (err: any) { if (err?.code === "OVERSIZED_RDF_LITERAL") { @@ -2501,7 +2505,7 @@ WHERE { subGraphName, operation: "memory_turn_written", source: "memory-turn", - counts: { triples: quads.length }, + counts: { triples: quadsToWrite.length }, }); // 6. Generate embedding (best-effort, non-blocking for response) @@ -2532,7 +2536,7 @@ WHERE { graph: targetGraph, structuralTripleCount: extractResult.triples.length, semanticTripleCount: semanticTriples.length, - totalQuads: quads.length, + totalQuads: quadsToWrite.length, embeddingId, sessionUri: sessionUri ?? null, }); diff --git a/packages/cli/src/daemon/routes/shared-assertion-helpers.ts b/packages/cli/src/daemon/routes/shared-assertion-helpers.ts index 967ef1fe3..2a13f60d0 100644 --- a/packages/cli/src/daemon/routes/shared-assertion-helpers.ts +++ b/packages/cli/src/daemon/routes/shared-assertion-helpers.ts @@ -18,7 +18,6 @@ import { ImportedArtifactMetadataError, isDkgContentHash, resolveImportedArtifactMetadata, - assertRdfLiteralMutf8Safe, } from '@origintrail-official/dkg-core'; import { type PromoteJob, type PromoteJobState } from '@origintrail-official/dkg-publisher'; import { daemonState } from '../state.js'; @@ -325,11 +324,6 @@ export function normalizeSemanticQuads(raw: unknown): Array<{ subject: string; p } else { normalizedObject = rdfLiteral(object); } - assertRdfLiteralMutf8Safe(normalizedObject, { - label: `semanticQuads[${index}].object`, - subject, - predicate, - }); return { subject, predicate, object: normalizedObject }; }); } diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 190daf3f0..9f13c8412 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -1,4 +1,8 @@ import { describe, expect, it } from 'vitest'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, +} from '@origintrail-official/dkg-core'; import { buildImportFileResponse, parsePublishRequestBody, @@ -8,7 +12,7 @@ import { const OVERSIZED_LITERAL = `"${'x'.repeat(60_000)}"`; describe('HTTP RDF literal size validation', () => { - it('returns structured parse failure for oversized direct publish literals', () => { + it('normalizes oversized direct publish schema:text literals', () => { const parsed = parsePublishRequestBody(JSON.stringify({ contextGraphId: 'literal-size-cg', quads: [{ @@ -19,6 +23,43 @@ describe('HTTP RDF literal size validation', () => { }], })); + expect(parsed.ok).toBe(true); + if (parsed.ok) { + expect(parsed.value.literalRewrites).toHaveLength(1); + expect(parsed.value.literalRewrites?.[0]).toMatchObject({ + subject: 'http://example.org/s', + predicate: 'http://schema.org/text', + originalMutf8Bytes: 60_002, + }); + expect(parsed.value.quads.some((quad) => + quad.subject === 'http://example.org/s' && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(parsed.value.quads.some((quad) => + quad.subject === 'http://example.org/s' && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(parsed.value.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + } + }); + + it('returns structured parse failure for oversized private publish literals', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + quads: [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + object: '"safe"', + graph: 'http://example.org/g', + }], + privateQuads: [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: 'http://example.org/g', + }], + })); + expect(parsed.ok).toBe(false); if (!parsed.ok) { expect(parsed.body).toMatchObject({ @@ -30,10 +71,10 @@ describe('HTTP RDF literal size validation', () => { } }); - it('returns structured validation failure for writable route quads', () => { + it('returns structured validation failure for reject-only private/writable guards', () => { const result = validateWritableQuadLiteralSizes('quads', [{ subject: 'http://example.org/s', - predicate: 'http://schema.org/text', + predicate: 'http://schema.org/name', object: OVERSIZED_LITERAL, }]); diff --git a/packages/cli/test/issue-306-787-write-quad-validation.test.ts b/packages/cli/test/issue-306-787-write-quad-validation.test.ts index c59003532..1ff9f9916 100644 --- a/packages/cli/test/issue-306-787-write-quad-validation.test.ts +++ b/packages/cli/test/issue-306-787-write-quad-validation.test.ts @@ -54,14 +54,13 @@ describe('GH #787 — POST /api/shared-memory/write quad-shape validation', () = expect(status, JSON.stringify(body)).toBe(200); }); - it('returns 400 for oversized RDF literals before SWM write', async () => { + it('chunks oversized schema:text literals before SWM write', async () => { const { status, body } = await postJson(daemon!, '/api/shared-memory/write', { contextGraphId: CG, quads: [{ subject: 'urn:wq:oversized-swm', predicate: 'http://schema.org/text', object: OVERSIZED_LITERAL }], }); - expect(status, JSON.stringify(body)).toBe(400); - expect(body.code).toBe('OVERSIZED_RDF_LITERAL'); - expect(body.actualBytes).toBeGreaterThan(60_000); + expect(status, JSON.stringify(body)).toBe(200); + expect(body.triplesWritten).toBeGreaterThan(1); }); }); @@ -86,16 +85,15 @@ describe('GH #306 — POST /api/knowledge-assets/{name}/wm/write quad-shape vali expect(status, JSON.stringify(body)).toBe(200); }); - it('returns 400 for oversized RDF literals before WM write', async () => { + it('chunks oversized schema:text literals before WM write', async () => { const created = await postJson(daemon!, '/api/knowledge-assets', { contextGraphId: CG, name: 'ka-306-oversized' }); expect(created.status).toBeLessThan(300); const { status, body } = await postJson(daemon!, '/api/knowledge-assets/ka-306-oversized/wm/write', { contextGraphId: CG, quads: [{ subject: 'urn:wq:oversized-wm', predicate: 'http://schema.org/text', object: OVERSIZED_LITERAL }], }); - expect(status, JSON.stringify(body)).toBe(400); - expect(body.code).toBe('OVERSIZED_RDF_LITERAL'); - expect(body.actualBytes).toBeGreaterThan(60_000); + expect(status, JSON.stringify(body)).toBe(200); + expect(body.written).toBeGreaterThan(1); }); }); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 03cbb02b5..c3bbe57c9 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -206,13 +206,40 @@ export { DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, OVERSIZED_RDF_LITERAL_ERROR_CODE, OversizedRdfLiteralError, + RDF_TYPE_IRI, + XSD_INTEGER_IRI, + XSD_STRING_IRI, + SCHEMA_TEXT_PREDICATES, + DKG_TEXT_BODY_CLASS, + DKG_TEXT_CHUNK_CLASS, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, + DKG_TEXT_SOURCE_PREDICATE, + DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_LITERAL_TERM_SHA256, + DKG_TEXT_LITERAL_MUTF8_BYTES, + DKG_TEXT_UTF8_BYTES, + DKG_TEXT_CHUNK_COUNT, + DKG_TEXT_CHUNK_LIMIT, + DKG_TEXT_LANGUAGE, + DKG_TEXT_DATATYPE, + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, javaModifiedUtf8ByteLength, rdfLiteralTermMutf8ByteLength, + parseRdfLiteralTerm, isOversizedRdfLiteralError, assertRdfLiteralMutf8Safe, assertQuadLiteralsMutf8Safe, + normalizeLargeRdfLiteralsForBlazegraph, + reconstructChunkedTextBodies, type RdfLiteralSizeContext, type QuadLiteralLike, + type ParsedRdfLiteralTerm, + type RdfTextLiteralRewrite, + type RdfLiteralNormalizationResult, + type RdfLiteralNormalizationOptions, + type ReconstructedChunkedTextBody, } from './rdf-literal-size.js'; export { DKGError, diff --git a/packages/core/src/rdf-literal-size.ts b/packages/core/src/rdf-literal-size.ts index c98a810cd..05478a373 100644 --- a/packages/core/src/rdf-literal-size.ts +++ b/packages/core/src/rdf-literal-size.ts @@ -1,9 +1,35 @@ +import { createHash } from 'node:crypto'; import { DKGUserError } from './errors.js'; +import { isSafeIri } from './sparql-safe.js'; export const JAVA_WRITE_UTF_MAX_BYTES = 65_535; export const DKG_RDF_LITERAL_SAFE_MUTF8_BYTES = 60_000; export const OVERSIZED_RDF_LITERAL_ERROR_CODE = 'OVERSIZED_RDF_LITERAL'; +export const RDF_TYPE_IRI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; +export const XSD_INTEGER_IRI = 'http://www.w3.org/2001/XMLSchema#integer'; +export const XSD_STRING_IRI = 'http://www.w3.org/2001/XMLSchema#string'; +export const SCHEMA_TEXT_PREDICATES = [ + 'http://schema.org/text', + 'https://schema.org/text', +] as const; + +export const DKG_TEXT_BODY_CLASS = 'http://dkg.io/ontology/TextBody'; +export const DKG_TEXT_CHUNK_CLASS = 'http://dkg.io/ontology/TextChunk'; +export const DKG_HAS_TEXT_BODY = 'http://dkg.io/ontology/hasTextBody'; +export const DKG_HAS_TEXT_CHUNK = 'http://dkg.io/ontology/hasTextChunk'; +export const DKG_TEXT_SOURCE_PREDICATE = 'http://dkg.io/ontology/textSourcePredicate'; +export const DKG_TEXT_CONTENT_SHA256 = 'http://dkg.io/ontology/textContentSha256'; +export const DKG_TEXT_LITERAL_TERM_SHA256 = 'http://dkg.io/ontology/textLiteralTermSha256'; +export const DKG_TEXT_LITERAL_MUTF8_BYTES = 'http://dkg.io/ontology/textLiteralMutf8Bytes'; +export const DKG_TEXT_UTF8_BYTES = 'http://dkg.io/ontology/textUtf8Bytes'; +export const DKG_TEXT_CHUNK_COUNT = 'http://dkg.io/ontology/textChunkCount'; +export const DKG_TEXT_CHUNK_LIMIT = 'http://dkg.io/ontology/textChunkMutf8Limit'; +export const DKG_TEXT_LANGUAGE = 'http://dkg.io/ontology/textLanguage'; +export const DKG_TEXT_DATATYPE = 'http://dkg.io/ontology/textDatatype'; +export const DKG_CHUNK_INDEX = 'http://dkg.io/ontology/chunkIndex'; +export const DKG_CHUNK_VALUE = 'http://dkg.io/ontology/chunkValue'; + export interface RdfLiteralSizeContext { readonly label?: string; readonly subject?: string; @@ -19,6 +45,50 @@ export interface QuadLiteralLike { readonly graph?: string; } +export interface ParsedRdfLiteralTerm { + readonly lexical: string; + readonly suffix: string; + readonly language?: string; + readonly datatype?: string; +} + +export interface RdfTextLiteralRewrite { + readonly subject: string; + readonly predicate: string; + readonly graph?: string; + readonly bodySubject: string; + readonly originalMutf8Bytes: number; + readonly originalUtf8Bytes: number; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; +} + +export interface RdfLiteralNormalizationResult { + readonly quads: QuadLiteralLike[]; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export interface RdfLiteralNormalizationOptions { + readonly maxBytes?: number; + readonly chunkMaxBytes?: number; + readonly textPredicates?: Iterable; + readonly label?: string; +} + +export interface ReconstructedChunkedTextBody { + readonly subject: string; + readonly bodySubject: string; + readonly sourcePredicate: string; + readonly lexical: string; + readonly literalTerm: string; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; + readonly language?: string; + readonly datatype?: string; +} + export class OversizedRdfLiteralError extends DKGUserError { readonly code = OVERSIZED_RDF_LITERAL_ERROR_CODE; readonly actualBytes: number; @@ -102,6 +172,37 @@ export function rdfLiteralTermMutf8ByteLength(term: string): number | undefined return javaModifiedUtf8ByteLength(term); } +export function parseRdfLiteralTerm(term: string): ParsedRdfLiteralTerm | null { + if (!term.startsWith('"')) return null; + let escaped = false; + for (let i = 1; i < term.length; i++) { + const ch = term[i]!; + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (ch !== '"') continue; + + try { + const body = term.slice(1, i); + const suffix = term.slice(i + 1); + const metadata = parseLiteralSuffix(suffix); + return { + lexical: decodeRdfLiteralBody(body), + suffix, + ...metadata, + }; + } catch { + return null; + } + } + return null; +} + export function isOversizedRdfLiteralError(err: unknown): err is OversizedRdfLiteralError { if (err instanceof OversizedRdfLiteralError) return true; if (!err || typeof err !== 'object') return false; @@ -141,3 +242,400 @@ export function assertQuadLiteralsMutf8Safe( }); } } + +export function normalizeLargeRdfLiteralsForBlazegraph( + quads: readonly T[], + options: RdfLiteralNormalizationOptions = {}, +): RdfLiteralNormalizationResult { + const maxBytes = options.maxBytes ?? DKG_RDF_LITERAL_SAFE_MUTF8_BYTES; + const chunkMaxBytes = options.chunkMaxBytes ?? maxBytes; + validateNormalizationLimits(maxBytes, chunkMaxBytes); + + const textPredicates = new Set(options.textPredicates ?? SCHEMA_TEXT_PREDICATES); + const normalized: T[] = []; + const rewrites: RdfTextLiteralRewrite[] = []; + + for (let i = 0; i < quads.length; i++) { + const quad = quads[i]!; + const literalBytes = rdfLiteralTermMutf8ByteLength(quad.object); + if (literalBytes === undefined || literalBytes <= maxBytes) { + normalized.push({ ...quad }); + continue; + } + + if (!textPredicates.has(quad.predicate)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const parsed = parseRdfLiteralTerm(quad.object); + if (!parsed || !isChunkableTextLiteral(parsed)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + if (!isSafeIri(quad.subject)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const canonicalLiteralTerm = rdfLiteralTerm(parsed.lexical, parsed.suffix); + const literalTermSha256 = sha256Hex(canonicalLiteralTerm); + const lexicalSha256 = sha256Hex(parsed.lexical); + const bodyIdentitySha256 = sha256Hex(`${quad.predicate}\u0000${canonicalLiteralTerm}`); + const bodySubject = `${quad.subject}/.well-known/genid/dkg-text-body-${bodyIdentitySha256}`; + if (!isSafeIri(bodySubject)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const chunks = splitLexicalIntoSafeChunks(parsed.lexical, chunkMaxBytes); + const graph = quad.graph ?? ''; + const bodyQuads = buildTextBodyQuads({ + source: quad, + graph, + bodySubject, + parsed, + chunks, + maxBytes, + chunkMaxBytes, + originalMutf8Bytes: literalBytes, + lexicalSha256, + literalTermSha256, + }) as T[]; + normalized.push(...bodyQuads); + rewrites.push({ + subject: quad.subject, + predicate: quad.predicate, + graph: quad.graph, + bodySubject, + originalMutf8Bytes: literalBytes, + originalUtf8Bytes: utf8ByteLength(parsed.lexical), + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + }); + } + + assertQuadLiteralsMutf8Safe(normalized, { + label: options.label ? `${options.label}.normalized` : 'normalizedQuads', + maxBytes, + }); + return { quads: normalized, rewrites }; +} + +export function reconstructChunkedTextBodies( + quads: readonly QuadLiteralLike[], + options: { subject?: string; bodySubject?: string; sourcePredicate?: string } = {}, +): ReconstructedChunkedTextBody[] { + const bySubject = indexQuadsBySubject(quads); + const bodyLinks = quads.filter((q) => + q.predicate === DKG_HAS_TEXT_BODY && + (!options.subject || q.subject === options.subject) && + (!options.bodySubject || q.object === options.bodySubject) + ); + const explicitBody = options.bodySubject && bodyLinks.length === 0 + ? [{ subject: findOwnerSubject(quads, options.bodySubject), object: options.bodySubject }] + : []; + const bodies = [...bodyLinks, ...explicitBody]; + const reconstructed: ReconstructedChunkedTextBody[] = []; + + for (const link of bodies) { + const bodySubject = link.object; + const bodyQuads = bySubject.get(bodySubject) ?? []; + const sourcePredicate = iriObject(bodyQuads, DKG_TEXT_SOURCE_PREDICATE); + if (!sourcePredicate) throw new Error(`Chunked text body ${bodySubject} is missing source predicate`); + if (options.sourcePredicate && sourcePredicate !== options.sourcePredicate) { + continue; + } + + const count = integerObject(bodyQuads, DKG_TEXT_CHUNK_COUNT); + if (count === undefined) throw new Error(`Chunked text body ${bodySubject} is missing chunk count`); + const lexicalSha256 = literalObject(bodyQuads, DKG_TEXT_CONTENT_SHA256); + if (!lexicalSha256) throw new Error(`Chunked text body ${bodySubject} is missing content hash`); + const literalTermSha256 = literalObject(bodyQuads, DKG_TEXT_LITERAL_TERM_SHA256); + if (!literalTermSha256) throw new Error(`Chunked text body ${bodySubject} is missing literal term hash`); + const language = literalObject(bodyQuads, DKG_TEXT_LANGUAGE); + const datatype = iriObject(bodyQuads, DKG_TEXT_DATATYPE); + const chunkSubjects = bodyQuads.filter((q) => q.predicate === DKG_HAS_TEXT_CHUNK).map((q) => q.object); + if (chunkSubjects.length !== count) { + throw new Error(`Chunked text body ${bodySubject} expected ${count} chunks but found ${chunkSubjects.length}`); + } + + const chunks = chunkSubjects.map((chunkSubject) => { + const chunkQuads = bySubject.get(chunkSubject) ?? []; + const index = integerObject(chunkQuads, DKG_CHUNK_INDEX); + if (index === undefined) throw new Error(`Chunk ${chunkSubject} is missing chunkIndex`); + const valueTerm = literalTermObject(chunkQuads, DKG_CHUNK_VALUE); + if (!valueTerm) throw new Error(`Chunk ${chunkSubject} is missing chunkValue`); + const parsed = parseRdfLiteralTerm(valueTerm); + if (!parsed) throw new Error(`Chunk ${chunkSubject} has invalid chunkValue`); + return { index, lexical: parsed.lexical }; + }).sort((a, b) => a.index - b.index); + + for (let i = 0; i < chunks.length; i++) { + if (chunks[i]!.index !== i) { + throw new Error(`Chunked text body ${bodySubject} has non-contiguous chunk index ${chunks[i]!.index}`); + } + } + + const lexical = chunks.map((chunk) => chunk.lexical).join(''); + const suffix = suffixFromMetadata(language, datatype); + const literalTerm = rdfLiteralTerm(lexical, suffix); + if (sha256Hex(lexical) !== lexicalSha256) { + throw new Error(`Chunked text body ${bodySubject} content hash mismatch`); + } + if (sha256Hex(literalTerm) !== literalTermSha256) { + throw new Error(`Chunked text body ${bodySubject} literal term hash mismatch`); + } + + reconstructed.push({ + subject: link.subject, + bodySubject, + sourcePredicate, + lexical, + literalTerm, + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + ...(language !== undefined ? { language } : {}), + ...(datatype !== undefined ? { datatype } : {}), + }); + } + + return reconstructed; +} + +function validateNormalizationLimits(maxBytes: number, chunkMaxBytes: number): void { + if (!Number.isInteger(maxBytes) || maxBytes <= 0 || maxBytes > JAVA_WRITE_UTF_MAX_BYTES) { + throw new Error(`Invalid maxBytes: ${maxBytes}`); + } + if (!Number.isInteger(chunkMaxBytes) || chunkMaxBytes <= 0 || chunkMaxBytes > maxBytes) { + throw new Error(`Invalid chunkMaxBytes: ${chunkMaxBytes}`); + } +} + +function labelFor(label: string | undefined, index: number): string { + return label ? `${label}[${index}].object` : `quads[${index}].object`; +} + +function throwOversizedForQuad( + quad: QuadLiteralLike, + actualBytes: number, + maxBytes: number, + label: string, +): never { + throw new OversizedRdfLiteralError({ + actualBytes, + maxBytes, + label, + subject: quad.subject, + predicate: quad.predicate, + graph: quad.graph, + }); +} + +function buildTextBodyQuads(args: { + source: QuadLiteralLike; + graph: string; + bodySubject: string; + parsed: ParsedRdfLiteralTerm; + chunks: readonly string[]; + maxBytes: number; + chunkMaxBytes: number; + originalMutf8Bytes: number; + lexicalSha256: string; + literalTermSha256: string; +}): QuadLiteralLike[] { + const quads: QuadLiteralLike[] = [ + { subject: args.source.subject, predicate: DKG_HAS_TEXT_BODY, object: args.bodySubject, graph: args.graph }, + { subject: args.bodySubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_BODY_CLASS, graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_SOURCE_PREDICATE, object: args.source.predicate, graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CONTENT_SHA256, object: rdfLiteralTerm(args.lexicalSha256), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_TERM_SHA256, object: rdfLiteralTerm(args.literalTermSha256), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_MUTF8_BYTES, object: xsdInteger(args.originalMutf8Bytes), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_UTF8_BYTES, object: xsdInteger(utf8ByteLength(args.parsed.lexical)), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_COUNT, object: xsdInteger(args.chunks.length), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_LIMIT, object: xsdInteger(args.chunkMaxBytes), graph: args.graph }, + ]; + + if (args.parsed.language) { + quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_LANGUAGE, object: rdfLiteralTerm(args.parsed.language), graph: args.graph }); + } + if (args.parsed.datatype) { + quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_DATATYPE, object: args.parsed.datatype, graph: args.graph }); + } + + args.chunks.forEach((chunk, index) => { + const chunkSubject = `${args.bodySubject}/chunk-${index}`; + if (!isSafeIri(chunkSubject)) { + throw new OversizedRdfLiteralError({ + actualBytes: args.originalMutf8Bytes, + maxBytes: args.maxBytes, + subject: args.source.subject, + predicate: args.source.predicate, + graph: args.source.graph, + }); + } + quads.push( + { subject: args.bodySubject, predicate: DKG_HAS_TEXT_CHUNK, object: chunkSubject, graph: args.graph }, + { subject: chunkSubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_CHUNK_CLASS, graph: args.graph }, + { subject: chunkSubject, predicate: DKG_CHUNK_INDEX, object: xsdInteger(index), graph: args.graph }, + { subject: chunkSubject, predicate: DKG_CHUNK_VALUE, object: rdfLiteralTerm(chunk), graph: args.graph }, + ); + }); + + return quads; +} + +function isChunkableTextLiteral(parsed: ParsedRdfLiteralTerm): boolean { + return parsed.suffix === '' || + parsed.language !== undefined || + parsed.datatype === XSD_STRING_IRI; +} + +function splitLexicalIntoSafeChunks(lexical: string, maxBytes: number): string[] { + const chunks: string[] = []; + let current = ''; + let currentBytes = javaModifiedUtf8ByteLength('""'); + + for (const ch of lexical) { + const chBytes = serializedLiteralBodyMutf8ByteLength(ch); + if (current.length > 0 && currentBytes + chBytes > maxBytes) { + chunks.push(current); + current = ch; + currentBytes = javaModifiedUtf8ByteLength('""') + chBytes; + } else { + current += ch; + currentBytes += chBytes; + } + + if (currentBytes > maxBytes) { + throw new Error('A single literal character exceeds the MUTF-8 chunk budget'); + } + } + + if (current.length > 0 || lexical.length === 0) chunks.push(current); + return chunks; +} + +function serializedLiteralBodyMutf8ByteLength(value: string): number { + const serialized = JSON.stringify(value); + return javaModifiedUtf8ByteLength(serialized.slice(1, -1)); +} + +function parseLiteralSuffix(suffix: string): { language?: string; datatype?: string } { + if (suffix === '') return {}; + const language = /^@([A-Za-z]+(?:-[A-Za-z0-9]+)*)$/.exec(suffix); + if (language) return { language: language[1] }; + const datatype = /^\^\^<([^<>"{}|\\^`\x00-\x20>]+)>$/.exec(suffix); + if (datatype) return { datatype: datatype[1] }; + throw new Error(`Invalid RDF literal suffix: ${suffix.slice(0, 80)}`); +} + +function decodeRdfLiteralBody(body: string): string { + let out = ''; + for (let i = 0; i < body.length; i++) { + const ch = body[i]!; + if (ch !== '\\') { + out += ch; + continue; + } + i += 1; + if (i >= body.length) throw new Error('Invalid trailing RDF literal escape'); + const escaped = body[i]!; + switch (escaped) { + case 't': + out += '\t'; + break; + case 'b': + out += '\b'; + break; + case 'n': + out += '\n'; + break; + case 'r': + out += '\r'; + break; + case 'f': + out += '\f'; + break; + case '"': + case "'": + case '\\': + out += escaped; + break; + case 'u': { + const hex = body.slice(i + 1, i + 5); + if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error('Invalid RDF \\u escape'); + out += String.fromCharCode(parseInt(hex, 16)); + i += 4; + break; + } + case 'U': { + const hex = body.slice(i + 1, i + 9); + if (!/^[0-9a-fA-F]{8}$/.test(hex)) throw new Error('Invalid RDF \\U escape'); + out += String.fromCodePoint(parseInt(hex, 16)); + i += 8; + break; + } + default: + throw new Error(`Invalid RDF literal escape: \\${escaped}`); + } + } + return out; +} + +function rdfLiteralTerm(lexical: string, suffix = ''): string { + return `${JSON.stringify(lexical)}${suffix}`; +} + +function xsdInteger(value: number): string { + return `"${value}"^^<${XSD_INTEGER_IRI}>`; +} + +function sha256Hex(value: string): string { + return createHash('sha256').update(value, 'utf8').digest('hex'); +} + +function utf8ByteLength(value: string): number { + return new TextEncoder().encode(value).length; +} + +function indexQuadsBySubject(quads: readonly QuadLiteralLike[]): Map { + const map = new Map(); + for (const quad of quads) { + const list = map.get(quad.subject); + if (list) list.push(quad); + else map.set(quad.subject, [quad]); + } + return map; +} + +function findOwnerSubject(quads: readonly QuadLiteralLike[], bodySubject: string): string { + return quads.find((q) => q.predicate === DKG_HAS_TEXT_BODY && q.object === bodySubject)?.subject ?? ''; +} + +function literalTermObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object?.startsWith('"') ? object : undefined; +} + +function literalObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { + const term = literalTermObject(quads, predicate); + if (!term) return undefined; + const parsed = parseRdfLiteralTerm(term); + return parsed?.lexical; +} + +function iriObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object && !object.startsWith('"') ? object : undefined; +} + +function integerObject(quads: readonly QuadLiteralLike[], predicate: string): number | undefined { + const value = literalObject(quads, predicate); + if (value === undefined || !/^\d+$/.test(value)) return undefined; + return Number(value); +} + +function suffixFromMetadata(language?: string, datatype?: string): string { + if (language) return `@${language}`; + if (datatype) return `^^<${datatype}>`; + return ''; +} diff --git a/packages/core/test/rdf-literal-size.test.ts b/packages/core/test/rdf-literal-size.test.ts index 87b4fea06..d6b54628d 100644 --- a/packages/core/test/rdf-literal-size.test.ts +++ b/packages/core/test/rdf-literal-size.test.ts @@ -1,11 +1,20 @@ import { describe, expect, it } from 'vitest'; import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, + DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_LITERAL_TERM_SHA256, OVERSIZED_RDF_LITERAL_ERROR_CODE, + XSD_STRING_IRI, assertQuadLiteralsMutf8Safe, assertRdfLiteralMutf8Safe, javaModifiedUtf8ByteLength, + normalizeLargeRdfLiteralsForBlazegraph, + parseRdfLiteralTerm, rdfLiteralTermMutf8ByteLength, + reconstructChunkedTextBodies, } from '../src/rdf-literal-size.js'; describe('rdf literal Java MUTF-8 sizing', () => { @@ -69,4 +78,208 @@ describe('rdf literal Java MUTF-8 sizing', () => { ], { label: 'publish.quads' }), ).toThrow(/publish\.quads\[1\]\.object/); }); + + it('chunks oversized schema text into DKG chunk values and reconstructs exactly', () => { + const lexical = `${'computer history '.repeat(200)}"quoted"\n\\slash 😀`; + const literal = `${JSON.stringify(lexical)}@en`; + const result = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/text', + object: literal, + graph: 'did:dkg:context-graph:test', + }, + ], { + maxBytes: 400, + chunkMaxBytes: 180, + label: 'test.quads', + }); + + expect(result.rewrites).toHaveLength(1); + expect(result.rewrites[0].chunkCount).toBeGreaterThan(1); + expect(result.quads.some((quad) => + quad.subject === 'http://example.org/root' && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + + const bodySubject = result.quads.find((quad) => quad.predicate === DKG_HAS_TEXT_BODY)?.object; + expect(bodySubject).toContain('/.well-known/genid/dkg-text-body-'); + expect(result.quads.some((quad) => + quad.predicate === 'http://schema.org/text' && + quad.subject.includes('/chunk-') + )).toBe(false); + + const chunkValueQuads = result.quads.filter((quad) => quad.predicate === DKG_CHUNK_VALUE); + expect(chunkValueQuads).toHaveLength(result.rewrites[0].chunkCount); + expect(chunkValueQuads.every((quad) => javaModifiedUtf8ByteLength(quad.object) <= 180)).toBe(true); + + const reconstructed = reconstructChunkedTextBodies(result.quads, { + subject: 'http://example.org/root', + sourcePredicate: 'http://schema.org/text', + }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0]).toMatchObject({ + subject: 'http://example.org/root', + bodySubject, + sourcePredicate: 'http://schema.org/text', + lexical, + literalTerm: literal, + language: 'en', + chunkCount: result.rewrites[0].chunkCount, + }); + }); + + it('normalizes idempotently and keeps generated literals below the safe limit', () => { + const oversized = JSON.stringify('x'.repeat(1_000)); + const first = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/idempotent', + predicate: 'https://schema.org/text', + object: oversized, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200, chunkMaxBytes: 100 }); + const second = normalizeLargeRdfLiteralsForBlazegraph(first.quads, { maxBytes: 200, chunkMaxBytes: 100 }); + + expect(second.rewrites).toHaveLength(0); + expect(second.quads).toEqual(first.quads); + expect(first.quads.every((quad) => { + const bytes = rdfLiteralTermMutf8ByteLength(quad.object); + return bytes === undefined || bytes <= 200; + })).toBe(true); + }); + + it('keeps same-subject http and https schema:text bodies distinct', () => { + const object = JSON.stringify('shared body '.repeat(120)); + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/dual', + predicate: 'http://schema.org/text', + object, + graph: 'did:dkg:context-graph:test', + }, + { + subject: 'http://example.org/dual', + predicate: 'https://schema.org/text', + object, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 400, chunkMaxBytes: 180 }); + + expect(normalized.rewrites).toHaveLength(2); + expect(new Set(normalized.rewrites.map((rewrite) => rewrite.bodySubject)).size).toBe(2); + expect(reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/dual', + sourcePredicate: 'http://schema.org/text', + })).toHaveLength(1); + expect(reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/dual', + sourcePredicate: 'https://schema.org/text', + })).toHaveLength(1); + }); + + it('uses canonical literal hashes so escaped source forms reconstruct', () => { + const escaped = `"${'\\u0061'.repeat(260)}"`; + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/escaped', + predicate: 'http://schema.org/text', + object: escaped, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 600, chunkMaxBytes: 120 }); + + const reconstructed = reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/escaped', + }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0].lexical).toBe('a'.repeat(260)); + expect(reconstructed[0].literalTerm).toBe(JSON.stringify('a'.repeat(260))); + }); + + it('preserves xsd:string metadata but rejects arbitrary typed oversized text', () => { + const body = 'typed text '.repeat(100); + const typedString = `${JSON.stringify(body)}^^<${XSD_STRING_IRI}>`; + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/typed', + predicate: 'http://schema.org/text', + object: typedString, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 250, chunkMaxBytes: 100 }); + expect(reconstructChunkedTextBodies(normalized.quads)[0]).toMatchObject({ + datatype: XSD_STRING_IRI, + literalTerm: typedString, + }); + + const arbitraryTyped = `${JSON.stringify(body)}^^`; + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/custom', + predicate: 'http://schema.org/text', + object: arbitraryTyped, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 250, chunkMaxBytes: 100 }), + ).toThrow(/Blazegraph-compatible safe limit/); + }); + + it('rejects oversized non-text literals and unsafe chunk subjects', () => { + const oversized = JSON.stringify('x'.repeat(500)); + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/name', + predicate: 'http://schema.org/name', + object: oversized, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200 }), + ).toThrow(/Blazegraph-compatible safe limit/); + + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: '_:blank', + predicate: 'http://schema.org/text', + object: oversized, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200 }), + ).toThrow(/Blazegraph-compatible safe limit/); + }); + + it('detects corrupt chunk metadata during reconstruction', () => { + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/corrupt', + predicate: 'http://schema.org/text', + object: JSON.stringify('z'.repeat(500)), + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200, chunkMaxBytes: 100 }); + const withoutOneChunk = normalized.quads.filter((quad) => quad.predicate !== DKG_HAS_TEXT_CHUNK || !quad.object.endsWith('/chunk-1')); + expect(() => reconstructChunkedTextBodies(withoutOneChunk)).toThrow(/expected .* chunks/); + + const withoutCount = normalized.quads.filter((quad) => quad.predicate !== 'http://dkg.io/ontology/textChunkCount'); + expect(() => reconstructChunkedTextBodies(withoutCount)).toThrow(/missing chunk count/); + + const withoutHash = normalized.quads.filter((quad) => quad.predicate !== DKG_TEXT_CONTENT_SHA256); + expect(() => reconstructChunkedTextBodies(withoutHash)).toThrow(/missing content hash/); + + const withBadHash = normalized.quads.map((quad) => + quad.predicate === DKG_TEXT_CONTENT_SHA256 || quad.predicate === DKG_TEXT_LITERAL_TERM_SHA256 + ? { ...quad, object: '"bad"' } + : quad + ); + expect(() => reconstructChunkedTextBodies(withBadHash)).toThrow(/hash mismatch/); + }); + + it('parses RDF literal suffixes supported by the chunker', () => { + expect(parseRdfLiteralTerm('"hello"')?.lexical).toBe('hello'); + expect(parseRdfLiteralTerm('"hello"@en-US')?.language).toBe('en-US'); + expect(parseRdfLiteralTerm(`"hello"^^<${XSD_STRING_IRI}>`)?.datatype).toBe(XSD_STRING_IRI); + }); }); diff --git a/packages/publisher/src/dkg-publisher.ts b/packages/publisher/src/dkg-publisher.ts index c1701798c..d4f76beea 100644 --- a/packages/publisher/src/dkg-publisher.ts +++ b/packages/publisher/src/dkg-publisher.ts @@ -2,7 +2,7 @@ import type { Quad, TripleStore } from '@origintrail-official/dkg-storage'; import type { ChainAdapter, OnChainPublishResult, AddBatchToContextGraphParams } from '@origintrail-official/dkg-chain'; import { enrichEvmError } from '@origintrail-official/dkg-chain'; import type { EventBus, OperationContext } from '@origintrail-official/dkg-core'; -import { DKGEvent, Logger, createOperationContext, sha256, encodeWorkspacePublishRequest, encodeEncryptedWorkspacePayload, encryptWorkspacePayload, contextGraphDataUri, contextGraphDataGraphUri, contextGraphMetaUri, contextGraphAssertionUri, contextGraphLayerUri, MemoryLayer, assertionLifecycleUri, contextGraphSubGraphUri, contextGraphSubGraphMetaUri, SYSTEM_CONTEXT_GRAPHS, validateSubGraphName, isSafeIri, assertSafeIri, assertSafeRdfTerm, assertQuadLiteralsMutf8Safe, DKG_GOSSIP_MAX_MESSAGE_BYTES, SwmGossipPayloadTooLargeError, type Ed25519Keypair, buildAuthorAttestationTypedData, buildUpdateAuthorAttestationTypedData, AUTHOR_SCHEME_VERSION_V1, TrustLevel, TRUST_LEVEL_PREDICATE, assertNoUserAuthoredTrustLevelQuads, buildTrustLevelQuads, isTrustLevelQuad, DKG_ENTITY, DKG_ROOT_ENTITY_LEGACY, ENTITY_PRED_ALT, parseAssertionSealQuads, ASSERTION_SEAL_PREDICATES, sharedMemoryReadBothFilter, DKG_ONTOLOGY } from '@origintrail-official/dkg-core'; +import { DKGEvent, Logger, createOperationContext, sha256, encodeWorkspacePublishRequest, encodeEncryptedWorkspacePayload, encryptWorkspacePayload, contextGraphDataUri, contextGraphDataGraphUri, contextGraphMetaUri, contextGraphAssertionUri, contextGraphLayerUri, MemoryLayer, assertionLifecycleUri, contextGraphSubGraphUri, contextGraphSubGraphMetaUri, SYSTEM_CONTEXT_GRAPHS, validateSubGraphName, isSafeIri, assertSafeIri, assertSafeRdfTerm, assertQuadLiteralsMutf8Safe, normalizeLargeRdfLiteralsForBlazegraph, DKG_GOSSIP_MAX_MESSAGE_BYTES, SwmGossipPayloadTooLargeError, type Ed25519Keypair, buildAuthorAttestationTypedData, buildUpdateAuthorAttestationTypedData, AUTHOR_SCHEME_VERSION_V1, TrustLevel, TRUST_LEVEL_PREDICATE, assertNoUserAuthoredTrustLevelQuads, buildTrustLevelQuads, isTrustLevelQuad, DKG_ENTITY, DKG_ROOT_ENTITY_LEGACY, ENTITY_PRED_ALT, parseAssertionSealQuads, ASSERTION_SEAL_PREDICATES, sharedMemoryReadBothFilter, DKG_ONTOLOGY } from '@origintrail-official/dkg-core'; import { GraphManager, PrivateContentStore } from '@origintrail-official/dkg-storage'; import { DEFAULT_PUBLISH_EPOCHS, MAX_PUBLISH_EPOCHS, type Publisher, type PublishOptions, type PublishResult, type KAManifestEntry, type PhaseCallback, type V10CoreNodeACK } from './publisher.js'; import { skolemizeByEntity } from './auto-partition.js'; @@ -414,6 +414,10 @@ function rejectOversizedRdfLiterals(quads: Quad[], label: string): void { assertQuadLiteralsMutf8Safe(quads, { label }); } +function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { + return normalizeLargeRdfLiteralsForBlazegraph(quads, { label }).quads as Quad[]; +} + async function stampTrustLevel( store: TripleStore, graph: string, @@ -1074,7 +1078,7 @@ export class DKGPublisher implements Publisher { // reserved-namespace violation cannot be masked by a lock timeout // or subject-level validation error downstream. rejectUserAuthoredProtocolMetadata(quads); - rejectOversizedRdfLiterals(quads, 'share.quads'); + quads = normalizePublicRdfLiterals(quads, 'share.quads'); const subjects = [...new Set(quads.map(q => q.subject))]; const lockPrefix = options.subGraphName ? `${contextGraphId}\0${options.subGraphName}` : contextGraphId; const lockKeys = subjects.map(s => `${lockPrefix}\0${s}`); @@ -1364,12 +1368,17 @@ export class DKGPublisher implements Publisher { // violation with a StaleWriteError). Short-circuit per // `19_MARKDOWN_CONTENT_TYPE.md §10.2`. rejectUserAuthoredProtocolMetadata(quads); - rejectOversizedRdfLiterals(quads, 'conditionalShare.quads'); + quads = normalizePublicRdfLiterals(quads, 'conditionalShare.quads'); for (const cond of options.conditions) { assertSafeIri(cond.subject); assertSafeIri(cond.predicate); if (cond.expectedValue !== null) { assertSafeRdfTerm(cond.expectedValue); + assertQuadLiteralsMutf8Safe([{ + subject: cond.subject, + predicate: cond.predicate, + object: cond.expectedValue, + }], { label: 'conditionalShare.conditions' }); } } @@ -1972,7 +1981,7 @@ export class DKGPublisher implements Publisher { }; } - const { + let { contextGraphId, quads, privateQuads = [], @@ -1996,7 +2005,7 @@ export class DKGPublisher implements Publisher { rejectUserAuthoredProtocolMetadata(quads); if (privateQuads.length > 0) rejectUserAuthoredProtocolMetadata(privateQuads); } - rejectOversizedRdfLiterals(quads, 'publish.quads'); + quads = normalizePublicRdfLiterals(quads, 'publish.quads'); if (privateQuads.length > 0) rejectOversizedRdfLiterals(privateQuads, 'publish.privateQuads'); const ctx: OperationContext = operationCtx ?? createOperationContext('publish'); const effectiveAccessPolicy = accessPolicy ?? (privateQuads.length > 0 ? 'ownerOnly' : 'public'); @@ -3452,7 +3461,7 @@ export class DKGPublisher implements Publisher { 'Publish a new KC instead, or remove and recreate the sub-graph.', ); } - const { contextGraphId, quads, privateQuads = [], operationCtx, onPhase } = options; + let { contextGraphId, quads, privateQuads = [], operationCtx, onPhase } = options; // Round 12 Bug 34: `update()` is a Bucket A public write entry // point (accepts user-authored quads) that Round 9 missed. Apply // the same reserved-namespace guard as `publish()` / `assertionWrite` @@ -3465,7 +3474,7 @@ export class DKGPublisher implements Publisher { rejectUserAuthoredProtocolMetadata(quads); if (privateQuads.length > 0) rejectUserAuthoredProtocolMetadata(privateQuads); } - rejectOversizedRdfLiterals(quads, 'update.quads'); + quads = normalizePublicRdfLiterals(quads, 'update.quads'); if (privateQuads.length > 0) rejectOversizedRdfLiterals(privateQuads, 'update.privateQuads'); const ctx: OperationContext = operationCtx ?? createOperationContext('publish'); let publisherContextGraphId: bigint | undefined; @@ -5354,8 +5363,8 @@ export class DKGPublisher implements Publisher { // Round 9 Bug 25: reject user-authored quads whose subject is in a // protocol-reserved URN namespace. See RESERVED_SUBJECT_PREFIXES above. rejectUserAuthoredProtocolMetadata(quads); - rejectOversizedRdfLiterals(quads, 'assertionWrite.quads'); - await this.store.insert(quads); + const normalizedQuads = normalizePublicRdfLiterals(quads, 'assertionWrite.quads'); + await this.store.insert(normalizedQuads); } async assertionQuery( diff --git a/packages/publisher/test/dkg-publisher-compat.test.ts b/packages/publisher/test/dkg-publisher-compat.test.ts index b3f4d6db4..1a26403c5 100644 --- a/packages/publisher/test/dkg-publisher-compat.test.ts +++ b/packages/publisher/test/dkg-publisher-compat.test.ts @@ -1,8 +1,11 @@ import { describe, expect, it } from 'vitest'; import { NoChainAdapter } from '@origintrail-official/dkg-chain'; import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, TypedEventBus, generateEd25519Keypair, + reconstructChunkedTextBodies, } from '@origintrail-official/dkg-core'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; import { DKGPublisher } from '../src/dkg-publisher.js'; @@ -16,18 +19,20 @@ function q(s: string, p: string, o: string): Quad { }; } -async function makePublisher(): Promise { - return new DKGPublisher({ - store: new OxigraphStore(), +async function makePublisher(): Promise<{ publisher: DKGPublisher; store: OxigraphStore }> { + const store = new OxigraphStore(); + const publisher = new DKGPublisher({ + store, chain: new NoChainAdapter(), eventBus: new TypedEventBus(), keypair: await generateEd25519Keypair(), }); + return { publisher, store }; } describe('DKGPublisher compatibility aliases', () => { it('keeps autoPartition as a deprecated alias for skolemizeByEntity', async () => { - const publisher = await makePublisher(); + const { publisher } = await makePublisher(); const quads = [ q('urn:compat:one', 'http://schema.org/name', '"One"'), q('urn:compat:two', 'http://schema.org/name', '"Two"'), @@ -36,22 +41,127 @@ describe('DKGPublisher compatibility aliases', () => { expect(publisher.autoPartition(quads)).toEqual(publisher.skolemizeByEntity(quads)); }); - it('rejects oversized RDF literals at shared-memory producer boundary', async () => { - const publisher = await makePublisher(); + it('chunks oversized schema:text literals at shared-memory producer boundary', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:oversized'; + + await publisher.share('test', [ + q(root, 'http://schema.org/text', oversized), + ], { publisherPeerId: 'test-peer', localOnly: true }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + + const reconstructed = reconstructChunkedTextBodies(result.quads, { subject: root }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0].lexical).toBe('x'.repeat(60_000)); + }); + + it('rejects oversized non-text RDF literals at shared-memory producer boundary', async () => { + const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; await expect( publisher.share('test', [ - q('urn:compat:oversized', 'http://schema.org/text', oversized), + q('urn:compat:oversized-name', 'http://schema.org/name', oversized), ], { publisherPeerId: 'test-peer' }), ).rejects.toMatchObject({ code: 'OVERSIZED_RDF_LITERAL', actualBytes: 60_002, + predicate: 'http://schema.org/name', + }); + }); + + it('keeps reserved-subject failures ahead of large-text chunking', async () => { + const { publisher } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + + await expect( + publisher.share('test', [ + q('urn:dkg:file:reserved', 'http://schema.org/text', oversized), + ], { publisherPeerId: 'test-peer' }), + ).rejects.toThrow(/reserved namespace/i); + }); + + it('rejects oversized conditional-write expected values instead of chunking conditions', async () => { + const { publisher } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + + await expect( + publisher.conditionalShare('test', [ + q('urn:compat:cas', 'http://schema.org/name', '"CAS"'), + ], { + publisherPeerId: 'test-peer', + conditions: [{ + subject: 'urn:compat:cas', + predicate: 'http://schema.org/text', + expectedValue: oversized, + }], + }), + ).rejects.toMatchObject({ + code: 'OVERSIZED_RDF_LITERAL', + predicate: 'http://schema.org/text', }); }); + it('chunks oversized schema:text literals during direct publish', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:publish-oversized'; + + await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/text', oversized)], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(reconstructChunkedTextBodies(result.quads, { subject: root })[0].lexical).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals during assertion write', async () => { + const { publisher } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:assertion-oversized'; + const agent = '0x1234567890abcdef1234567890abcdef12345678'; + + await publisher.assertionCreate('test', 'large-text', agent); + await publisher.assertionWrite('test', 'large-text', agent, [ + q(root, 'http://schema.org/text', oversized), + ]); + + const quads = await publisher.assertionQuery('test', 'large-text', agent); + expect(quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(reconstructChunkedTextBodies(quads, { subject: root })[0].lexical).toBe('x'.repeat(60_000)); + }); + it('rejects oversized private literals before publish canonicalization', async () => { - const publisher = await makePublisher(); + const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; await expect( diff --git a/packages/storage/test/blazegraph.unit.test.ts b/packages/storage/test/blazegraph.unit.test.ts index 066f4e609..16d56a484 100644 --- a/packages/storage/test/blazegraph.unit.test.ts +++ b/packages/storage/test/blazegraph.unit.test.ts @@ -3,6 +3,10 @@ * no live Blazegraph required). */ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { + DKG_CHUNK_VALUE, + normalizeLargeRdfLiteralsForBlazegraph, +} from '@origintrail-official/dkg-core'; import { BlazegraphStore } from '../src/adapters/blazegraph.js'; describe('BlazegraphStore (mocked HTTP)', () => { @@ -300,6 +304,27 @@ describe('BlazegraphStore (mocked HTTP)', () => { expect(fetchCalls).toHaveLength(0); }); + it('insert accepts producer-normalized chunked schema:text payloads', async () => { + setFetch(async () => new Response(null, { status: 200 })); + const s = new BlazegraphStore(baseUrl); + const original = 'x'.repeat(70_000); + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://s-chunked', + predicate: 'http://schema.org/text', + object: JSON.stringify(original), + graph: 'http://g', + }, + ]); + + await s.insert(normalized.quads); + + expect(fetchCalls).toHaveLength(1); + const body = String(fetchCalls[0][1]?.body); + expect(body).toContain(DKG_CHUNK_VALUE); + expect(body).not.toContain(original); + }); + it('insert allows 25KB ASCII literal (under MUTF-8 limit)', async () => { setFetch(async () => new Response(null, { status: 200 })); const s = new BlazegraphStore(baseUrl); diff --git a/packages/storage/test/sparql-http.test.ts b/packages/storage/test/sparql-http.test.ts index 918737e76..c21b890ec 100644 --- a/packages/storage/test/sparql-http.test.ts +++ b/packages/storage/test/sparql-http.test.ts @@ -1,5 +1,9 @@ import { createServer, type Server } from 'node:http'; import { describe, it, expect, beforeAll, afterAll, vi } from 'vitest'; +import { + DKG_CHUNK_VALUE, + normalizeLargeRdfLiteralsForBlazegraph, +} from '@origintrail-official/dkg-core'; import { SparqlHttpStore, createTripleStore, type Quad, type SparqlHttpSlowQueryEvent } from '../src/index.js'; let server: Server; @@ -115,6 +119,25 @@ describe('SparqlHttpStore (test server)', () => { expect(insertedQuads).toHaveLength(0); }); + it('accepts producer-normalized chunked schema:text payloads', async () => { + insertedQuads.length = 0; + const original = 'x'.repeat(70_000); + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://ex.org/chunked', + predicate: 'http://schema.org/text', + object: JSON.stringify(original), + graph: 'http://ex.org/g', + }, + ]); + + await store.insert(normalized.quads); + + expect(insertedQuads).toHaveLength(1); + expect(insertedQuads[0]).toContain(DKG_CHUNK_VALUE); + expect(insertedQuads[0]).not.toContain(original); + }); + it('query SELECT sends query to query endpoint and parses bindings', async () => { const result = await store.query( 'SELECT ?name WHERE { GRAPH { ?name } }', From f6374818015aa865c8431ceb0d324e1e8cc24628 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 19:54:09 +0200 Subject: [PATCH 02/12] Address large literal chunking review --- packages/agent/src/dkg-agent-publish.ts | 11 +++- .../agent/test/publish-literal-size.test.ts | 44 +++++++++++++ packages/cli/src/daemon/http-utils.ts | 25 +++++--- .../daemon/routes/knowledge-assets-import.ts | 6 +- .../cli/src/daemon/routes/knowledge-assets.ts | 14 ++--- packages/cli/src/daemon/routes/memory.ts | 24 +++++--- .../test/http-literal-size-validation.test.ts | 60 ++++++++++++++++++ packages/core/src/index.ts | 2 - packages/core/src/rdf-literal-size.ts | 8 +-- packages/publisher/src/dkg-publisher.ts | 11 +++- .../test/dkg-publisher-compat.test.ts | 61 +++++++++++++++++-- 11 files changed, 226 insertions(+), 40 deletions(-) diff --git a/packages/agent/src/dkg-agent-publish.ts b/packages/agent/src/dkg-agent-publish.ts index 111a30cf3..0c68faf40 100644 --- a/packages/agent/src/dkg-agent-publish.ts +++ b/packages/agent/src/dkg-agent-publish.ts @@ -428,7 +428,16 @@ function rejectOversizedRdfLiterals(quads: Quad[] | undefined, label: string): v } function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { - return normalizeLargeRdfLiteralsForBlazegraph(quads, { label }).quads as Quad[]; + return normalizeLargeRdfLiteralsForBlazegraph(quads, { label }).quads.map(toQuad); +} + +function toQuad(quad: { subject: string; predicate: string; object: string; graph?: string }): Quad { + return { + subject: quad.subject, + predicate: quad.predicate, + object: quad.object, + graph: quad.graph ?? '', + }; } export class PublishMethods extends DKGAgentBase { diff --git a/packages/agent/test/publish-literal-size.test.ts b/packages/agent/test/publish-literal-size.test.ts index 7a4368762..08a4dd51a 100644 --- a/packages/agent/test/publish-literal-size.test.ts +++ b/packages/agent/test/publish-literal-size.test.ts @@ -2,7 +2,9 @@ import { describe, expect, it, vi } from 'vitest'; import { DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, + normalizeLargeRdfLiteralsForBlazegraph, } from '@origintrail-official/dkg-core'; +import { canonicalPublishPayload } from '@origintrail-official/dkg-publisher'; import { PublishMethods } from '../src/dkg-agent-publish.js'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; @@ -100,4 +102,46 @@ describe('agent publish literal size validation', () => { predicate: 'http://schema.org/name', }); }); + + it('builds selection precomputed attestations over chunked public text quads', async () => { + const authorAddress = '0x000000000000000000000000000000000000dEaD'; + const agentStub = { + chain: { + getEvmChainId: vi.fn(async () => 31337n), + getKnowledgeAssetsLifecycleAddress: vi.fn(async () => '0x000000000000000000000000000000000000c0de'), + }, + getContextGraphOnChainId: vi.fn(async () => 42n), + isPrivateContextGraph: vi.fn(async () => false), + publisher: { + publisherFallbackAuthorAddress: vi.fn(async () => authorAddress), + signAuthorAttestationAsPublisher: vi.fn(async () => ({ + r: new Uint8Array(32).fill(1), + vs: new Uint8Array(32).fill(2), + })), + }, + kaNumberAllocator: { + reconcile: vi.fn(), + markReconciled: vi.fn(), + allocate: vi.fn(() => ({ number: 7n })), + }, + reconciledKaAuthors: new Set(), + }; + + const attestation = + await PublishMethods.prototype._buildPrecomputedAttestationForSelection.call( + agentStub as never, + 'computer-history', + [OVERSIZED_TEXT_QUAD], + ); + + const chunkedQuads = normalizeLargeRdfLiteralsForBlazegraph( + [OVERSIZED_TEXT_QUAD], + { label: 'test.expected' }, + ).quads.map((quad) => ({ ...quad, graph: quad.graph ?? '' })); + const expectedChunkedRoot = canonicalPublishPayload(chunkedQuads, []).kcMerkleRoot; + const unchunkedRoot = canonicalPublishPayload([OVERSIZED_TEXT_QUAD], []).kcMerkleRoot; + + expect(Array.from(attestation?.expectedMerkleRoot ?? [])).toEqual(Array.from(expectedChunkedRoot)); + expect(Array.from(attestation?.expectedMerkleRoot ?? [])).not.toEqual(Array.from(unchunkedRoot)); + }); }); diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index e09c30759..afa1f2a11 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -222,13 +222,18 @@ export function validateWritableQuadLiteralSizes( } } -export function normalizeWritableQuadLiterals( +export interface PreparedPublicWriteQuads { + readonly quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export function preparePublicWriteQuads( label: string, - quads: T[], -): { ok: true; quads: T[]; rewrites: RdfTextLiteralRewrite[] } | { ok: false; body: Record } { + quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, +): { ok: true; value: PreparedPublicWriteQuads } | { ok: false; body: Record } { try { const result = normalizeLargeRdfLiteralsForBlazegraph(quads, { label }); - return { ok: true, quads: result.quads as T[], rewrites: result.rewrites }; + return { ok: true, value: { quads: result.quads, rewrites: result.rewrites } }; } catch (err) { if (isOversizedRdfLiteralError(err)) { return { ok: false, body: oversizedRdfLiteralResponseBody(err) }; @@ -427,10 +432,16 @@ export function parsePublishRequestBody( } const quadObjectError = validateQuadObjectTerms("quads", quads); if (quadObjectError) return { ok: false, error: quadObjectError }; - const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + const normalizedQuads = preparePublicWriteQuads("quads", quads); if (!normalizedQuads.ok) { return { ok: false, error: String(normalizedQuads.body.error ?? 'Oversized RDF literal'), body: normalizedQuads.body }; } + const publishQuads = normalizedQuads.value.quads.map((quad) => ({ + subject: quad.subject, + predicate: quad.predicate, + object: quad.object, + graph: quad.graph ?? '', + })); if ( privateQuads !== undefined && @@ -521,9 +532,9 @@ export function parsePublishRequestBody( ok: true, value: { contextGraphId, - quads: normalizedQuads.quads, + quads: publishQuads, privateQuads, - ...(normalizedQuads.rewrites.length > 0 ? { literalRewrites: normalizedQuads.rewrites } : {}), + ...(normalizedQuads.value.rewrites.length > 0 ? { literalRewrites: normalizedQuads.value.rewrites } : {}), accessPolicy, allowedPeers, subGraphName: subGraphName as string | undefined, diff --git a/packages/cli/src/daemon/routes/knowledge-assets-import.ts b/packages/cli/src/daemon/routes/knowledge-assets-import.ts index f6090571c..82ed5edf6 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets-import.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets-import.ts @@ -43,7 +43,7 @@ import { normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, oversizedRdfLiteralResponseBody, - normalizeWritableQuadLiterals, + preparePublicWriteQuads, SMALL_BODY_BYTES, MAX_UPLOAD_BYTES, type ImportFileExtractionPayload, @@ -639,11 +639,11 @@ export async function handleKaSemanticEnrichmentWrite(ctx: RequestContext): Prom generationMethod, semanticQuads, }); - const normalizedQuads = normalizeWritableQuadLiterals("semanticQuads", [...semanticQuads, ...provenanceQuads]); + const normalizedQuads = preparePublicWriteQuads("semanticQuads", [...semanticQuads, ...provenanceQuads]); if (!normalizedQuads.ok) { return jsonResponse(res, 400, normalizedQuads.body); } - const quads = normalizedQuads.quads; + const quads = normalizedQuads.value.quads; const targetAssertionUri = contextGraphAssertionUri( artifact.contextGraphId, artifact.assertionAgentAddress, diff --git a/packages/cli/src/daemon/routes/knowledge-assets.ts b/packages/cli/src/daemon/routes/knowledge-assets.ts index 109875a13..d1df3832e 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets.ts @@ -41,7 +41,7 @@ import { respondIfReconcileUnavailable, respondIfChainRpcTransportError, sanitizeRpcMessage, - normalizeWritableQuadLiterals, + preparePublicWriteQuads, normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, isNoFundedPublisherWalletLike, @@ -828,9 +828,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< } const objErr = validateQuadObjectTerms("quads", quads); if (objErr) return jsonResponse(res, 400, { error: objErr }); - const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + const normalizedQuads = preparePublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - quadsToWrite = normalizedQuads.quads; + quadsToWrite = normalizedQuads.value.quads; } const shouldFinalize = hasQuads && finalize !== false; // #1116 D5: the create ROUTE stays a primitive — create+write+seal, with @@ -1149,7 +1149,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< // literal nor an absolute IRI before they reach (and crash) the parser. const wmObjErr = validateQuadObjectTerms("quads", parsed.quads); if (wmObjErr) return jsonResponse(res, 400, { error: wmObjErr }); - const normalizedQuads = normalizeWritableQuadLiterals("quads", parsed.quads); + const normalizedQuads = preparePublicWriteQuads("quads", parsed.quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); // A bare write to a name that was never created used to fall through to // the legacy `/assertion/{addr}/{name}` graph and produce a KA that is @@ -1176,9 +1176,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< ...writeAuthorLane, }); } - await agent.assertion.write(contextGraphId, name, normalizedQuads.quads, { subGraphName, ...writeAuthorLane }); - emitMemoryGraphChanged?.({ contextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: normalizedQuads.quads.length } }); - return jsonResponse(res, 200, { written: normalizedQuads.quads.length }); + await agent.assertion.write(contextGraphId, name, normalizedQuads.value.quads, { subGraphName, ...writeAuthorLane }); + emitMemoryGraphChanged?.({ contextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: normalizedQuads.value.quads.length } }); + return jsonResponse(res, 200, { written: normalizedQuads.value.quads.length }); } if (verb === "finalize") { const finalizeOptions = resolveFinalizeOptions(parsed, res, writePreflightCallerAgentAddress); diff --git a/packages/cli/src/daemon/routes/memory.ts b/packages/cli/src/daemon/routes/memory.ts index 3d8cdac2e..357185729 100644 --- a/packages/cli/src/daemon/routes/memory.ts +++ b/packages/cli/src/daemon/routes/memory.ts @@ -200,7 +200,7 @@ import { isPublishQuad, isWritableQuad, validateQuadObjectTerms, - normalizeWritableQuadLiterals, + preparePublicWriteQuads, oversizedRdfLiteralResponseBody, parsePublishRequestBody, jsonResponse, @@ -670,9 +670,12 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { }; }); - const normalizedQuads = normalizeWritableQuadLiterals("quads", normalized); + const normalizedQuads = preparePublicWriteQuads("quads", normalized); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - normalized = normalizedQuads.quads; + normalized = normalizedQuads.value.quads.map((quad) => ({ + ...quad, + graph: quad.graph ?? graph, + })); await agent.store.insert(normalized); return jsonResponse(res, 200, { ok: true, @@ -1662,9 +1665,9 @@ WHERE { const objErr = validateQuadObjectTerms("quads", quads); if (objErr) return jsonResponse(res, 400, { error: objErr }); } - const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + const normalizedQuads = preparePublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - quads = normalizedQuads.quads; + quads = normalizedQuads.value.quads; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2267,9 +2270,9 @@ WHERE { const objErr = validateQuadObjectTerms("quads", quads); if (objErr) return jsonResponse(res, 400, { error: objErr }); } - const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + const normalizedQuads = preparePublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - quads = normalizedQuads.quads; + quads = normalizedQuads.value.quads; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2465,9 +2468,12 @@ WHERE { }); } - const normalizedQuads = normalizeWritableQuadLiterals("quads", quads); + const normalizedQuads = preparePublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - const quadsToWrite = normalizedQuads.quads; + const quadsToWrite = normalizedQuads.value.quads.map((quad) => ({ + ...quad, + graph: quad.graph ?? targetGraph, + })); // 5. Write to target layer try { diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 9f13c8412..64218d3e9 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -2,10 +2,12 @@ import { describe, expect, it } from 'vitest'; import { DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, + normalizeLargeRdfLiteralsForBlazegraph, } from '@origintrail-official/dkg-core'; import { buildImportFileResponse, parsePublishRequestBody, + preparePublicWriteQuads, validateWritableQuadLiteralSizes, } from '../src/daemon/http-utils.js'; @@ -88,6 +90,64 @@ describe('HTTP RDF literal size validation', () => { } }); + it('keeps import-file chunk quads in their source data or metadata graph', () => { + const dataGraph = 'did:dkg:context-graph:test/_working_memory/0xabc/1'; + const metaGraph = 'did:dkg:context-graph:test/_meta'; + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/import-root', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: dataGraph, + }, + { + subject: 'http://example.org/import-meta', + predicate: 'http://schema.org/name', + object: '"metadata"', + graph: metaGraph, + }, + ], { label: 'import-file.quads' }); + + const dataQuads = normalized.quads.filter((quad) => quad.graph === dataGraph); + const metaQuads = normalized.quads.filter((quad) => quad.graph === metaGraph); + expect(dataQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(dataQuads.some((quad) => quad.predicate === 'http://schema.org/text')).toBe(false); + expect(metaQuads).toEqual([{ + subject: 'http://example.org/import-meta', + predicate: 'http://schema.org/name', + object: '"metadata"', + graph: metaGraph, + }]); + }); + + it('prepares semantic enrichment quads without moving provenance quads', () => { + const graph = 'did:dkg:context-graph:test/_working_memory/0xabc/semantic'; + const prepared = preparePublicWriteQuads('semanticQuads', [ + { + subject: 'http://example.org/semantic', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph, + }, + { + subject: 'http://example.org/provenance', + predicate: 'http://dkg.io/ontology/generatedBy', + object: '"semantic-enrichment"', + graph, + }, + ]); + + expect(prepared.ok).toBe(true); + if (prepared.ok) { + expect(prepared.value.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(prepared.value.quads.some((quad) => + quad.subject === 'http://example.org/provenance' && + quad.predicate === 'http://dkg.io/ontology/generatedBy' + )).toBe(true); + expect(prepared.value.rewrites).toHaveLength(1); + } + }); + it('preserves oversized literal fields in import-file extraction responses', () => { expect(buildImportFileResponse({ assertionUri: 'http://example.org/assertion', diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index c3bbe57c9..0cfb148f7 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -232,14 +232,12 @@ export { assertRdfLiteralMutf8Safe, assertQuadLiteralsMutf8Safe, normalizeLargeRdfLiteralsForBlazegraph, - reconstructChunkedTextBodies, type RdfLiteralSizeContext, type QuadLiteralLike, type ParsedRdfLiteralTerm, type RdfTextLiteralRewrite, type RdfLiteralNormalizationResult, type RdfLiteralNormalizationOptions, - type ReconstructedChunkedTextBody, } from './rdf-literal-size.js'; export { DKGError, diff --git a/packages/core/src/rdf-literal-size.ts b/packages/core/src/rdf-literal-size.ts index 05478a373..2bd036830 100644 --- a/packages/core/src/rdf-literal-size.ts +++ b/packages/core/src/rdf-literal-size.ts @@ -243,8 +243,8 @@ export function assertQuadLiteralsMutf8Safe( } } -export function normalizeLargeRdfLiteralsForBlazegraph( - quads: readonly T[], +export function normalizeLargeRdfLiteralsForBlazegraph( + quads: readonly QuadLiteralLike[], options: RdfLiteralNormalizationOptions = {}, ): RdfLiteralNormalizationResult { const maxBytes = options.maxBytes ?? DKG_RDF_LITERAL_SAFE_MUTF8_BYTES; @@ -252,7 +252,7 @@ export function normalizeLargeRdfLiteralsForBlazegraph + quad.subject === subject && + quad.predicate === DKG_HAS_TEXT_BODY + )?.object; + if (!bodySubject) throw new Error(`Missing text body for ${subject}`); + return quads + .filter((quad) => quad.subject === bodySubject && quad.predicate === DKG_HAS_TEXT_CHUNK) + .map((link) => { + const chunkQuads = quads.filter((quad) => quad.subject === link.object); + const indexTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_INDEX)?.object; + const valueTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_VALUE)?.object; + const index = Number(/^"(\d+)"/.exec(indexTerm ?? '')?.[1] ?? NaN); + if (!Number.isInteger(index) || !valueTerm) throw new Error(`Invalid chunk ${link.object}`); + return { index, value: JSON.parse(valueTerm) as string }; + }) + .sort((a, b) => a.index - b.index) + .map((chunk) => chunk.value) + .join(''); +} + async function makePublisher(): Promise<{ publisher: DKGPublisher; store: OxigraphStore }> { const store = new OxigraphStore(); const publisher = new DKGPublisher({ @@ -65,9 +87,7 @@ describe('DKGPublisher compatibility aliases', () => { )).toBe(true); expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); - const reconstructed = reconstructChunkedTextBodies(result.quads, { subject: root }); - expect(reconstructed).toHaveLength(1); - expect(reconstructed[0].lexical).toBe('x'.repeat(60_000)); + expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); it('rejects oversized non-text RDF literals at shared-memory producer boundary', async () => { @@ -138,7 +158,36 @@ describe('DKGPublisher compatibility aliases', () => { quad.subject === root && quad.predicate === 'http://schema.org/text' )).toBe(false); - expect(reconstructChunkedTextBodies(result.quads, { subject: root })[0].lexical).toBe('x'.repeat(60_000)); + expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals during update', async () => { + const { publisher, store } = await makePublisher(); + const root = 'urn:compat:update-oversized'; + const original = await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/name', '"Before"')], + skipContextGraphEnsure: true, + }); + + await publisher.update(original.kaId, { + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/text', `"${'x'.repeat(60_000)}"`)], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH ?g { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals during assertion write', async () => { @@ -157,7 +206,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.subject === root && quad.predicate === 'http://schema.org/text' )).toBe(false); - expect(reconstructChunkedTextBodies(quads, { subject: root })[0].lexical).toBe('x'.repeat(60_000)); + expect(reconstructPlainChunkedText(quads, root)).toBe('x'.repeat(60_000)); }); it('rejects oversized private literals before publish canonicalization', async () => { From 3e62231a155ea549cb6d798b1539e2a8a72e1ab0 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 20:00:32 +0200 Subject: [PATCH 03/12] Add import route chunking coverage --- .../daemon/routes/knowledge-assets-import.ts | 17 +++-- .../cli/test/import-artifact-routes.test.ts | 60 +++++++++++++++ .../import-file-integration.part-01.test.ts | 73 +++++++++++++++++++ .../test/import-file-orchestration.shared.ts | 16 +++- 4 files changed, 159 insertions(+), 7 deletions(-) diff --git a/packages/cli/src/daemon/routes/knowledge-assets-import.ts b/packages/cli/src/daemon/routes/knowledge-assets-import.ts index 82ed5edf6..ee89e6856 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets-import.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets-import.ts @@ -1849,12 +1849,17 @@ export async function handleKaImportFile(ctx: RequestContext, name: string): Pro const normalizedImportQuads = normalizeLargeRdfLiteralsForBlazegraph([...dataGraphQuads, ...metaQuads], { label: 'import-file.quads', }); - const normalizedGraphQuads = normalizedImportQuads.quads as Array<{ - subject: string; - predicate: string; - object: string; - graph: string; - }>; + const normalizedGraphQuads = normalizedImportQuads.quads.map((q) => { + if (q.graph === undefined) { + throw new Error('import-file.quads normalization produced a quad without graph'); + } + return { + subject: q.subject, + predicate: q.predicate, + object: q.object, + graph: q.graph, + }; + }); dataGraphQuads = normalizedGraphQuads.filter((q) => q.graph === assertionGraph); metaQuads.splice(0, metaQuads.length, ...normalizedGraphQuads.filter((q) => q.graph === metaGraph)); } catch (err: any) { diff --git a/packages/cli/test/import-artifact-routes.test.ts b/packages/cli/test/import-artifact-routes.test.ts index 6a86d8458..26ada8411 100644 --- a/packages/cli/test/import-artifact-routes.test.ts +++ b/packages/cli/test/import-artifact-routes.test.ts @@ -5,6 +5,10 @@ import { mkdtemp, rm } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, contextGraphAssertionUri, contextGraphMetaUri, contextGraphSharedMemoryUri, @@ -20,6 +24,27 @@ const PROV = 'http://www.w3.org/ns/prov#'; type Quad = { subject: string; predicate: string; object: string }; +function reconstructChunkedText(quads: readonly Quad[], subject: string): string { + const bodySubject = quads.find((quad) => + quad.subject === subject && + quad.predicate === DKG_HAS_TEXT_BODY + )?.object; + if (!bodySubject) throw new Error(`Missing chunked text body for ${subject}`); + return quads + .filter((quad) => quad.subject === bodySubject && quad.predicate === DKG_HAS_TEXT_CHUNK) + .map((link) => { + const chunkQuads = quads.filter((quad) => quad.subject === link.object); + const indexTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_INDEX)?.object; + const valueTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_VALUE)?.object; + const index = Number(/^"(\d+)"/.exec(indexTerm ?? '')?.[1] ?? NaN); + if (!Number.isInteger(index) || !valueTerm) throw new Error(`Invalid chunk ${link.object}`); + return { index, value: JSON.parse(valueTerm) as string }; + }) + .sort((a, b) => a.index - b.index) + .map((chunk) => chunk.value) + .join(''); +} + function sha256Hash(bytes: Buffer): string { return `sha256:${createHash('sha256').update(bytes).digest('hex')}`; } @@ -1890,6 +1915,41 @@ describe('import artifact daemon routes', () => { ])); }); + it('chunks oversized semantic enrichment schema:text before assertion write', async () => { + const entry = await fileStore.put(Buffer.from('# Imported\n'), 'text/markdown'); + const contextGraphId = 'cg-semantic-enrichment-large-text'; + const assertionName = 'imported-large-text'; + const assertionUri = contextGraphAssertionUri(contextGraphId, 'did:dkg:agent:test', assertionName); + const largeText = 'x'.repeat(60_000); + const { agent, writes } = makeAgent({ + contextGraphId, + assertionName, + assertionUri, + fileHash: entry.keccak256, + markdownHash: entry.keccak256, + markdownForm: `urn:dkg:file:${entry.keccak256}`, + }); + await startRoutes({ agent }); + + const result = await post('/api/knowledge-assets/semantic-enrichment/write', { + contextGraphId, + assertionUri, + semanticQuads: [ + { subject: 'urn:doc:semantic-large', predicate: 'http://schema.org/text', object: `"${largeText}"` }, + ], + }); + + expect(result.status).toBe(200); + expect(writes).toHaveLength(1); + const written = writes[0]!.quads; + expect(written.some((quad) => + quad.subject === 'urn:doc:semantic-large' && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(written.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(written, 'urn:doc:semantic-large')).toBe(largeText); + }); + it('rejects semantic enrichment of imported assertions owned by another agent', async () => { const entry = await fileStore.put(Buffer.from('# Imported\n'), 'text/markdown'); const contextGraphId = 'cg-semantic-enrichment-cross-agent'; diff --git a/packages/cli/test/import-file-integration.part-01.test.ts b/packages/cli/test/import-file-integration.part-01.test.ts index 45cd7bb30..a93fe376e 100644 --- a/packages/cli/test/import-file-integration.part-01.test.ts +++ b/packages/cli/test/import-file-integration.part-01.test.ts @@ -1,5 +1,32 @@ import { describe, it, expect, beforeEach, afterEach, mkdtemp, rm, readFile, tmpdir, join, existsSync, ExtractionPipelineRegistry, autoPartition, findReservedSubjectPrefix, FileStore, parseBoundary, extractFromMarkdown, contextGraphAssertionUri, contextGraphMetaUri, assertionLifecycleUri, ImportFileRouteError, makeMockAgent, getDataGraphQuads, BOUNDARY, CRLF, buildMultipart, type ExtractionPipeline, type ExtractionInput, type ConverterOutput, type ExtractionStatusRecord, type CapturedQuad, type MockAgent } from './import-file-test-helpers'; import { runImportFileOrchestration } from './import-file-orchestration.shared'; +import { + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, +} from '@origintrail-official/dkg-core'; + +function reconstructChunkedText(quads: readonly CapturedQuad[], subject: string): string { + const bodySubject = quads.find((quad) => + quad.subject === subject && + quad.predicate === DKG_HAS_TEXT_BODY + )?.object; + if (!bodySubject) throw new Error(`Missing chunked text body for ${subject}`); + return quads + .filter((quad) => quad.subject === bodySubject && quad.predicate === DKG_HAS_TEXT_CHUNK) + .map((link) => { + const chunkQuads = quads.filter((quad) => quad.subject === link.object); + const indexTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_INDEX)?.object; + const valueTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_VALUE)?.object; + const index = Number(/^"(\d+)"/.exec(indexTerm ?? '')?.[1] ?? NaN); + if (!Number.isInteger(index) || !valueTerm) throw new Error(`Invalid chunk ${link.object}`); + return { index, value: JSON.parse(valueTerm) as string }; + }) + .sort((a, b) => a.index - b.index) + .map((chunk) => chunk.value) + .join(''); +} describe('import-file orchestration — happy paths', () => { @@ -143,6 +170,52 @@ describe('import-file orchestration — happy paths', () => { expect(record.tripleCount).toBe(result.extraction.tripleCount); }); + it('text/markdown upload chunks oversized schema:text frontmatter literals in the assertion graph', async () => { + const largeText = 'x'.repeat(60_000); + const markdown = [ + '---', + 'id: large-literal-note', + `text: "${largeText}"`, + '---', + '', + '# Large Literal', + '', + ].join('\n'); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'large-literal-cg' }, + { kind: 'file', name: 'file', filename: 'large.md', contentType: 'text/markdown', content: Buffer.from(markdown, 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'large-literal', + }); + + expect(result.extraction.status).toBe('completed'); + const assertionGraph = contextGraphAssertionUri('large-literal-cg', agent.peerId, 'large-literal'); + const writtenTriples = getDataGraphQuads(agent, 'large-literal-cg', 'large-literal'); + expect(writtenTriples.some((quad) => + quad.subject === result.assertionUri && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(writtenTriples.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(writtenTriples, result.assertionUri)).toBe(largeText); + + const chunkQuads = agent.insertedQuads.filter((quad) => + quad.predicate === DKG_HAS_TEXT_BODY || + quad.predicate === DKG_HAS_TEXT_CHUNK || + quad.predicate === DKG_CHUNK_INDEX || + quad.predicate === DKG_CHUNK_VALUE + ); + expect(chunkQuads.length).toBeGreaterThan(0); + expect(chunkQuads.every((quad) => quad.graph === assertionGraph)).toBe(true); + expect(agent.insertedQuads.some((quad) => + quad.graph === contextGraphMetaUri('large-literal-cg') && + quad.predicate === DKG_CHUNK_VALUE + )).toBe(false); + }); + it('text/markdown upload uses filePart content type when contentType field is not provided', async () => { const body = buildMultipart([ diff --git a/packages/cli/test/import-file-orchestration.shared.ts b/packages/cli/test/import-file-orchestration.shared.ts index b4224e2ba..957131904 100644 --- a/packages/cli/test/import-file-orchestration.shared.ts +++ b/packages/cli/test/import-file-orchestration.shared.ts @@ -1,4 +1,5 @@ import { contextGraphAssertionUri, contextGraphMetaUri, assertionLifecycleUri, parseMultipart, findReservedSubjectPrefix, isSkolemizedUri, FileStore, ExtractionPipelineRegistry, extractFromMarkdown, randomUUID, buildImportFileResponse, normalizeDetectedContentType, ImportFileRouteError, type CapturedQuad, type MockAgent, type ImportFileResult, type ExtractionStatusRecord } from './import-file-test-helpers'; +import { normalizeLargeRdfLiteralsForBlazegraph } from '@origintrail-official/dkg-core'; import { inferContentTypeFromFilename } from '../src/daemon/manifest.js'; @@ -655,7 +656,20 @@ export async function runImportFileOrchestration(params: { metaCleanupSucceeded = true; await agent.store.dropGraph(assertionGraph); dataDropSucceeded = true; - await agent.store.insert([...dataGraphQuads, ...metaQuads]); + const normalizedImportQuads = normalizeLargeRdfLiteralsForBlazegraph([...dataGraphQuads, ...metaQuads], { + label: 'import-file.quads', + }).quads.map((q) => { + if (q.graph === undefined) { + throw new Error('import-file.quads normalization produced a quad without graph'); + } + return { + subject: q.subject, + predicate: q.predicate, + object: q.object, + graph: q.graph, + }; + }); + await agent.store.insert(normalizedImportQuads); } catch (writeErr: any) { const rollbackErrors: string[] = []; if (dataDropSucceeded && dataSnapshot.length > 0) { From 912279eec9eb12ae879235aa7b3dfedaa2aa8938 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 20:02:45 +0200 Subject: [PATCH 04/12] Keep chunk reconstruction test-only --- packages/core/src/rdf-literal-size.ts | 138 ------------------ packages/core/test/rdf-literal-size.test.ts | 152 +++++++++++++++++++- 2 files changed, 151 insertions(+), 139 deletions(-) diff --git a/packages/core/src/rdf-literal-size.ts b/packages/core/src/rdf-literal-size.ts index 2bd036830..c1b70d096 100644 --- a/packages/core/src/rdf-literal-size.ts +++ b/packages/core/src/rdf-literal-size.ts @@ -76,19 +76,6 @@ export interface RdfLiteralNormalizationOptions { readonly label?: string; } -export interface ReconstructedChunkedTextBody { - readonly subject: string; - readonly bodySubject: string; - readonly sourcePredicate: string; - readonly lexical: string; - readonly literalTerm: string; - readonly chunkCount: number; - readonly lexicalSha256: string; - readonly literalTermSha256: string; - readonly language?: string; - readonly datatype?: string; -} - export class OversizedRdfLiteralError extends DKGUserError { readonly code = OVERSIZED_RDF_LITERAL_ERROR_CODE; readonly actualBytes: number; @@ -319,88 +306,6 @@ export function normalizeLargeRdfLiteralsForBlazegraph( return { quads: normalized, rewrites }; } -export function reconstructChunkedTextBodies( - quads: readonly QuadLiteralLike[], - options: { subject?: string; bodySubject?: string; sourcePredicate?: string } = {}, -): ReconstructedChunkedTextBody[] { - const bySubject = indexQuadsBySubject(quads); - const bodyLinks = quads.filter((q) => - q.predicate === DKG_HAS_TEXT_BODY && - (!options.subject || q.subject === options.subject) && - (!options.bodySubject || q.object === options.bodySubject) - ); - const explicitBody = options.bodySubject && bodyLinks.length === 0 - ? [{ subject: findOwnerSubject(quads, options.bodySubject), object: options.bodySubject }] - : []; - const bodies = [...bodyLinks, ...explicitBody]; - const reconstructed: ReconstructedChunkedTextBody[] = []; - - for (const link of bodies) { - const bodySubject = link.object; - const bodyQuads = bySubject.get(bodySubject) ?? []; - const sourcePredicate = iriObject(bodyQuads, DKG_TEXT_SOURCE_PREDICATE); - if (!sourcePredicate) throw new Error(`Chunked text body ${bodySubject} is missing source predicate`); - if (options.sourcePredicate && sourcePredicate !== options.sourcePredicate) { - continue; - } - - const count = integerObject(bodyQuads, DKG_TEXT_CHUNK_COUNT); - if (count === undefined) throw new Error(`Chunked text body ${bodySubject} is missing chunk count`); - const lexicalSha256 = literalObject(bodyQuads, DKG_TEXT_CONTENT_SHA256); - if (!lexicalSha256) throw new Error(`Chunked text body ${bodySubject} is missing content hash`); - const literalTermSha256 = literalObject(bodyQuads, DKG_TEXT_LITERAL_TERM_SHA256); - if (!literalTermSha256) throw new Error(`Chunked text body ${bodySubject} is missing literal term hash`); - const language = literalObject(bodyQuads, DKG_TEXT_LANGUAGE); - const datatype = iriObject(bodyQuads, DKG_TEXT_DATATYPE); - const chunkSubjects = bodyQuads.filter((q) => q.predicate === DKG_HAS_TEXT_CHUNK).map((q) => q.object); - if (chunkSubjects.length !== count) { - throw new Error(`Chunked text body ${bodySubject} expected ${count} chunks but found ${chunkSubjects.length}`); - } - - const chunks = chunkSubjects.map((chunkSubject) => { - const chunkQuads = bySubject.get(chunkSubject) ?? []; - const index = integerObject(chunkQuads, DKG_CHUNK_INDEX); - if (index === undefined) throw new Error(`Chunk ${chunkSubject} is missing chunkIndex`); - const valueTerm = literalTermObject(chunkQuads, DKG_CHUNK_VALUE); - if (!valueTerm) throw new Error(`Chunk ${chunkSubject} is missing chunkValue`); - const parsed = parseRdfLiteralTerm(valueTerm); - if (!parsed) throw new Error(`Chunk ${chunkSubject} has invalid chunkValue`); - return { index, lexical: parsed.lexical }; - }).sort((a, b) => a.index - b.index); - - for (let i = 0; i < chunks.length; i++) { - if (chunks[i]!.index !== i) { - throw new Error(`Chunked text body ${bodySubject} has non-contiguous chunk index ${chunks[i]!.index}`); - } - } - - const lexical = chunks.map((chunk) => chunk.lexical).join(''); - const suffix = suffixFromMetadata(language, datatype); - const literalTerm = rdfLiteralTerm(lexical, suffix); - if (sha256Hex(lexical) !== lexicalSha256) { - throw new Error(`Chunked text body ${bodySubject} content hash mismatch`); - } - if (sha256Hex(literalTerm) !== literalTermSha256) { - throw new Error(`Chunked text body ${bodySubject} literal term hash mismatch`); - } - - reconstructed.push({ - subject: link.subject, - bodySubject, - sourcePredicate, - lexical, - literalTerm, - chunkCount: chunks.length, - lexicalSha256, - literalTermSha256, - ...(language !== undefined ? { language } : {}), - ...(datatype !== undefined ? { datatype } : {}), - }); - } - - return reconstructed; -} - function validateNormalizationLimits(maxBytes: number, chunkMaxBytes: number): void { if (!Number.isInteger(maxBytes) || maxBytes <= 0 || maxBytes > JAVA_WRITE_UTF_MAX_BYTES) { throw new Error(`Invalid maxBytes: ${maxBytes}`); @@ -596,46 +501,3 @@ function sha256Hex(value: string): string { function utf8ByteLength(value: string): number { return new TextEncoder().encode(value).length; } - -function indexQuadsBySubject(quads: readonly QuadLiteralLike[]): Map { - const map = new Map(); - for (const quad of quads) { - const list = map.get(quad.subject); - if (list) list.push(quad); - else map.set(quad.subject, [quad]); - } - return map; -} - -function findOwnerSubject(quads: readonly QuadLiteralLike[], bodySubject: string): string { - return quads.find((q) => q.predicate === DKG_HAS_TEXT_BODY && q.object === bodySubject)?.subject ?? ''; -} - -function literalTermObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { - const object = quads.find((q) => q.predicate === predicate)?.object; - return object?.startsWith('"') ? object : undefined; -} - -function literalObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { - const term = literalTermObject(quads, predicate); - if (!term) return undefined; - const parsed = parseRdfLiteralTerm(term); - return parsed?.lexical; -} - -function iriObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { - const object = quads.find((q) => q.predicate === predicate)?.object; - return object && !object.startsWith('"') ? object : undefined; -} - -function integerObject(quads: readonly QuadLiteralLike[], predicate: string): number | undefined { - const value = literalObject(quads, predicate); - if (value === undefined || !/^\d+$/.test(value)) return undefined; - return Number(value); -} - -function suffixFromMetadata(language?: string, datatype?: string): string { - if (language) return `@${language}`; - if (datatype) return `^^<${datatype}>`; - return ''; -} diff --git a/packages/core/test/rdf-literal-size.test.ts b/packages/core/test/rdf-literal-size.test.ts index d6b54628d..97947bf5b 100644 --- a/packages/core/test/rdf-literal-size.test.ts +++ b/packages/core/test/rdf-literal-size.test.ts @@ -1,11 +1,17 @@ +import { createHash } from 'node:crypto'; import { describe, expect, it } from 'vitest'; import { + DKG_CHUNK_INDEX, DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, DKG_HAS_TEXT_CHUNK, DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, + DKG_TEXT_CHUNK_COUNT, DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_DATATYPE, + DKG_TEXT_LANGUAGE, DKG_TEXT_LITERAL_TERM_SHA256, + DKG_TEXT_SOURCE_PREDICATE, OVERSIZED_RDF_LITERAL_ERROR_CODE, XSD_STRING_IRI, assertQuadLiteralsMutf8Safe, @@ -14,9 +20,153 @@ import { normalizeLargeRdfLiteralsForBlazegraph, parseRdfLiteralTerm, rdfLiteralTermMutf8ByteLength, - reconstructChunkedTextBodies, + type QuadLiteralLike, } from '../src/rdf-literal-size.js'; +interface ReconstructedChunkedTextBody { + readonly subject: string; + readonly bodySubject: string; + readonly sourcePredicate: string; + readonly lexical: string; + readonly literalTerm: string; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; + readonly language?: string; + readonly datatype?: string; +} + +function reconstructChunkedTextBodies( + quads: readonly QuadLiteralLike[], + options: { subject?: string; bodySubject?: string; sourcePredicate?: string } = {}, +): ReconstructedChunkedTextBody[] { + const bySubject = indexQuadsBySubject(quads); + const bodyLinks = quads.filter((q) => + q.predicate === DKG_HAS_TEXT_BODY && + (!options.subject || q.subject === options.subject) && + (!options.bodySubject || q.object === options.bodySubject) + ); + const explicitBody = options.bodySubject && bodyLinks.length === 0 + ? [{ subject: findOwnerSubject(quads, options.bodySubject), object: options.bodySubject }] + : []; + const bodies = [...bodyLinks, ...explicitBody]; + const reconstructed: ReconstructedChunkedTextBody[] = []; + + for (const link of bodies) { + const bodySubject = link.object; + const bodyQuads = bySubject.get(bodySubject) ?? []; + const sourcePredicate = iriObject(bodyQuads, DKG_TEXT_SOURCE_PREDICATE); + if (!sourcePredicate) throw new Error(`Chunked text body ${bodySubject} is missing source predicate`); + if (options.sourcePredicate && sourcePredicate !== options.sourcePredicate) continue; + + const count = integerObject(bodyQuads, DKG_TEXT_CHUNK_COUNT); + if (count === undefined) throw new Error(`Chunked text body ${bodySubject} is missing chunk count`); + const lexicalSha256 = literalObject(bodyQuads, DKG_TEXT_CONTENT_SHA256); + if (!lexicalSha256) throw new Error(`Chunked text body ${bodySubject} is missing content hash`); + const literalTermSha256 = literalObject(bodyQuads, DKG_TEXT_LITERAL_TERM_SHA256); + if (!literalTermSha256) throw new Error(`Chunked text body ${bodySubject} is missing literal term hash`); + const language = literalObject(bodyQuads, DKG_TEXT_LANGUAGE); + const datatype = iriObject(bodyQuads, DKG_TEXT_DATATYPE); + const chunkSubjects = bodyQuads.filter((q) => q.predicate === DKG_HAS_TEXT_CHUNK).map((q) => q.object); + if (chunkSubjects.length !== count) { + throw new Error(`Chunked text body ${bodySubject} expected ${count} chunks but found ${chunkSubjects.length}`); + } + + const chunks = chunkSubjects.map((chunkSubject) => { + const chunkQuads = bySubject.get(chunkSubject) ?? []; + const index = integerObject(chunkQuads, DKG_CHUNK_INDEX); + if (index === undefined) throw new Error(`Chunk ${chunkSubject} is missing chunkIndex`); + const valueTerm = literalTermObject(chunkQuads, DKG_CHUNK_VALUE); + if (!valueTerm) throw new Error(`Chunk ${chunkSubject} is missing chunkValue`); + const parsed = parseRdfLiteralTerm(valueTerm); + if (!parsed) throw new Error(`Chunk ${chunkSubject} has invalid chunkValue`); + return { index, lexical: parsed.lexical }; + }).sort((a, b) => a.index - b.index); + + for (let i = 0; i < chunks.length; i++) { + if (chunks[i]!.index !== i) { + throw new Error(`Chunked text body ${bodySubject} has non-contiguous chunk index ${chunks[i]!.index}`); + } + } + + const lexical = chunks.map((chunk) => chunk.lexical).join(''); + const suffix = suffixFromMetadata(language, datatype); + const literalTerm = rdfLiteralTerm(lexical, suffix); + if (sha256Hex(lexical) !== lexicalSha256) { + throw new Error(`Chunked text body ${bodySubject} content hash mismatch`); + } + if (sha256Hex(literalTerm) !== literalTermSha256) { + throw new Error(`Chunked text body ${bodySubject} literal term hash mismatch`); + } + + reconstructed.push({ + subject: link.subject, + bodySubject, + sourcePredicate, + lexical, + literalTerm, + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + ...(language !== undefined ? { language } : {}), + ...(datatype !== undefined ? { datatype } : {}), + }); + } + + return reconstructed; +} + +function indexQuadsBySubject(quads: readonly QuadLiteralLike[]): Map { + const map = new Map(); + for (const quad of quads) { + const list = map.get(quad.subject); + if (list) list.push(quad); + else map.set(quad.subject, [quad]); + } + return map; +} + +function findOwnerSubject(quads: readonly QuadLiteralLike[], bodySubject: string): string { + return quads.find((q) => q.predicate === DKG_HAS_TEXT_BODY && q.object === bodySubject)?.subject ?? ''; +} + +function literalTermObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object?.startsWith('"') ? object : undefined; +} + +function literalObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { + const term = literalTermObject(quads, predicate); + if (!term) return undefined; + const parsed = parseRdfLiteralTerm(term); + return parsed?.lexical; +} + +function iriObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object && !object.startsWith('"') ? object : undefined; +} + +function integerObject(quads: readonly QuadLiteralLike[], predicate: string): number | undefined { + const value = literalObject(quads, predicate); + if (value === undefined || !/^\d+$/.test(value)) return undefined; + return Number(value); +} + +function suffixFromMetadata(language?: string, datatype?: string): string { + if (language) return `@${language}`; + if (datatype) return `^^<${datatype}>`; + return ''; +} + +function rdfLiteralTerm(lexical: string, suffix = ''): string { + return `${JSON.stringify(lexical)}${suffix}`; +} + +function sha256Hex(value: string): string { + return createHash('sha256').update(value, 'utf8').digest('hex'); +} + describe('rdf literal Java MUTF-8 sizing', () => { it('counts Java Modified UTF-8 byte length by UTF-16 code unit', () => { expect(javaModifiedUtf8ByteLength('abc')).toBe(3); From 0fb5577c4ff5bc07d92a6f7fc3130ed2de2ac69a Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 20:33:42 +0200 Subject: [PATCH 05/12] Address large literal chunking review --- packages/agent/src/dkg-agent-publish.ts | 13 +- .../agent/test/publish-literal-size.test.ts | 67 +++- packages/cli/src/daemon/http-utils.ts | 45 ++- .../cli/src/daemon/routes/knowledge-assets.ts | 4 +- .../test/http-literal-size-validation.test.ts | 53 ++- .../cli/test/import-artifact-routes.test.ts | 100 ++++- .../cli/test/knowledge-assets-route.test.ts | 2 +- packages/core/src/index.ts | 18 +- packages/core/src/rdf-literal-size.ts | 360 ----------------- .../src/rdf-text-literal-normalization.ts | 369 ++++++++++++++++++ packages/core/test/rdf-literal-size.test.ts | 18 +- packages/publisher/src/dkg-publisher.ts | 14 +- packages/publisher/src/index.ts | 6 + .../src/public-write-normalization.ts | 44 +++ .../test/dkg-publisher-compat.test.ts | 129 ++++++ 15 files changed, 817 insertions(+), 425 deletions(-) create mode 100644 packages/core/src/rdf-text-literal-normalization.ts create mode 100644 packages/publisher/src/public-write-normalization.ts diff --git a/packages/agent/src/dkg-agent-publish.ts b/packages/agent/src/dkg-agent-publish.ts index 0c68faf40..d28923768 100644 --- a/packages/agent/src/dkg-agent-publish.ts +++ b/packages/agent/src/dkg-agent-publish.ts @@ -98,7 +98,6 @@ import { sharedMemoryReadBothFilter, partitionCatalogQuads, assertQuadLiteralsMutf8Safe, - normalizeLargeRdfLiteralsForBlazegraph, } from '@origintrail-official/dkg-core'; import { GraphManager, PrivateContentStore, createTripleStore, type TripleStore, type TripleStoreConfig, type Quad, type LargeLiteralStorageConfig } from '@origintrail-official/dkg-storage'; import { EVMChainAdapter, NoChainAdapter, enrichEvmError, buildKnowledgeAssetUal, type EVMAdapterConfig, type ChainAdapter, type CreateContextGraphParams, type CreateOnChainContextGraphParams, type CreateOnChainContextGraphResult, type TxResult, type V10PublishingConvictionAccountInfo } from '@origintrail-official/dkg-chain'; @@ -110,6 +109,7 @@ import { resolveWorkspaceAgentRecipients, computeTripleHashV10 as computeTripleHash, computeFlatKCRootV10 as computeFlatKCRoot, skolemizeByEntity, isReservedSubject, canonicalPublishPayload, + preparePublicWriteQuads, generatedPrivateCatalogTripleKeys, resolveLiftWorkspaceSlice, validateLiftPublishPayload, @@ -428,16 +428,7 @@ function rejectOversizedRdfLiterals(quads: Quad[] | undefined, label: string): v } function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { - return normalizeLargeRdfLiteralsForBlazegraph(quads, { label }).quads.map(toQuad); -} - -function toQuad(quad: { subject: string; predicate: string; object: string; graph?: string }): Quad { - return { - subject: quad.subject, - predicate: quad.predicate, - object: quad.object, - graph: quad.graph ?? '', - }; + return preparePublicWriteQuads(quads, { label }).quads; } export class PublishMethods extends DKGAgentBase { diff --git a/packages/agent/test/publish-literal-size.test.ts b/packages/agent/test/publish-literal-size.test.ts index 08a4dd51a..b74b38fc9 100644 --- a/packages/agent/test/publish-literal-size.test.ts +++ b/packages/agent/test/publish-literal-size.test.ts @@ -2,9 +2,8 @@ import { describe, expect, it, vi } from 'vitest'; import { DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, - normalizeLargeRdfLiteralsForBlazegraph, } from '@origintrail-official/dkg-core'; -import { canonicalPublishPayload } from '@origintrail-official/dkg-publisher'; +import { canonicalPublishPayload, preparePublicWriteQuads } from '@origintrail-official/dkg-publisher'; import { PublishMethods } from '../src/dkg-agent-publish.js'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; @@ -20,6 +19,21 @@ const OVERSIZED_NAME_QUAD: Quad = { predicate: 'http://schema.org/name', }; +const LINKED_BLANK_OVERSIZED_TEXT_QUADS: Quad[] = [ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:body', + graph: 'http://example.org/graph', + }, + { + subject: '_:body', + predicate: 'http://schema.org/text', + object: `"${'x'.repeat(60_000)}"`, + graph: 'http://example.org/graph', + }, +]; + describe('agent publish literal size validation', () => { it('rejects publishAsync private quads before workspace staging', async () => { const agentStub = { @@ -134,14 +148,59 @@ describe('agent publish literal size validation', () => { [OVERSIZED_TEXT_QUAD], ); - const chunkedQuads = normalizeLargeRdfLiteralsForBlazegraph( + const chunkedQuads = preparePublicWriteQuads( [OVERSIZED_TEXT_QUAD], { label: 'test.expected' }, - ).quads.map((quad) => ({ ...quad, graph: quad.graph ?? '' })); + ).quads; const expectedChunkedRoot = canonicalPublishPayload(chunkedQuads, []).kcMerkleRoot; const unchunkedRoot = canonicalPublishPayload([OVERSIZED_TEXT_QUAD], []).kcMerkleRoot; expect(Array.from(attestation?.expectedMerkleRoot ?? [])).toEqual(Array.from(expectedChunkedRoot)); expect(Array.from(attestation?.expectedMerkleRoot ?? [])).not.toEqual(Array.from(unchunkedRoot)); }); + + it('builds selection precomputed attestations over skolemized linked blank-node text chunks', async () => { + const authorAddress = '0x000000000000000000000000000000000000dEaD'; + const agentStub = { + chain: { + getEvmChainId: vi.fn(async () => 31337n), + getKnowledgeAssetsLifecycleAddress: vi.fn(async () => '0x000000000000000000000000000000000000c0de'), + }, + getContextGraphOnChainId: vi.fn(async () => 42n), + isPrivateContextGraph: vi.fn(async () => false), + publisher: { + publisherFallbackAuthorAddress: vi.fn(async () => authorAddress), + signAuthorAttestationAsPublisher: vi.fn(async () => ({ + r: new Uint8Array(32).fill(1), + vs: new Uint8Array(32).fill(2), + })), + }, + kaNumberAllocator: { + reconcile: vi.fn(), + markReconciled: vi.fn(), + allocate: vi.fn(() => ({ number: 7n })), + }, + reconciledKaAuthors: new Set(), + }; + + const attestation = + await PublishMethods.prototype._buildPrecomputedAttestationForSelection.call( + agentStub as never, + 'computer-history', + LINKED_BLANK_OVERSIZED_TEXT_QUADS, + ); + + const prepared = preparePublicWriteQuads(LINKED_BLANK_OVERSIZED_TEXT_QUADS, { label: 'test.expected' }).quads; + const child = 'http://example.org/root/.well-known/genid/body'; + expect(prepared.some((quad) => + quad.subject === child && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(prepared.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + const expectedChunkedRoot = canonicalPublishPayload(prepared, []).kcMerkleRoot; + expect(Array.from(attestation?.expectedMerkleRoot ?? [])).toEqual(Array.from(expectedChunkedRoot)); + }); }); diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index afa1f2a11..f5bff27c7 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -9,7 +9,6 @@ import { PayloadTooLargeError, assertQuadLiteralsMutf8Safe, isOversizedRdfLiteralError, - normalizeLargeRdfLiteralsForBlazegraph, type RdfTextLiteralRewrite, validateContextGraphId, validateSubGraphName, @@ -19,6 +18,7 @@ import { } from '@origintrail-official/dkg-core'; import { enrichEvmError, isChainRpcTransportError } from '@origintrail-official/dkg-chain'; import type { DKGAgent, ContextGraphWritePreflightProbe } from '@origintrail-official/dkg-agent'; +import { preparePublicWriteQuads as prepareCanonicalPublicWriteQuads } from '@origintrail-official/dkg-publisher'; import type { DkgConfig } from '../config.js'; import { enforceSignedRequestPostBody } from '../auth.js'; @@ -38,13 +38,17 @@ export interface PublishRequestBody { contextGraphId: string; quads: PublishQuad[]; privateQuads?: PublishQuad[]; - literalRewrites?: RdfTextLiteralRewrite[]; accessPolicy?: PublishAccessPolicy; allowedPeers?: string[]; subGraphName?: string; onChainContextGraphId?: string; } +export interface ParsedPublishRequest { + readonly body: PublishRequestBody; + readonly literalRewrites: RdfTextLiteralRewrite[]; +} + import type { CorsAllowlist } from './state.js'; export function isPayloadTooLargeError(err: unknown): err is PayloadTooLargeError { @@ -232,8 +236,13 @@ export function preparePublicWriteQuads( quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, ): { ok: true; value: PreparedPublicWriteQuads } | { ok: false; body: Record } { try { - const result = normalizeLargeRdfLiteralsForBlazegraph(quads, { label }); - return { ok: true, value: { quads: result.quads, rewrites: result.rewrites } }; + const result = prepareCanonicalPublicWriteQuads(quads, { label }); + const normalizedQuads = result.quads.map((quad) => ( + quad.graph + ? quad + : { subject: quad.subject, predicate: quad.predicate, object: quad.object } + )); + return { ok: true, value: { quads: normalizedQuads, rewrites: result.rewrites } }; } catch (err) { if (isOversizedRdfLiteralError(err)) { return { ok: false, body: oversizedRdfLiteralResponseBody(err) }; @@ -259,10 +268,14 @@ export function validateQuadObjectTerms( ): string | null { const badIndex = quads.findIndex((q) => { const object = q.object.trim(); - return !object.startsWith('"') && !isSafeIri(object); + return !object.startsWith('"') && !isSafeBlankNode(object) && !isSafeIri(object); }); if (badIndex === -1) return null; - return `Invalid "${label}[${badIndex}].object": RDF object must be a quoted literal term or absolute IRI`; + return `Invalid "${label}[${badIndex}].object": RDF object must be a quoted literal term, blank node, or absolute IRI`; +} + +function isSafeBlankNode(term: string): boolean { + return /^_:[A-Za-z][A-Za-z0-9_-]*$/.test(term); } /** @@ -396,7 +409,7 @@ export function respondIfChainRpcTransportError( export function parsePublishRequestBody( body: string, -): { ok: true; value: PublishRequestBody } | { ok: false; error: string; body?: Record } { +): { ok: true; value: ParsedPublishRequest } | { ok: false; error: string; body?: Record } { let parsed: unknown; try { parsed = JSON.parse(body); @@ -531,14 +544,16 @@ export function parsePublishRequestBody( return { ok: true, value: { - contextGraphId, - quads: publishQuads, - privateQuads, - ...(normalizedQuads.value.rewrites.length > 0 ? { literalRewrites: normalizedQuads.value.rewrites } : {}), - accessPolicy, - allowedPeers, - subGraphName: subGraphName as string | undefined, - onChainContextGraphId: normalizedOnChainContextGraphId, + body: { + contextGraphId, + quads: publishQuads, + privateQuads, + accessPolicy, + allowedPeers, + subGraphName: subGraphName as string | undefined, + onChainContextGraphId: normalizedOnChainContextGraphId, + }, + literalRewrites: normalizedQuads.value.rewrites, }, }; } diff --git a/packages/cli/src/daemon/routes/knowledge-assets.ts b/packages/cli/src/daemon/routes/knowledge-assets.ts index d1df3832e..a22dd88e7 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets.ts @@ -673,16 +673,16 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< const parsed = parsePublishRequestBody(rawBody); if (!parsed.ok) return jsonResponse(res, 400, parsed.body ?? { error: parsed.error }); const raw = JSON.parse(rawBody) as Record; + const { body: publishBody, literalRewrites } = parsed.value; const { contextGraphId, quads, privateQuads, - literalRewrites, accessPolicy, allowedPeers, subGraphName, onChainContextGraphId, - } = parsed.value; + } = publishBody; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 64218d3e9..bf8bfd437 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -33,15 +33,62 @@ describe('HTTP RDF literal size validation', () => { predicate: 'http://schema.org/text', originalMutf8Bytes: 60_002, }); - expect(parsed.value.quads.some((quad) => + expect(parsed.value.body.quads.some((quad) => quad.subject === 'http://example.org/s' && quad.predicate === 'http://schema.org/text' )).toBe(false); - expect(parsed.value.quads.some((quad) => + expect(parsed.value.body.quads.some((quad) => quad.subject === 'http://example.org/s' && quad.predicate === DKG_HAS_TEXT_BODY )).toBe(true); - expect(parsed.value.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(parsed.value.body.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + } + }); + + it('normalizes linked blank-node schema:text literals and derives rewrite metadata server-side', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + literalRewrites: [{ subject: 'client-supplied', predicate: 'ignored' }], + quads: [ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:body', + graph: 'http://example.org/g', + }, + { + subject: '_:body', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: 'http://example.org/g', + }, + ], + })); + + expect(parsed.ok).toBe(true); + if (parsed.ok) { + const child = 'http://example.org/root/.well-known/genid/body'; + expect(parsed.value.literalRewrites).toHaveLength(1); + expect(parsed.value.literalRewrites[0]).toMatchObject({ + subject: child, + predicate: 'http://schema.org/text', + originalMutf8Bytes: 60_002, + }); + expect(parsed.value.literalRewrites[0]?.subject).not.toBe('client-supplied'); + expect(parsed.value.body.quads.some((quad) => + quad.subject === 'http://example.org/root' && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(parsed.value.body.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(parsed.value.body.quads.some((quad) => + quad.subject === child && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(parsed.value.body.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); } }); diff --git a/packages/cli/test/import-artifact-routes.test.ts b/packages/cli/test/import-artifact-routes.test.ts index 26ada8411..1d330a0a2 100644 --- a/packages/cli/test/import-artifact-routes.test.ts +++ b/packages/cli/test/import-artifact-routes.test.ts @@ -22,7 +22,7 @@ import { handleKnowledgeAssetsRoutes } from '../src/daemon/routes/knowledge-asse const DKG = 'http://dkg.io/ontology/'; const PROV = 'http://www.w3.org/ns/prov#'; -type Quad = { subject: string; predicate: string; object: string }; +type Quad = { subject: string; predicate: string; object: string; graph?: string }; function reconstructChunkedText(quads: readonly Quad[], subject: string): string { const bodySubject = quads.find((quad) => @@ -136,6 +136,26 @@ describe('import artifact daemon routes', () => { return { status: res.status, body: await res.json() }; } + async function postMultipart( + path: string, + parts: Array< + | { name: string; value: string } + | { name: string; filename: string; contentType: string; value: string | Uint8Array } + >, + ) { + const form = new FormData(); + for (const part of parts) { + if ('filename' in part) { + const bytes = typeof part.value === 'string' ? part.value : new Uint8Array(part.value); + form.append(part.name, new Blob([bytes], { type: part.contentType }), part.filename); + } else { + form.append(part.name, part.value); + } + } + const res = await fetch(`${baseUrl}${path}`, { method: 'POST', body: form }); + return { status: res.status, body: await res.json() }; + } + async function get(path: string) { const res = await fetch(`${baseUrl}${path}`); return { status: res.status, body: await res.json() }; @@ -193,6 +213,7 @@ describe('import artifact daemon routes', () => { agentAddress?: string; subGraphName?: string; }> = []; + const insertedQuads: Quad[] = []; const queries: string[] = []; const queryQuads = args.queryQuads ?? [ { subject: 'urn:z', predicate: 'urn:p', object: 'urn:o' }, @@ -253,6 +274,14 @@ describe('import artifact daemon routes', () => { if (args.publisherDiscardError) throw args.publisherDiscardError; discards.push({ contextGraphId, name, agentAddress, subGraphName }); }, + async wmGraphUri( + contextGraphId: string, + agentAddress: string, + name: string, + subGraphName?: string, + ) { + return contextGraphAssertionUri(contextGraphId, agentAddress, name, subGraphName); + }, }, ...(args.onChainPolicy ? { @@ -277,8 +306,20 @@ describe('import artifact daemon routes', () => { async hasGraph() { return Boolean(args.targetGraphExists); }, + async insert(quads: Quad[]) { + insertedQuads.push(...quads); + }, + async deleteByPattern() { + return 0; + }, + async dropGraph() { + return undefined; + }, async query(sparql: string) { queries.push(sparql); + if (sparql.includes('CONSTRUCT')) { + return { type: 'quads', quads: [] }; + } if (sparql.includes('SELECT ?p ?o')) { return { type: 'bindings', bindings: [] }; } @@ -334,7 +375,7 @@ describe('import artifact daemon routes', () => { }, }, }; - return { agent, created, writes, discards, queries }; + return { agent, created, writes, discards, queries, insertedQuads }; } it('resolves and safely reads a completed Markdown import artifact by content hash', async () => { @@ -386,6 +427,61 @@ describe('import artifact daemon routes', () => { }); }); + it('wm/import-file chunks oversized schema:text through handleKaImportFile', async () => { + const contextGraphId = 'cg-import-file-large-text'; + const assertionName = 'large-text-import'; + const assertionUri = contextGraphAssertionUri(contextGraphId, 'did:dkg:agent:test', assertionName); + const { agent, insertedQuads } = makeAgent({ + contextGraphId, + assertionName, + assertionUri, + fileHash: `keccak256:${'0'.repeat(64)}`, + markdownHash: `keccak256:${'1'.repeat(64)}`, + markdownForm: `urn:dkg:file:keccak256:${'1'.repeat(64)}`, + }); + await startRoutes({ agent }); + + const largeText = 'x'.repeat(60_000); + const markdown = [ + '---', + 'id: route-large-text', + `text: "${largeText}"`, + '---', + '', + '# Large Text', + '', + ].join('\n'); + + const result = await postMultipart(`/api/knowledge-assets/${assertionName}/wm/import-file`, [ + { name: 'contextGraphId', value: contextGraphId }, + { name: 'file', filename: 'large.md', contentType: 'text/markdown', value: markdown }, + ]); + + expect(result.status).toBe(200); + expect(result.body.assertionUri).toBe(assertionUri); + const assertionGraph = contextGraphAssertionUri(contextGraphId, 'did:dkg:agent:test', assertionName); + const dataQuads = insertedQuads.filter((quad) => quad.graph === assertionGraph); + expect(dataQuads.some((quad) => + quad.subject === assertionUri && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(dataQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(dataQuads, assertionUri)).toBe(largeText); + + const chunkQuads = insertedQuads.filter((quad) => + quad.predicate === DKG_HAS_TEXT_BODY || + quad.predicate === DKG_HAS_TEXT_CHUNK || + quad.predicate === DKG_CHUNK_INDEX || + quad.predicate === DKG_CHUNK_VALUE + ); + expect(chunkQuads.length).toBeGreaterThan(0); + expect(chunkQuads.every((quad) => quad.graph === assertionGraph)).toBe(true); + expect(insertedQuads.some((quad) => + quad.graph === contextGraphMetaUri(contextGraphId) && + quad.predicate === DKG_CHUNK_VALUE + )).toBe(false); + }); + it('reads a generic source artifact as bounded base64 bytes', async () => { const entry = await fileStore.put(Buffer.from('source bytes'), 'text/markdown'); const contextGraphId = 'cg-generic-source-artifact'; diff --git a/packages/cli/test/knowledge-assets-route.test.ts b/packages/cli/test/knowledge-assets-route.test.ts index a2a84572f..edfdbda97 100644 --- a/packages/cli/test/knowledge-assets-route.test.ts +++ b/packages/cli/test/knowledge-assets-route.test.ts @@ -1089,7 +1089,7 @@ describe('/api/knowledge-assets routes (real daemon, real chain)', () => { quads: [{ subject: 'urn:s', predicate: 'urn:p', object: 'bare string', graph: '' }], }); expect(res.status).toBe(400); - expect(String(res.body.error)).toMatch(/RDF object must be a quoted literal term or absolute IRI/); + expect(String(res.body.error)).toMatch(/RDF object must be a quoted literal term, blank node, or absolute IRI/); }); it('rejects a malformed onChainContextGraphId with 400 before direct publish', async () => { diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 0cfb148f7..8fd987c8f 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -206,6 +206,15 @@ export { DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, OVERSIZED_RDF_LITERAL_ERROR_CODE, OversizedRdfLiteralError, + javaModifiedUtf8ByteLength, + rdfLiteralTermMutf8ByteLength, + isOversizedRdfLiteralError, + assertRdfLiteralMutf8Safe, + assertQuadLiteralsMutf8Safe, + type RdfLiteralSizeContext, + type QuadLiteralLike, +} from './rdf-literal-size.js'; +export { RDF_TYPE_IRI, XSD_INTEGER_IRI, XSD_STRING_IRI, @@ -225,20 +234,13 @@ export { DKG_TEXT_DATATYPE, DKG_CHUNK_INDEX, DKG_CHUNK_VALUE, - javaModifiedUtf8ByteLength, - rdfLiteralTermMutf8ByteLength, parseRdfLiteralTerm, - isOversizedRdfLiteralError, - assertRdfLiteralMutf8Safe, - assertQuadLiteralsMutf8Safe, normalizeLargeRdfLiteralsForBlazegraph, - type RdfLiteralSizeContext, - type QuadLiteralLike, type ParsedRdfLiteralTerm, type RdfTextLiteralRewrite, type RdfLiteralNormalizationResult, type RdfLiteralNormalizationOptions, -} from './rdf-literal-size.js'; +} from './rdf-text-literal-normalization.js'; export { DKGError, DKGUserError, diff --git a/packages/core/src/rdf-literal-size.ts b/packages/core/src/rdf-literal-size.ts index c1b70d096..c98a810cd 100644 --- a/packages/core/src/rdf-literal-size.ts +++ b/packages/core/src/rdf-literal-size.ts @@ -1,35 +1,9 @@ -import { createHash } from 'node:crypto'; import { DKGUserError } from './errors.js'; -import { isSafeIri } from './sparql-safe.js'; export const JAVA_WRITE_UTF_MAX_BYTES = 65_535; export const DKG_RDF_LITERAL_SAFE_MUTF8_BYTES = 60_000; export const OVERSIZED_RDF_LITERAL_ERROR_CODE = 'OVERSIZED_RDF_LITERAL'; -export const RDF_TYPE_IRI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; -export const XSD_INTEGER_IRI = 'http://www.w3.org/2001/XMLSchema#integer'; -export const XSD_STRING_IRI = 'http://www.w3.org/2001/XMLSchema#string'; -export const SCHEMA_TEXT_PREDICATES = [ - 'http://schema.org/text', - 'https://schema.org/text', -] as const; - -export const DKG_TEXT_BODY_CLASS = 'http://dkg.io/ontology/TextBody'; -export const DKG_TEXT_CHUNK_CLASS = 'http://dkg.io/ontology/TextChunk'; -export const DKG_HAS_TEXT_BODY = 'http://dkg.io/ontology/hasTextBody'; -export const DKG_HAS_TEXT_CHUNK = 'http://dkg.io/ontology/hasTextChunk'; -export const DKG_TEXT_SOURCE_PREDICATE = 'http://dkg.io/ontology/textSourcePredicate'; -export const DKG_TEXT_CONTENT_SHA256 = 'http://dkg.io/ontology/textContentSha256'; -export const DKG_TEXT_LITERAL_TERM_SHA256 = 'http://dkg.io/ontology/textLiteralTermSha256'; -export const DKG_TEXT_LITERAL_MUTF8_BYTES = 'http://dkg.io/ontology/textLiteralMutf8Bytes'; -export const DKG_TEXT_UTF8_BYTES = 'http://dkg.io/ontology/textUtf8Bytes'; -export const DKG_TEXT_CHUNK_COUNT = 'http://dkg.io/ontology/textChunkCount'; -export const DKG_TEXT_CHUNK_LIMIT = 'http://dkg.io/ontology/textChunkMutf8Limit'; -export const DKG_TEXT_LANGUAGE = 'http://dkg.io/ontology/textLanguage'; -export const DKG_TEXT_DATATYPE = 'http://dkg.io/ontology/textDatatype'; -export const DKG_CHUNK_INDEX = 'http://dkg.io/ontology/chunkIndex'; -export const DKG_CHUNK_VALUE = 'http://dkg.io/ontology/chunkValue'; - export interface RdfLiteralSizeContext { readonly label?: string; readonly subject?: string; @@ -45,37 +19,6 @@ export interface QuadLiteralLike { readonly graph?: string; } -export interface ParsedRdfLiteralTerm { - readonly lexical: string; - readonly suffix: string; - readonly language?: string; - readonly datatype?: string; -} - -export interface RdfTextLiteralRewrite { - readonly subject: string; - readonly predicate: string; - readonly graph?: string; - readonly bodySubject: string; - readonly originalMutf8Bytes: number; - readonly originalUtf8Bytes: number; - readonly chunkCount: number; - readonly lexicalSha256: string; - readonly literalTermSha256: string; -} - -export interface RdfLiteralNormalizationResult { - readonly quads: QuadLiteralLike[]; - readonly rewrites: RdfTextLiteralRewrite[]; -} - -export interface RdfLiteralNormalizationOptions { - readonly maxBytes?: number; - readonly chunkMaxBytes?: number; - readonly textPredicates?: Iterable; - readonly label?: string; -} - export class OversizedRdfLiteralError extends DKGUserError { readonly code = OVERSIZED_RDF_LITERAL_ERROR_CODE; readonly actualBytes: number; @@ -159,37 +102,6 @@ export function rdfLiteralTermMutf8ByteLength(term: string): number | undefined return javaModifiedUtf8ByteLength(term); } -export function parseRdfLiteralTerm(term: string): ParsedRdfLiteralTerm | null { - if (!term.startsWith('"')) return null; - let escaped = false; - for (let i = 1; i < term.length; i++) { - const ch = term[i]!; - if (escaped) { - escaped = false; - continue; - } - if (ch === '\\') { - escaped = true; - continue; - } - if (ch !== '"') continue; - - try { - const body = term.slice(1, i); - const suffix = term.slice(i + 1); - const metadata = parseLiteralSuffix(suffix); - return { - lexical: decodeRdfLiteralBody(body), - suffix, - ...metadata, - }; - } catch { - return null; - } - } - return null; -} - export function isOversizedRdfLiteralError(err: unknown): err is OversizedRdfLiteralError { if (err instanceof OversizedRdfLiteralError) return true; if (!err || typeof err !== 'object') return false; @@ -229,275 +141,3 @@ export function assertQuadLiteralsMutf8Safe( }); } } - -export function normalizeLargeRdfLiteralsForBlazegraph( - quads: readonly QuadLiteralLike[], - options: RdfLiteralNormalizationOptions = {}, -): RdfLiteralNormalizationResult { - const maxBytes = options.maxBytes ?? DKG_RDF_LITERAL_SAFE_MUTF8_BYTES; - const chunkMaxBytes = options.chunkMaxBytes ?? maxBytes; - validateNormalizationLimits(maxBytes, chunkMaxBytes); - - const textPredicates = new Set(options.textPredicates ?? SCHEMA_TEXT_PREDICATES); - const normalized: QuadLiteralLike[] = []; - const rewrites: RdfTextLiteralRewrite[] = []; - - for (let i = 0; i < quads.length; i++) { - const quad = quads[i]!; - const literalBytes = rdfLiteralTermMutf8ByteLength(quad.object); - if (literalBytes === undefined || literalBytes <= maxBytes) { - normalized.push({ ...quad }); - continue; - } - - if (!textPredicates.has(quad.predicate)) { - throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); - } - - const parsed = parseRdfLiteralTerm(quad.object); - if (!parsed || !isChunkableTextLiteral(parsed)) { - throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); - } - if (!isSafeIri(quad.subject)) { - throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); - } - - const canonicalLiteralTerm = rdfLiteralTerm(parsed.lexical, parsed.suffix); - const literalTermSha256 = sha256Hex(canonicalLiteralTerm); - const lexicalSha256 = sha256Hex(parsed.lexical); - const bodyIdentitySha256 = sha256Hex(`${quad.predicate}\u0000${canonicalLiteralTerm}`); - const bodySubject = `${quad.subject}/.well-known/genid/dkg-text-body-${bodyIdentitySha256}`; - if (!isSafeIri(bodySubject)) { - throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); - } - - const chunks = splitLexicalIntoSafeChunks(parsed.lexical, chunkMaxBytes); - const graph = quad.graph ?? ''; - const bodyQuads = buildTextBodyQuads({ - source: quad, - graph, - bodySubject, - parsed, - chunks, - maxBytes, - chunkMaxBytes, - originalMutf8Bytes: literalBytes, - lexicalSha256, - literalTermSha256, - }); - normalized.push(...bodyQuads); - rewrites.push({ - subject: quad.subject, - predicate: quad.predicate, - graph: quad.graph, - bodySubject, - originalMutf8Bytes: literalBytes, - originalUtf8Bytes: utf8ByteLength(parsed.lexical), - chunkCount: chunks.length, - lexicalSha256, - literalTermSha256, - }); - } - - assertQuadLiteralsMutf8Safe(normalized, { - label: options.label ? `${options.label}.normalized` : 'normalizedQuads', - maxBytes, - }); - return { quads: normalized, rewrites }; -} - -function validateNormalizationLimits(maxBytes: number, chunkMaxBytes: number): void { - if (!Number.isInteger(maxBytes) || maxBytes <= 0 || maxBytes > JAVA_WRITE_UTF_MAX_BYTES) { - throw new Error(`Invalid maxBytes: ${maxBytes}`); - } - if (!Number.isInteger(chunkMaxBytes) || chunkMaxBytes <= 0 || chunkMaxBytes > maxBytes) { - throw new Error(`Invalid chunkMaxBytes: ${chunkMaxBytes}`); - } -} - -function labelFor(label: string | undefined, index: number): string { - return label ? `${label}[${index}].object` : `quads[${index}].object`; -} - -function throwOversizedForQuad( - quad: QuadLiteralLike, - actualBytes: number, - maxBytes: number, - label: string, -): never { - throw new OversizedRdfLiteralError({ - actualBytes, - maxBytes, - label, - subject: quad.subject, - predicate: quad.predicate, - graph: quad.graph, - }); -} - -function buildTextBodyQuads(args: { - source: QuadLiteralLike; - graph: string; - bodySubject: string; - parsed: ParsedRdfLiteralTerm; - chunks: readonly string[]; - maxBytes: number; - chunkMaxBytes: number; - originalMutf8Bytes: number; - lexicalSha256: string; - literalTermSha256: string; -}): QuadLiteralLike[] { - const quads: QuadLiteralLike[] = [ - { subject: args.source.subject, predicate: DKG_HAS_TEXT_BODY, object: args.bodySubject, graph: args.graph }, - { subject: args.bodySubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_BODY_CLASS, graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_SOURCE_PREDICATE, object: args.source.predicate, graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_CONTENT_SHA256, object: rdfLiteralTerm(args.lexicalSha256), graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_TERM_SHA256, object: rdfLiteralTerm(args.literalTermSha256), graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_MUTF8_BYTES, object: xsdInteger(args.originalMutf8Bytes), graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_UTF8_BYTES, object: xsdInteger(utf8ByteLength(args.parsed.lexical)), graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_COUNT, object: xsdInteger(args.chunks.length), graph: args.graph }, - { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_LIMIT, object: xsdInteger(args.chunkMaxBytes), graph: args.graph }, - ]; - - if (args.parsed.language) { - quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_LANGUAGE, object: rdfLiteralTerm(args.parsed.language), graph: args.graph }); - } - if (args.parsed.datatype) { - quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_DATATYPE, object: args.parsed.datatype, graph: args.graph }); - } - - args.chunks.forEach((chunk, index) => { - const chunkSubject = `${args.bodySubject}/chunk-${index}`; - if (!isSafeIri(chunkSubject)) { - throw new OversizedRdfLiteralError({ - actualBytes: args.originalMutf8Bytes, - maxBytes: args.maxBytes, - subject: args.source.subject, - predicate: args.source.predicate, - graph: args.source.graph, - }); - } - quads.push( - { subject: args.bodySubject, predicate: DKG_HAS_TEXT_CHUNK, object: chunkSubject, graph: args.graph }, - { subject: chunkSubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_CHUNK_CLASS, graph: args.graph }, - { subject: chunkSubject, predicate: DKG_CHUNK_INDEX, object: xsdInteger(index), graph: args.graph }, - { subject: chunkSubject, predicate: DKG_CHUNK_VALUE, object: rdfLiteralTerm(chunk), graph: args.graph }, - ); - }); - - return quads; -} - -function isChunkableTextLiteral(parsed: ParsedRdfLiteralTerm): boolean { - return parsed.suffix === '' || - parsed.language !== undefined || - parsed.datatype === XSD_STRING_IRI; -} - -function splitLexicalIntoSafeChunks(lexical: string, maxBytes: number): string[] { - const chunks: string[] = []; - let current = ''; - let currentBytes = javaModifiedUtf8ByteLength('""'); - - for (const ch of lexical) { - const chBytes = serializedLiteralBodyMutf8ByteLength(ch); - if (current.length > 0 && currentBytes + chBytes > maxBytes) { - chunks.push(current); - current = ch; - currentBytes = javaModifiedUtf8ByteLength('""') + chBytes; - } else { - current += ch; - currentBytes += chBytes; - } - - if (currentBytes > maxBytes) { - throw new Error('A single literal character exceeds the MUTF-8 chunk budget'); - } - } - - if (current.length > 0 || lexical.length === 0) chunks.push(current); - return chunks; -} - -function serializedLiteralBodyMutf8ByteLength(value: string): number { - const serialized = JSON.stringify(value); - return javaModifiedUtf8ByteLength(serialized.slice(1, -1)); -} - -function parseLiteralSuffix(suffix: string): { language?: string; datatype?: string } { - if (suffix === '') return {}; - const language = /^@([A-Za-z]+(?:-[A-Za-z0-9]+)*)$/.exec(suffix); - if (language) return { language: language[1] }; - const datatype = /^\^\^<([^<>"{}|\\^`\x00-\x20>]+)>$/.exec(suffix); - if (datatype) return { datatype: datatype[1] }; - throw new Error(`Invalid RDF literal suffix: ${suffix.slice(0, 80)}`); -} - -function decodeRdfLiteralBody(body: string): string { - let out = ''; - for (let i = 0; i < body.length; i++) { - const ch = body[i]!; - if (ch !== '\\') { - out += ch; - continue; - } - i += 1; - if (i >= body.length) throw new Error('Invalid trailing RDF literal escape'); - const escaped = body[i]!; - switch (escaped) { - case 't': - out += '\t'; - break; - case 'b': - out += '\b'; - break; - case 'n': - out += '\n'; - break; - case 'r': - out += '\r'; - break; - case 'f': - out += '\f'; - break; - case '"': - case "'": - case '\\': - out += escaped; - break; - case 'u': { - const hex = body.slice(i + 1, i + 5); - if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error('Invalid RDF \\u escape'); - out += String.fromCharCode(parseInt(hex, 16)); - i += 4; - break; - } - case 'U': { - const hex = body.slice(i + 1, i + 9); - if (!/^[0-9a-fA-F]{8}$/.test(hex)) throw new Error('Invalid RDF \\U escape'); - out += String.fromCodePoint(parseInt(hex, 16)); - i += 8; - break; - } - default: - throw new Error(`Invalid RDF literal escape: \\${escaped}`); - } - } - return out; -} - -function rdfLiteralTerm(lexical: string, suffix = ''): string { - return `${JSON.stringify(lexical)}${suffix}`; -} - -function xsdInteger(value: number): string { - return `"${value}"^^<${XSD_INTEGER_IRI}>`; -} - -function sha256Hex(value: string): string { - return createHash('sha256').update(value, 'utf8').digest('hex'); -} - -function utf8ByteLength(value: string): number { - return new TextEncoder().encode(value).length; -} diff --git a/packages/core/src/rdf-text-literal-normalization.ts b/packages/core/src/rdf-text-literal-normalization.ts new file mode 100644 index 000000000..183e3781c --- /dev/null +++ b/packages/core/src/rdf-text-literal-normalization.ts @@ -0,0 +1,369 @@ +import { createHash } from 'node:crypto'; +import { + DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, + JAVA_WRITE_UTF_MAX_BYTES, + OversizedRdfLiteralError, + assertQuadLiteralsMutf8Safe, + javaModifiedUtf8ByteLength, + rdfLiteralTermMutf8ByteLength, + type QuadLiteralLike, +} from './rdf-literal-size.js'; +import { isSafeIri } from './sparql-safe.js'; + +export const RDF_TYPE_IRI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; +export const XSD_INTEGER_IRI = 'http://www.w3.org/2001/XMLSchema#integer'; +export const XSD_STRING_IRI = 'http://www.w3.org/2001/XMLSchema#string'; +export const SCHEMA_TEXT_PREDICATES = [ + 'http://schema.org/text', + 'https://schema.org/text', +] as const; + +export const DKG_TEXT_BODY_CLASS = 'http://dkg.io/ontology/TextBody'; +export const DKG_TEXT_CHUNK_CLASS = 'http://dkg.io/ontology/TextChunk'; +export const DKG_HAS_TEXT_BODY = 'http://dkg.io/ontology/hasTextBody'; +export const DKG_HAS_TEXT_CHUNK = 'http://dkg.io/ontology/hasTextChunk'; +export const DKG_TEXT_SOURCE_PREDICATE = 'http://dkg.io/ontology/textSourcePredicate'; +export const DKG_TEXT_CONTENT_SHA256 = 'http://dkg.io/ontology/textContentSha256'; +export const DKG_TEXT_LITERAL_TERM_SHA256 = 'http://dkg.io/ontology/textLiteralTermSha256'; +export const DKG_TEXT_LITERAL_MUTF8_BYTES = 'http://dkg.io/ontology/textLiteralMutf8Bytes'; +export const DKG_TEXT_UTF8_BYTES = 'http://dkg.io/ontology/textUtf8Bytes'; +export const DKG_TEXT_CHUNK_COUNT = 'http://dkg.io/ontology/textChunkCount'; +export const DKG_TEXT_CHUNK_LIMIT = 'http://dkg.io/ontology/textChunkMutf8Limit'; +export const DKG_TEXT_LANGUAGE = 'http://dkg.io/ontology/textLanguage'; +export const DKG_TEXT_DATATYPE = 'http://dkg.io/ontology/textDatatype'; +export const DKG_CHUNK_INDEX = 'http://dkg.io/ontology/chunkIndex'; +export const DKG_CHUNK_VALUE = 'http://dkg.io/ontology/chunkValue'; + +export interface ParsedRdfLiteralTerm { + readonly lexical: string; + readonly suffix: string; + readonly language?: string; + readonly datatype?: string; +} + +export interface RdfTextLiteralRewrite { + readonly subject: string; + readonly predicate: string; + readonly graph?: string; + readonly bodySubject: string; + readonly originalMutf8Bytes: number; + readonly originalUtf8Bytes: number; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; +} + +export interface RdfLiteralNormalizationResult { + readonly quads: QuadLiteralLike[]; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export interface RdfLiteralNormalizationOptions { + readonly maxBytes?: number; + readonly chunkMaxBytes?: number; + readonly textPredicates?: Iterable; + readonly label?: string; +} + +export function parseRdfLiteralTerm(term: string): ParsedRdfLiteralTerm | null { + if (!term.startsWith('"')) return null; + let escaped = false; + for (let i = 1; i < term.length; i++) { + const ch = term[i]!; + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (ch !== '"') continue; + + try { + const body = term.slice(1, i); + const suffix = term.slice(i + 1); + const metadata = parseLiteralSuffix(suffix); + return { + lexical: decodeRdfLiteralBody(body), + suffix, + ...metadata, + }; + } catch { + return null; + } + } + return null; +} + +export function normalizeLargeRdfLiteralsForBlazegraph( + quads: readonly QuadLiteralLike[], + options: RdfLiteralNormalizationOptions = {}, +): RdfLiteralNormalizationResult { + const maxBytes = options.maxBytes ?? DKG_RDF_LITERAL_SAFE_MUTF8_BYTES; + const chunkMaxBytes = options.chunkMaxBytes ?? maxBytes; + validateNormalizationLimits(maxBytes, chunkMaxBytes); + + const textPredicates = new Set(options.textPredicates ?? SCHEMA_TEXT_PREDICATES); + const normalized: QuadLiteralLike[] = []; + const rewrites: RdfTextLiteralRewrite[] = []; + + for (let i = 0; i < quads.length; i++) { + const quad = quads[i]!; + const literalBytes = rdfLiteralTermMutf8ByteLength(quad.object); + if (literalBytes === undefined || literalBytes <= maxBytes) { + normalized.push({ ...quad }); + continue; + } + + if (!textPredicates.has(quad.predicate)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const parsed = parseRdfLiteralTerm(quad.object); + if (!parsed || !isChunkableTextLiteral(parsed)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + if (!isSafeIri(quad.subject)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const canonicalLiteralTerm = rdfLiteralTerm(parsed.lexical, parsed.suffix); + const literalTermSha256 = sha256Hex(canonicalLiteralTerm); + const lexicalSha256 = sha256Hex(parsed.lexical); + const bodyIdentitySha256 = sha256Hex(`${quad.predicate}\u0000${canonicalLiteralTerm}`); + const bodySubject = `${quad.subject}/.well-known/genid/dkg-text-body-${bodyIdentitySha256}`; + if (!isSafeIri(bodySubject)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const chunks = splitLexicalIntoSafeChunks(parsed.lexical, chunkMaxBytes); + const graph = quad.graph ?? ''; + const bodyQuads = buildTextBodyQuads({ + source: quad, + graph, + bodySubject, + parsed, + chunks, + maxBytes, + chunkMaxBytes, + originalMutf8Bytes: literalBytes, + lexicalSha256, + literalTermSha256, + }); + normalized.push(...bodyQuads); + rewrites.push({ + subject: quad.subject, + predicate: quad.predicate, + graph: quad.graph, + bodySubject, + originalMutf8Bytes: literalBytes, + originalUtf8Bytes: utf8ByteLength(parsed.lexical), + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + }); + } + + assertQuadLiteralsMutf8Safe(normalized, { + label: options.label ? `${options.label}.normalized` : 'normalizedQuads', + maxBytes, + }); + return { quads: normalized, rewrites }; +} + +function validateNormalizationLimits(maxBytes: number, chunkMaxBytes: number): void { + if (!Number.isInteger(maxBytes) || maxBytes <= 0 || maxBytes > JAVA_WRITE_UTF_MAX_BYTES) { + throw new Error(`Invalid maxBytes: ${maxBytes}`); + } + if (!Number.isInteger(chunkMaxBytes) || chunkMaxBytes <= 0 || chunkMaxBytes > maxBytes) { + throw new Error(`Invalid chunkMaxBytes: ${chunkMaxBytes}`); + } +} + +function labelFor(label: string | undefined, index: number): string { + return label ? `${label}[${index}].object` : `quads[${index}].object`; +} + +function throwOversizedForQuad( + quad: QuadLiteralLike, + actualBytes: number, + maxBytes: number, + label: string, +): never { + throw new OversizedRdfLiteralError({ + actualBytes, + maxBytes, + label, + subject: quad.subject, + predicate: quad.predicate, + graph: quad.graph, + }); +} + +function buildTextBodyQuads(args: { + source: QuadLiteralLike; + graph: string; + bodySubject: string; + parsed: ParsedRdfLiteralTerm; + chunks: readonly string[]; + maxBytes: number; + chunkMaxBytes: number; + originalMutf8Bytes: number; + lexicalSha256: string; + literalTermSha256: string; +}): QuadLiteralLike[] { + const quads: QuadLiteralLike[] = [ + { subject: args.source.subject, predicate: DKG_HAS_TEXT_BODY, object: args.bodySubject, graph: args.graph }, + { subject: args.bodySubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_BODY_CLASS, graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_SOURCE_PREDICATE, object: args.source.predicate, graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CONTENT_SHA256, object: rdfLiteralTerm(args.lexicalSha256), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_TERM_SHA256, object: rdfLiteralTerm(args.literalTermSha256), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_MUTF8_BYTES, object: xsdInteger(args.originalMutf8Bytes), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_UTF8_BYTES, object: xsdInteger(utf8ByteLength(args.parsed.lexical)), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_COUNT, object: xsdInteger(args.chunks.length), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_LIMIT, object: xsdInteger(args.chunkMaxBytes), graph: args.graph }, + ]; + + if (args.parsed.language) { + quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_LANGUAGE, object: rdfLiteralTerm(args.parsed.language), graph: args.graph }); + } + if (args.parsed.datatype) { + quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_DATATYPE, object: args.parsed.datatype, graph: args.graph }); + } + + args.chunks.forEach((chunk, index) => { + const chunkSubject = `${args.bodySubject}/chunk-${index}`; + if (!isSafeIri(chunkSubject)) { + throw new OversizedRdfLiteralError({ + actualBytes: args.originalMutf8Bytes, + maxBytes: args.maxBytes, + subject: args.source.subject, + predicate: args.source.predicate, + graph: args.source.graph, + }); + } + quads.push( + { subject: args.bodySubject, predicate: DKG_HAS_TEXT_CHUNK, object: chunkSubject, graph: args.graph }, + { subject: chunkSubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_CHUNK_CLASS, graph: args.graph }, + { subject: chunkSubject, predicate: DKG_CHUNK_INDEX, object: xsdInteger(index), graph: args.graph }, + { subject: chunkSubject, predicate: DKG_CHUNK_VALUE, object: rdfLiteralTerm(chunk), graph: args.graph }, + ); + }); + + return quads; +} + +function isChunkableTextLiteral(parsed: ParsedRdfLiteralTerm): boolean { + return parsed.suffix === '' || + parsed.language !== undefined || + parsed.datatype === XSD_STRING_IRI; +} + +function splitLexicalIntoSafeChunks(lexical: string, maxBytes: number): string[] { + const chunks: string[] = []; + let current = ''; + let currentBytes = javaModifiedUtf8ByteLength('""'); + + for (const ch of lexical) { + const chBytes = serializedLiteralBodyMutf8ByteLength(ch); + if (current.length > 0 && currentBytes + chBytes > maxBytes) { + chunks.push(current); + current = ch; + currentBytes = javaModifiedUtf8ByteLength('""') + chBytes; + } else { + current += ch; + currentBytes += chBytes; + } + + if (currentBytes > maxBytes) { + throw new Error('A single literal character exceeds the MUTF-8 chunk budget'); + } + } + + if (current.length > 0 || lexical.length === 0) chunks.push(current); + return chunks; +} + +function serializedLiteralBodyMutf8ByteLength(value: string): number { + const serialized = JSON.stringify(value); + return javaModifiedUtf8ByteLength(serialized.slice(1, -1)); +} + +function parseLiteralSuffix(suffix: string): { language?: string; datatype?: string } { + if (suffix === '') return {}; + const language = /^@([A-Za-z]+(?:-[A-Za-z0-9]+)*)$/.exec(suffix); + if (language) return { language: language[1] }; + const datatype = /^\^\^<([^<>"{}|\\^`\x00-\x20>]+)>$/.exec(suffix); + if (datatype) return { datatype: datatype[1] }; + throw new Error(`Invalid RDF literal suffix: ${suffix.slice(0, 80)}`); +} + +function decodeRdfLiteralBody(body: string): string { + let out = ''; + for (let i = 0; i < body.length; i++) { + const ch = body[i]!; + if (ch !== '\\') { + out += ch; + continue; + } + i += 1; + if (i >= body.length) throw new Error('Invalid trailing RDF literal escape'); + const escaped = body[i]!; + switch (escaped) { + case 't': + out += '\t'; + break; + case 'b': + out += '\b'; + break; + case 'n': + out += '\n'; + break; + case 'r': + out += '\r'; + break; + case 'f': + out += '\f'; + break; + case '"': + case "'": + case '\\': + out += escaped; + break; + case 'u': { + const hex = body.slice(i + 1, i + 5); + if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error('Invalid RDF \\u escape'); + out += String.fromCharCode(parseInt(hex, 16)); + i += 4; + break; + } + case 'U': { + const hex = body.slice(i + 1, i + 9); + if (!/^[0-9a-fA-F]{8}$/.test(hex)) throw new Error('Invalid RDF \\U escape'); + out += String.fromCodePoint(parseInt(hex, 16)); + i += 8; + break; + } + default: + throw new Error(`Invalid RDF literal escape: \\${escaped}`); + } + } + return out; +} + +function rdfLiteralTerm(lexical: string, suffix = ''): string { + return `${JSON.stringify(lexical)}${suffix}`; +} + +function xsdInteger(value: number): string { + return `"${value}"^^<${XSD_INTEGER_IRI}>`; +} + +function sha256Hex(value: string): string { + return createHash('sha256').update(value, 'utf8').digest('hex'); +} + +function utf8ByteLength(value: string): number { + return new TextEncoder().encode(value).length; +} diff --git a/packages/core/test/rdf-literal-size.test.ts b/packages/core/test/rdf-literal-size.test.ts index 97947bf5b..6193dd2f9 100644 --- a/packages/core/test/rdf-literal-size.test.ts +++ b/packages/core/test/rdf-literal-size.test.ts @@ -1,27 +1,29 @@ import { createHash } from 'node:crypto'; import { describe, expect, it } from 'vitest'; +import { + DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, + OVERSIZED_RDF_LITERAL_ERROR_CODE, + assertQuadLiteralsMutf8Safe, + assertRdfLiteralMutf8Safe, + javaModifiedUtf8ByteLength, + rdfLiteralTermMutf8ByteLength, + type QuadLiteralLike, +} from '../src/rdf-literal-size.js'; import { DKG_CHUNK_INDEX, DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, DKG_HAS_TEXT_CHUNK, - DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, DKG_TEXT_CHUNK_COUNT, DKG_TEXT_CONTENT_SHA256, DKG_TEXT_DATATYPE, DKG_TEXT_LANGUAGE, DKG_TEXT_LITERAL_TERM_SHA256, DKG_TEXT_SOURCE_PREDICATE, - OVERSIZED_RDF_LITERAL_ERROR_CODE, XSD_STRING_IRI, - assertQuadLiteralsMutf8Safe, - assertRdfLiteralMutf8Safe, - javaModifiedUtf8ByteLength, normalizeLargeRdfLiteralsForBlazegraph, parseRdfLiteralTerm, - rdfLiteralTermMutf8ByteLength, - type QuadLiteralLike, -} from '../src/rdf-literal-size.js'; +} from '../src/rdf-text-literal-normalization.js'; interface ReconstructedChunkedTextBody { readonly subject: string; diff --git a/packages/publisher/src/dkg-publisher.ts b/packages/publisher/src/dkg-publisher.ts index 412c55858..202d833a3 100644 --- a/packages/publisher/src/dkg-publisher.ts +++ b/packages/publisher/src/dkg-publisher.ts @@ -2,7 +2,7 @@ import type { Quad, TripleStore } from '@origintrail-official/dkg-storage'; import type { ChainAdapter, OnChainPublishResult, AddBatchToContextGraphParams } from '@origintrail-official/dkg-chain'; import { enrichEvmError } from '@origintrail-official/dkg-chain'; import type { EventBus, OperationContext } from '@origintrail-official/dkg-core'; -import { DKGEvent, Logger, createOperationContext, sha256, encodeWorkspacePublishRequest, encodeEncryptedWorkspacePayload, encryptWorkspacePayload, contextGraphDataUri, contextGraphDataGraphUri, contextGraphMetaUri, contextGraphAssertionUri, contextGraphLayerUri, MemoryLayer, assertionLifecycleUri, contextGraphSubGraphUri, contextGraphSubGraphMetaUri, SYSTEM_CONTEXT_GRAPHS, validateSubGraphName, isSafeIri, assertSafeIri, assertSafeRdfTerm, assertQuadLiteralsMutf8Safe, normalizeLargeRdfLiteralsForBlazegraph, DKG_GOSSIP_MAX_MESSAGE_BYTES, SwmGossipPayloadTooLargeError, type Ed25519Keypair, buildAuthorAttestationTypedData, buildUpdateAuthorAttestationTypedData, AUTHOR_SCHEME_VERSION_V1, TrustLevel, TRUST_LEVEL_PREDICATE, assertNoUserAuthoredTrustLevelQuads, buildTrustLevelQuads, isTrustLevelQuad, DKG_ENTITY, DKG_ROOT_ENTITY_LEGACY, ENTITY_PRED_ALT, parseAssertionSealQuads, ASSERTION_SEAL_PREDICATES, sharedMemoryReadBothFilter, DKG_ONTOLOGY } from '@origintrail-official/dkg-core'; +import { DKGEvent, Logger, createOperationContext, sha256, encodeWorkspacePublishRequest, encodeEncryptedWorkspacePayload, encryptWorkspacePayload, contextGraphDataUri, contextGraphDataGraphUri, contextGraphMetaUri, contextGraphAssertionUri, contextGraphLayerUri, MemoryLayer, assertionLifecycleUri, contextGraphSubGraphUri, contextGraphSubGraphMetaUri, SYSTEM_CONTEXT_GRAPHS, validateSubGraphName, isSafeIri, assertSafeIri, assertSafeRdfTerm, assertQuadLiteralsMutf8Safe, DKG_GOSSIP_MAX_MESSAGE_BYTES, SwmGossipPayloadTooLargeError, type Ed25519Keypair, buildAuthorAttestationTypedData, buildUpdateAuthorAttestationTypedData, AUTHOR_SCHEME_VERSION_V1, TrustLevel, TRUST_LEVEL_PREDICATE, assertNoUserAuthoredTrustLevelQuads, buildTrustLevelQuads, isTrustLevelQuad, DKG_ENTITY, DKG_ROOT_ENTITY_LEGACY, ENTITY_PRED_ALT, parseAssertionSealQuads, ASSERTION_SEAL_PREDICATES, sharedMemoryReadBothFilter, DKG_ONTOLOGY } from '@origintrail-official/dkg-core'; import { GraphManager, PrivateContentStore } from '@origintrail-official/dkg-storage'; import { DEFAULT_PUBLISH_EPOCHS, MAX_PUBLISH_EPOCHS, type Publisher, type PublishOptions, type PublishResult, type KAManifestEntry, type PhaseCallback, type V10CoreNodeACK } from './publisher.js'; import { skolemizeByEntity } from './auto-partition.js'; @@ -49,6 +49,7 @@ import { } from './metadata.js'; import { storeWorkspaceOperationPublicQuads } from './workspace-resolution.js'; import type { WorkspacePublicSnapshotStore } from './workspace-snapshot-store.js'; +import { preparePublicWriteQuads } from './public-write-normalization.js'; import { ethers } from 'ethers'; import type { WorkspaceAgentRecipientResolver } from './workspace-agent-recipients.js'; import { @@ -415,16 +416,7 @@ function rejectOversizedRdfLiterals(quads: Quad[], label: string): void { } function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { - return normalizeLargeRdfLiteralsForBlazegraph(quads, { label }).quads.map(toQuad); -} - -function toQuad(quad: { subject: string; predicate: string; object: string; graph?: string }): Quad { - return { - subject: quad.subject, - predicate: quad.predicate, - object: quad.object, - graph: quad.graph ?? '', - }; + return preparePublicWriteQuads(quads, { label }).quads; } async function stampTrustLevel( diff --git a/packages/publisher/src/index.ts b/packages/publisher/src/index.ts index 4fb56c93e..bd1d4df7a 100644 --- a/packages/publisher/src/index.ts +++ b/packages/publisher/src/index.ts @@ -2,6 +2,12 @@ export * from './publisher.js'; export { skolemize, isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; export { RESERVED_SUBJECT_PREFIXES, findReservedSubjectPrefix, isReservedSubject } from './reserved-subjects.js'; export { skolemizeByEntity, autoPartition } from './auto-partition.js'; +export { + preparePublicWriteQuads, + toPublicWriteQuad, + type PreparedPublicWriteQuads, + type PublicWriteNormalizationOptions, +} from './public-write-normalization.js'; export { canonicalPublishPayload, type CanonicalPublishPayload, diff --git a/packages/publisher/src/public-write-normalization.ts b/packages/publisher/src/public-write-normalization.ts new file mode 100644 index 000000000..a4bcd202a --- /dev/null +++ b/packages/publisher/src/public-write-normalization.ts @@ -0,0 +1,44 @@ +import { + normalizeLargeRdfLiteralsForBlazegraph, + type QuadLiteralLike, + type RdfLiteralNormalizationOptions, + type RdfTextLiteralRewrite, +} from '@origintrail-official/dkg-core'; +import type { Quad } from '@origintrail-official/dkg-storage'; +import { skolemizeByEntity } from './auto-partition.js'; + +export interface PreparedPublicWriteQuads { + readonly quads: Quad[]; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export interface PublicWriteNormalizationOptions extends RdfLiteralNormalizationOptions { + readonly skolemize?: boolean; +} + +export function preparePublicWriteQuads( + quads: readonly QuadLiteralLike[], + options: PublicWriteNormalizationOptions = {}, +): PreparedPublicWriteQuads { + const concreteQuads = quads.map(toPublicWriteQuad); + const skolemizedByRoot = options.skolemize === false + ? new Map() + : skolemizeByEntity(concreteQuads); + const input = skolemizedByRoot.size > 0 + ? [...skolemizedByRoot.values()].flat() + : concreteQuads; + const result = normalizeLargeRdfLiteralsForBlazegraph(input, options); + return { + quads: result.quads.map(toPublicWriteQuad), + rewrites: result.rewrites, + }; +} + +export function toPublicWriteQuad(quad: QuadLiteralLike): Quad { + return { + subject: quad.subject, + predicate: quad.predicate, + object: quad.object, + graph: quad.graph ?? '', + }; +} diff --git a/packages/publisher/test/dkg-publisher-compat.test.ts b/packages/publisher/test/dkg-publisher-compat.test.ts index 3ad96aff4..7f90bbf6f 100644 --- a/packages/publisher/test/dkg-publisher-compat.test.ts +++ b/packages/publisher/test/dkg-publisher-compat.test.ts @@ -90,6 +90,34 @@ describe('DKGPublisher compatibility aliases', () => { expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); + it('chunks oversized schema:text literals on linked blank nodes before shared-memory writes', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:share-blank-root'; + const child = `${root}/.well-known/genid/body`; + + await publisher.share('test', [ + q(root, 'http://schema.org/hasPart', '_:body'), + q('_:body', 'http://schema.org/text', oversized), + ], { publisherPeerId: 'test-peer', localOnly: true }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(reconstructPlainChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + }); + it('rejects oversized non-text RDF literals at shared-memory producer boundary', async () => { const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; @@ -137,6 +165,36 @@ describe('DKGPublisher compatibility aliases', () => { }); }); + it('chunks oversized schema:text literals during conditionalShare writes', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:cas-chunked'; + + await publisher.conditionalShare('test', [ + q(root, 'http://schema.org/text', oversized), + ], { + publisherPeerId: 'test-peer', + localOnly: true, + conditions: [{ + subject: root, + predicate: 'http://schema.org/name', + expectedValue: null, + }], + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + }); + it('chunks oversized schema:text literals during direct publish', async () => { const { publisher, store } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; @@ -161,6 +219,39 @@ describe('DKGPublisher compatibility aliases', () => { expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); + it('chunks oversized schema:text literals on linked blank nodes during direct publish', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:publish-blank-root'; + const child = `${root}/.well-known/genid/body`; + + await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [ + q(root, 'http://schema.org/hasPart', '_:body'), + q('_:body', 'http://schema.org/text', oversized), + ], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(reconstructPlainChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + }); + it('chunks oversized schema:text literals during update', async () => { const { publisher, store } = await makePublisher(); const root = 'urn:compat:update-oversized'; @@ -190,6 +281,44 @@ describe('DKGPublisher compatibility aliases', () => { expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); + it('chunks oversized schema:text literals on linked blank nodes during update', async () => { + const { publisher, store } = await makePublisher(); + const root = 'urn:compat:update-blank-root'; + const child = `${root}/.well-known/genid/body`; + const original = await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/name', '"Before"')], + skipContextGraphEnsure: true, + }); + + await publisher.update(original.kaId, { + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [ + q(root, 'http://schema.org/hasPart', '_:body'), + q('_:body', 'http://schema.org/text', `"${'x'.repeat(60_000)}"`), + ], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH ?g { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(reconstructPlainChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + }); + it('chunks oversized schema:text literals during assertion write', async () => { const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; From 96059546493b40bf07d991008bb37c2d6e4bdb71 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 20:57:35 +0200 Subject: [PATCH 06/12] Address public write normalization review --- packages/cli/src/daemon/http-utils.ts | 17 ++--- packages/cli/src/daemon/routes/memory.ts | 10 +-- packages/cli/test/daemon-ka-transport.test.ts | 38 +++++++++++ .../test/http-literal-size-validation.test.ts | 23 +++++++ ...no-funded-publisher-wallet-helpers.test.ts | 22 +++++++ packages/cli/vitest.unit.config.ts | 2 + packages/publisher/src/auto-partition.ts | 14 ++-- .../src/public-write-normalization.ts | 65 +++++++++++++++++-- .../test/dkg-publisher-compat.test.ts | 42 ++++++++++++ 9 files changed, 206 insertions(+), 27 deletions(-) diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index f5bff27c7..8c5c132a9 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -227,22 +227,23 @@ export function validateWritableQuadLiteralSizes( } export interface PreparedPublicWriteQuads { - readonly quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>; + readonly quads: PublishQuad[]; readonly rewrites: RdfTextLiteralRewrite[]; } +/** + * Canonical HTTP-facing public-write preparation. Route handlers that must know + * the normalized quad count or return rewrite metadata call this once and pass + * its explicit storage quads onward; writer boundaries still run the same + * publisher-owned normalizer as an idempotent SDK/backstop path. + */ export function preparePublicWriteQuads( label: string, quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, ): { ok: true; value: PreparedPublicWriteQuads } | { ok: false; body: Record } { try { const result = prepareCanonicalPublicWriteQuads(quads, { label }); - const normalizedQuads = result.quads.map((quad) => ( - quad.graph - ? quad - : { subject: quad.subject, predicate: quad.predicate, object: quad.object } - )); - return { ok: true, value: { quads: normalizedQuads, rewrites: result.rewrites } }; + return { ok: true, value: { quads: result.quads, rewrites: result.rewrites } }; } catch (err) { if (isOversizedRdfLiteralError(err)) { return { ok: false, body: oversizedRdfLiteralResponseBody(err) }; @@ -453,7 +454,7 @@ export function parsePublishRequestBody( subject: quad.subject, predicate: quad.predicate, object: quad.object, - graph: quad.graph ?? '', + graph: quad.graph, })); if ( diff --git a/packages/cli/src/daemon/routes/memory.ts b/packages/cli/src/daemon/routes/memory.ts index 357185729..172727f0d 100644 --- a/packages/cli/src/daemon/routes/memory.ts +++ b/packages/cli/src/daemon/routes/memory.ts @@ -672,10 +672,7 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { const normalizedQuads = preparePublicWriteQuads("quads", normalized); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - normalized = normalizedQuads.value.quads.map((quad) => ({ - ...quad, - graph: quad.graph ?? graph, - })); + normalized = normalizedQuads.value.quads; await agent.store.insert(normalized); return jsonResponse(res, 200, { ok: true, @@ -2470,10 +2467,7 @@ WHERE { const normalizedQuads = preparePublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - const quadsToWrite = normalizedQuads.value.quads.map((quad) => ({ - ...quad, - graph: quad.graph ?? targetGraph, - })); + const quadsToWrite = normalizedQuads.value.quads; // 5. Write to target layer try { diff --git a/packages/cli/test/daemon-ka-transport.test.ts b/packages/cli/test/daemon-ka-transport.test.ts index 5ea3785ff..e2e250c28 100644 --- a/packages/cli/test/daemon-ka-transport.test.ts +++ b/packages/cli/test/daemon-ka-transport.test.ts @@ -8,6 +8,7 @@ // daemon / storage / native deps. import { describe, it, expect } from 'vitest'; import { ChainRpcTransportError } from '@origintrail-official/dkg-chain'; +import { DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY } from '@origintrail-official/dkg-core'; import { handleKnowledgeAssetsRoutes } from '../src/daemon/routes/knowledge-assets.js'; import type { RequestContext } from '../src/daemon/routes/context.js'; @@ -66,6 +67,43 @@ function revert() { describe('knowledge-assets publish routes — transport-status mapping (#1329)', () => { describe('POST /api/knowledge-assets/publish (direct explicit-quads mint)', () => { + it('returns literalRewrites after oversized schema:text normalization on success', async () => { + const root = 'http://example.org/direct-oversized'; + const oversized = `"${'x'.repeat(60_000)}"`; + let publishedQuads: any[] = []; + const agent = publishAgent({ + publish: async (_cg: string, quads: any[]) => { + publishedQuads = quads; + return { + status: 'confirmed', + kaId: '42', + kaManifest: [{ tokenId: '42', rootEntity: root }], + }; + }, + }); + + const { res, done } = runKaCtx('POST', '/api/knowledge-assets/publish', agent, { + contextGraphId: 'cg-1', + quads: [{ subject: root, predicate: 'http://schema.org/text', object: oversized, graph: '' }], + }); + + await done; + const body = JSON.parse(res.body); + + expect(res.statusCode).toBe(200); + expect(body.literalRewrites).toHaveLength(1); + expect(body.literalRewrites[0]).toMatchObject({ + subject: root, + predicate: 'http://schema.org/text', + graph: '', + originalMutf8Bytes: 60_002, + }); + expect(publishedQuads.some((quad) => quad.predicate === 'http://schema.org/text')).toBe(false); + expect(publishedQuads.some((quad) => quad.predicate === DKG_HAS_TEXT_BODY)).toBe(true); + expect(publishedQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(publishedQuads.every((quad) => quad.graph === '')).toBe(true); + }); + it('→ 503 on RPC_ENDPOINTS_EXHAUSTED, with a sanitized body (no URL/key leak)', async () => { const agent = publishAgent({ publish: async () => { throw exhaustion(); } }); const { res, done } = runKaCtx('POST', '/api/knowledge-assets/publish', agent, { contextGraphId: 'cg-1', quads: QUADS }); diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index bf8bfd437..5c004a501 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -195,6 +195,29 @@ describe('HTTP RDF literal size validation', () => { } }); + it('preserves empty graph on normalized public-write quads', () => { + const prepared = preparePublicWriteQuads('quads', [{ + subject: 'http://example.org/empty-graph', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: '', + }]); + + expect(prepared.ok).toBe(true); + if (prepared.ok) { + expect(prepared.value.quads.length).toBeGreaterThan(1); + expect(prepared.value.quads.every((quad) => + Object.prototype.hasOwnProperty.call(quad, 'graph') + )).toBe(true); + expect(prepared.value.quads.every((quad) => quad.graph === '')).toBe(true); + expect(prepared.value.rewrites[0]).toMatchObject({ + subject: 'http://example.org/empty-graph', + predicate: 'http://schema.org/text', + graph: '', + }); + } + }); + it('preserves oversized literal fields in import-file extraction responses', () => { expect(buildImportFileResponse({ assertionUri: 'http://example.org/assertion', diff --git a/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts b/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts index 4840e78f6..abcf1a7b1 100644 --- a/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts +++ b/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts @@ -10,6 +10,7 @@ */ import { describe, expect, it } from 'vitest'; +import { OversizedRdfLiteralError } from '@origintrail-official/dkg-core'; import { isNoFundedPublisherWalletLike, noFundedPublisherWalletBody, @@ -91,6 +92,27 @@ describe('respondWithDaemonError', () => { expect(JSON.parse(res.body).code).toBeUndefined(); }); + it('maps oversized RDF literal errors to HTTP 400 with structured details', () => { + const res = mockRes(); + respondWithDaemonError(res, new OversizedRdfLiteralError({ + actualBytes: 60_002, + maxBytes: 60_000, + label: 'quads[0].object', + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + graph: 'http://example.org/g', + })); + expect(res.statusCode).toBe(400); + expect(JSON.parse(res.body)).toMatchObject({ + code: 'OVERSIZED_RDF_LITERAL', + actualBytes: 60_002, + limitBytes: 60_000, + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + graph: 'http://example.org/g', + }); + }); + it('is a no-op once the response has already been sent', () => { const res = mockRes(); res.writableEnded = true; diff --git a/packages/cli/vitest.unit.config.ts b/packages/cli/vitest.unit.config.ts index 4426b7b51..c21b046e4 100644 --- a/packages/cli/vitest.unit.config.ts +++ b/packages/cli/vitest.unit.config.ts @@ -65,6 +65,8 @@ export default defineConfig({ // #761 — context graph write-target validation (from main). 'test/context-graph-write-path-validation.test.ts', 'test/http-literal-size-validation.test.ts', + 'test/no-funded-publisher-wallet-helpers.test.ts', + 'test/daemon-ka-transport.test.ts', 'test/epcis-route-readiness.test.ts', // Notifications-pane redesign (A3) — assertion_activity emitter // helper. Pure logic + a tmp SQLite DashboardDB, no hardhat. diff --git a/packages/publisher/src/auto-partition.ts b/packages/publisher/src/auto-partition.ts index e528634b4..e788a97ab 100644 --- a/packages/publisher/src/auto-partition.ts +++ b/packages/publisher/src/auto-partition.ts @@ -21,7 +21,10 @@ export function skolemizeByEntity(quads: Quad[]): Map { // Phase 1: Find root entities (non-blank, non-skolemized unique subjects) const rootEntities = new Set(); for (const q of quads) { - if (!isBlankNode(q.subject) && !isSkolemizedUri(q.subject)) { + if (isSkolemizedUri(q.subject)) { + const root = rootEntityFromSkolemized(q.subject); + if (root) rootEntities.add(root); + } else if (!isBlankNode(q.subject)) { rootEntities.add(q.subject); } } @@ -31,8 +34,11 @@ export function skolemizeByEntity(quads: Quad[]): Map { // Heuristic: a blank node belongs to the root entity that references it as an object. const blankToRoot = new Map(); for (const q of quads) { - if (rootEntities.has(q.subject) && isBlankNode(q.object)) { - blankToRoot.set(q.object, q.subject); + const subjectRoot = isSkolemizedUri(q.subject) + ? rootEntityFromSkolemized(q.subject) + : rootEntities.has(q.subject) ? q.subject : null; + if (subjectRoot && isBlankNode(q.object)) { + blankToRoot.set(q.object, subjectRoot); } } @@ -77,7 +83,7 @@ export function skolemizeByEntity(quads: Quad[]): Map { if (isSkolemizedUri(q.subject)) { const root = rootEntityFromSkolemized(q.subject); if (root && rootQuadsMap.has(root)) { - rootQuadsMap.get(root)!.push(q); + rootQuadsMap.get(root)!.push(...skolemize(root, [q])); } } } diff --git a/packages/publisher/src/public-write-normalization.ts b/packages/publisher/src/public-write-normalization.ts index a4bcd202a..419eb5605 100644 --- a/packages/publisher/src/public-write-normalization.ts +++ b/packages/publisher/src/public-write-normalization.ts @@ -5,7 +5,9 @@ import { type RdfTextLiteralRewrite, } from '@origintrail-official/dkg-core'; import type { Quad } from '@origintrail-official/dkg-storage'; -import { skolemizeByEntity } from './auto-partition.js'; +import { isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; + +const GENID_SEGMENT = '/.well-known/genid/'; export interface PreparedPublicWriteQuads { readonly quads: Quad[]; @@ -21,12 +23,9 @@ export function preparePublicWriteQuads( options: PublicWriteNormalizationOptions = {}, ): PreparedPublicWriteQuads { const concreteQuads = quads.map(toPublicWriteQuad); - const skolemizedByRoot = options.skolemize === false - ? new Map() - : skolemizeByEntity(concreteQuads); - const input = skolemizedByRoot.size > 0 - ? [...skolemizedByRoot.values()].flat() - : concreteQuads; + const input = options.skolemize === false + ? concreteQuads + : skolemizePublicWriteQuads(concreteQuads); const result = normalizeLargeRdfLiteralsForBlazegraph(input, options); return { quads: result.quads.map(toPublicWriteQuad), @@ -34,6 +33,58 @@ export function preparePublicWriteQuads( }; } +function skolemizePublicWriteQuads(quads: readonly Quad[]): Quad[] { + const blankToRoot = inferBlankNodeRoots(quads); + return quads.map((quad) => { + const root = rootForQuad(quad, blankToRoot); + if (!root) return { ...quad }; + return { + subject: skolemizeTermForRoot(quad.subject, root), + predicate: quad.predicate, + object: skolemizeTermForRoot(quad.object, root), + graph: quad.graph, + }; + }); +} + +function inferBlankNodeRoots(quads: readonly Quad[]): Map { + const blankToRoot = new Map(); + for (const quad of quads) { + const root = rootForNonBlankSubject(quad.subject); + if (root && isBlankNode(quad.object) && !blankToRoot.has(quad.object)) { + blankToRoot.set(quad.object, root); + } + } + + let changed = true; + while (changed) { + changed = false; + for (const quad of quads) { + const root = isBlankNode(quad.subject) ? blankToRoot.get(quad.subject) : undefined; + if (root && isBlankNode(quad.object) && !blankToRoot.has(quad.object)) { + blankToRoot.set(quad.object, root); + changed = true; + } + } + } + return blankToRoot; +} + +function rootForQuad(quad: Quad, blankToRoot: ReadonlyMap): string | undefined { + if (isBlankNode(quad.subject)) return blankToRoot.get(quad.subject); + return rootForNonBlankSubject(quad.subject); +} + +function rootForNonBlankSubject(subject: string): string | undefined { + if (isBlankNode(subject)) return undefined; + if (isSkolemizedUri(subject)) return rootEntityFromSkolemized(subject) ?? undefined; + return subject; +} + +function skolemizeTermForRoot(term: string, root: string): string { + return isBlankNode(term) ? `${root}${GENID_SEGMENT}${term.slice(2)}` : term; +} + export function toPublicWriteQuad(quad: QuadLiteralLike): Quad { return { subject: quad.subject, diff --git a/packages/publisher/test/dkg-publisher-compat.test.ts b/packages/publisher/test/dkg-publisher-compat.test.ts index 7f90bbf6f..d99db3173 100644 --- a/packages/publisher/test/dkg-publisher-compat.test.ts +++ b/packages/publisher/test/dkg-publisher-compat.test.ts @@ -10,6 +10,8 @@ import { } from '@origintrail-official/dkg-core'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; import { DKGPublisher } from '../src/dkg-publisher.js'; +import { skolemizeByEntity } from '../src/auto-partition.js'; +import { preparePublicWriteQuads } from '../src/public-write-normalization.js'; function q(s: string, p: string, o: string): Quad { return { @@ -338,6 +340,46 @@ describe('DKGPublisher compatibility aliases', () => { expect(reconstructPlainChunkedText(quads, root)).toBe('x'.repeat(60_000)); }); + it('preserves already-skolemized quads when normal roots are present', () => { + const externalSkolemized = q( + 'http://example.org/external/.well-known/genid/body', + 'http://schema.org/name', + '"kept"', + ); + + const prepared = preparePublicWriteQuads([ + q('http://example.org/root', 'http://schema.org/name', '"root"'), + externalSkolemized, + ]).quads; + + expect(prepared).toContainEqual(externalSkolemized); + expect(prepared).toHaveLength(2); + }); + + it('indexes already-skolemized subjects even when their root subject is absent', () => { + const root = 'http://example.org/external'; + const externalSkolemized = q( + `${root}/.well-known/genid/body`, + 'http://schema.org/hasPart', + '_:nested', + ); + + const partitioned = skolemizeByEntity([ + q('http://example.org/root', 'http://schema.org/name', '"root"'), + externalSkolemized, + q('_:nested', 'http://schema.org/name', '"kept"'), + ]); + + expect(partitioned.get(root)).toContainEqual({ + ...externalSkolemized, + object: `${root}/.well-known/genid/nested`, + }); + expect(partitioned.get(root)).toContainEqual({ + ...q('_:nested', 'http://schema.org/name', '"kept"'), + subject: `${root}/.well-known/genid/nested`, + }); + }); + it('rejects oversized private literals before publish canonicalization', async () => { const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; From d5eea872903fd83fe663a72d7a2a05aafadf0026 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 21:09:07 +0200 Subject: [PATCH 07/12] Fix semantic enrichment route expectations --- .../cli/test/import-artifact-routes.test.ts | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/packages/cli/test/import-artifact-routes.test.ts b/packages/cli/test/import-artifact-routes.test.ts index 1d330a0a2..a12ac6009 100644 --- a/packages/cli/test/import-artifact-routes.test.ts +++ b/packages/cli/test/import-artifact-routes.test.ts @@ -1993,15 +1993,15 @@ describe('import artifact daemon routes', () => { expect(writes[0]!.name).toBe(assertionName); expect(writes[0]!.agentAddress).toBe('did:dkg:agent:test'); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Semantic topic"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceAssertion`, object: assertionUri }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceFileHash`, object: `"${entry.keccak256}"` }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownHash`, object: `"${entry.keccak256}"` }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownForm`, object: markdownForm }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"unit-test-model"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:reviewer' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:reviewer' }, - { subject: 'urn:doc:imported', predicate: `${PROV}wasDerivedFrom`, object: assertionUri }, + expect.objectContaining({ subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Semantic topic"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceAssertion`, object: assertionUri }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceFileHash`, object: `"${entry.keccak256}"` }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownHash`, object: `"${entry.keccak256}"` }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownForm`, object: markdownForm }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"unit-test-model"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:reviewer' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:reviewer' }), + expect.objectContaining({ subject: 'urn:doc:imported', predicate: `${PROV}wasDerivedFrom`, object: assertionUri }), ])); expect(events).toEqual(expect.arrayContaining([ expect.objectContaining({ operation: 'semantic_enrichment_written', layers: ['wm'] }), @@ -2116,7 +2116,7 @@ describe('import artifact daemon routes', () => { expect(result.status).toBe(200); expect(writes).toHaveLength(1); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer Bot"' }, + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer Bot"' }), ])); expect(writes[0]!.quads.filter((quad) => quad.predicate === `${PROV}wasAttributedTo`)).toHaveLength(0); }); @@ -2153,9 +2153,9 @@ describe('import artifact daemon routes', () => { expect(result.status).toBe(200); expect(writes).toHaveLength(1); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Topic\\u0000vertical\\u000Bdel\\u007F"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"model\\u0000unit\\u007F"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer\\u000BBot"' }, + expect.objectContaining({ subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Topic\\u0000vertical\\u000Bdel\\u007F"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"model\\u0000unit\\u007F"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer\\u000BBot"' }), ])); expect(writes[0]!.quads.filter((quad) => quad.predicate === `${PROV}wasAttributedTo`)).toHaveLength(0); }); @@ -2186,9 +2186,9 @@ describe('import artifact daemon routes', () => { expect(result.status).toBe(200); expect(writes).toHaveLength(1); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Fallback topic"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:test' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:test' }, + expect.objectContaining({ subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Fallback topic"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:test' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:test' }), ])); expect(writes[0]!.quads).not.toEqual(expect.arrayContaining([ expect.objectContaining({ object: 'did:dkg:agent:did:dkg:agent:test' }), From 5e740d22ea510e604017433c2aa062fa6bcf65e6 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 23:28:11 +0200 Subject: [PATCH 08/12] Reject private blank node object terms --- packages/cli/src/daemon/http-utils.ts | 21 +++++++++--- .../test/http-literal-size-validation.test.ts | 32 +++++++++++++++++++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index 8c5c132a9..aad2ac84d 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -261,18 +261,29 @@ export function preparePublicWriteQuads( * word `hello` or a number `123`) slips past them and crashes the RDF parser * with an uncaught "No scheme found in an absolute IRI" → HTTP 500 instead of an * actionable 400. Operates on any `{ object: string }` (PublishQuad or writable - * quad alike). + * quad alike). Blank-node object terms are valid only for public write paths that + * subsequently skolemize them; callers that persist quads without normalization + * should set `allowBlankNodes: false`. */ export function validateQuadObjectTerms( label: string, quads: ReadonlyArray<{ object: string }>, + options: { allowBlankNodes?: boolean } = {}, ): string | null { + const allowBlankNodes = options.allowBlankNodes ?? true; const badIndex = quads.findIndex((q) => { const object = q.object.trim(); - return !object.startsWith('"') && !isSafeBlankNode(object) && !isSafeIri(object); + return ( + !object.startsWith('"') && + !(allowBlankNodes && isSafeBlankNode(object)) && + !isSafeIri(object) + ); }); if (badIndex === -1) return null; - return `Invalid "${label}[${badIndex}].object": RDF object must be a quoted literal term, blank node, or absolute IRI`; + const expected = allowBlankNodes + ? "a quoted literal term, blank node, or absolute IRI" + : "a quoted literal term or absolute IRI"; + return `Invalid "${label}[${badIndex}].object": RDF object must be ${expected}`; } function isSafeBlankNode(term: string): boolean { @@ -467,7 +478,9 @@ export function parsePublishRequestBody( }; } if (privateQuads !== undefined) { - const privateQuadObjectError = validateQuadObjectTerms("privateQuads", privateQuads); + const privateQuadObjectError = validateQuadObjectTerms("privateQuads", privateQuads, { + allowBlankNodes: false, + }); if (privateQuadObjectError) return { ok: false, error: privateQuadObjectError }; const privateQuadSize = validateWritableQuadLiteralSizes("privateQuads", privateQuads); if (!privateQuadSize.ok) { diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 5c004a501..5f5f14c5e 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -120,6 +120,38 @@ describe('HTTP RDF literal size validation', () => { } }); + it('rejects blank-node private publish object terms because private quads are not skolemized', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + quads: [{ + subject: 'http://example.org/root', + predicate: 'http://schema.org/name', + object: '"safe"', + graph: 'http://example.org/g', + }], + privateQuads: [ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:secret', + graph: 'http://example.org/g', + }, + { + subject: '_:secret', + predicate: 'http://schema.org/name', + object: '"hidden"', + graph: 'http://example.org/g', + }, + ], + })); + + expect(parsed.ok).toBe(false); + if (!parsed.ok) { + expect(parsed.error).toContain('Invalid "privateQuads[0].object"'); + expect(parsed.error).toContain('quoted literal term or absolute IRI'); + } + }); + it('returns structured validation failure for reject-only private/writable guards', () => { const result = validateWritableQuadLiteralSizes('quads', [{ subject: 'http://example.org/s', From 337fcce8dc3649f505c33db59beb64c62c55dc43 Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Thu, 25 Jun 2026 23:50:41 +0200 Subject: [PATCH 09/12] Reject private blank node subjects --- packages/cli/src/daemon/http-utils.ts | 11 ++++++++ .../test/http-literal-size-validation.test.ts | 26 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index aad2ac84d..1f14a40cd 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -286,6 +286,15 @@ export function validateQuadObjectTerms( return `Invalid "${label}[${badIndex}].object": RDF object must be ${expected}`; } +function validateNoBlankNodeSubjects( + label: string, + quads: ReadonlyArray<{ subject: string }>, +): string | null { + const badIndex = quads.findIndex((q) => q.subject.trim().startsWith("_:")); + if (badIndex === -1) return null; + return `Invalid "${label}[${badIndex}].subject": RDF subject must not be a blank node`; +} + function isSafeBlankNode(term: string): boolean { return /^_:[A-Za-z][A-Za-z0-9_-]*$/.test(term); } @@ -478,6 +487,8 @@ export function parsePublishRequestBody( }; } if (privateQuads !== undefined) { + const privateQuadSubjectError = validateNoBlankNodeSubjects("privateQuads", privateQuads); + if (privateQuadSubjectError) return { ok: false, error: privateQuadSubjectError }; const privateQuadObjectError = validateQuadObjectTerms("privateQuads", privateQuads, { allowBlankNodes: false, }); diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 5f5f14c5e..37cb1cff5 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -137,7 +137,7 @@ describe('HTTP RDF literal size validation', () => { graph: 'http://example.org/g', }, { - subject: '_:secret', + subject: 'http://example.org/root/secret', predicate: 'http://schema.org/name', object: '"hidden"', graph: 'http://example.org/g', @@ -152,6 +152,30 @@ describe('HTTP RDF literal size validation', () => { } }); + it('rejects blank-node private publish subjects even when public quads link to the same blank node', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + quads: [{ + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:secret', + graph: 'http://example.org/g', + }], + privateQuads: [{ + subject: '_:secret', + predicate: 'http://schema.org/name', + object: '"hidden"', + graph: 'http://example.org/g', + }], + })); + + expect(parsed.ok).toBe(false); + if (!parsed.ok) { + expect(parsed.error).toContain('Invalid "privateQuads[0].subject"'); + expect(parsed.error).toContain('must not be a blank node'); + } + }); + it('returns structured validation failure for reject-only private/writable guards', () => { const result = validateWritableQuadLiteralSizes('quads', [{ subject: 'http://example.org/s', From 411bd826896634a6ac22f15ed45579e25d72db5b Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 26 Jun 2026 00:44:53 +0200 Subject: [PATCH 10/12] Address RDF literal chunking review edge cases --- packages/cli/src/daemon/http-utils.ts | 35 ++++- .../daemon/routes/knowledge-assets-import.ts | 4 +- .../cli/src/daemon/routes/knowledge-assets.ts | 15 +- packages/cli/src/daemon/routes/memory.ts | 33 ++--- .../daemon-memory-turn-literal-size.test.ts | 118 +++++++++++++++ .../test/http-literal-size-validation.test.ts | 47 ++++++ packages/cli/vitest.unit.config.ts | 1 + packages/core/src/index.ts | 1 + packages/core/src/rdf-literal-codec.ts | 108 ++++++++++++++ .../src/rdf-text-literal-normalization.ts | 115 ++------------- packages/core/test/rdf-literal-size.test.ts | 41 ++++++ packages/publisher/src/auto-partition.ts | 138 +++++++++++------- .../src/public-write-normalization.ts | 58 +------- .../test/dkg-publisher-compat.test.ts | 29 ++++ .../publisher/test/pure-functions.test.ts | 28 ++++ 15 files changed, 517 insertions(+), 254 deletions(-) create mode 100644 packages/cli/test/daemon-memory-turn-literal-size.test.ts create mode 100644 packages/core/src/rdf-literal-codec.ts diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index 1f14a40cd..ba76308ef 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -9,6 +9,7 @@ import { PayloadTooLargeError, assertQuadLiteralsMutf8Safe, isOversizedRdfLiteralError, + parseRdfLiteralTerm, type RdfTextLiteralRewrite, validateContextGraphId, validateSubGraphName, @@ -231,6 +232,10 @@ export interface PreparedPublicWriteQuads { readonly rewrites: RdfTextLiteralRewrite[]; } +export interface PreparedValidatedPublicWriteQuads extends PreparedPublicWriteQuads { + readonly totalQuads: number; +} + /** * Canonical HTTP-facing public-write preparation. Route handlers that must know * the normalized quad count or return rewrite metadata call this once and pass @@ -252,6 +257,23 @@ export function preparePublicWriteQuads( } } +export function prepareValidatedPublicWriteQuads( + label: string, + quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, +): { ok: true; value: PreparedValidatedPublicWriteQuads } | { ok: false; body: Record } { + const objectError = validateQuadObjectTerms(label, quads); + if (objectError) return { ok: false, body: { error: objectError } }; + const prepared = preparePublicWriteQuads(label, quads); + if (!prepared.ok) return prepared; + return { + ok: true, + value: { + ...prepared.value, + totalQuads: prepared.value.quads.length, + }, + }; +} + /** * GH #306 / #787 (follow-up) — validate each quad's `object` term is either a * quoted RDF literal (`"…"`) or an absolute IRI. Shared by the publish path AND @@ -272,17 +294,18 @@ export function validateQuadObjectTerms( ): string | null { const allowBlankNodes = options.allowBlankNodes ?? true; const badIndex = quads.findIndex((q) => { - const object = q.object.trim(); + const object = q.object; + if (object.trim() !== object) return true; + if (object.startsWith('"')) return parseRdfLiteralTerm(object) === null; return ( - !object.startsWith('"') && !(allowBlankNodes && isSafeBlankNode(object)) && !isSafeIri(object) ); }); if (badIndex === -1) return null; const expected = allowBlankNodes - ? "a quoted literal term, blank node, or absolute IRI" - : "a quoted literal term or absolute IRI"; + ? "a well-formed quoted literal term, blank node, or absolute IRI" + : "a well-formed quoted literal term or absolute IRI"; return `Invalid "${label}[${badIndex}].object": RDF object must be ${expected}`; } @@ -464,9 +487,7 @@ export function parsePublishRequestBody( error: 'Missing or invalid "quads" (must be a non-empty quad array)', }; } - const quadObjectError = validateQuadObjectTerms("quads", quads); - if (quadObjectError) return { ok: false, error: quadObjectError }; - const normalizedQuads = preparePublicWriteQuads("quads", quads); + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); if (!normalizedQuads.ok) { return { ok: false, error: String(normalizedQuads.body.error ?? 'Oversized RDF literal'), body: normalizedQuads.body }; } diff --git a/packages/cli/src/daemon/routes/knowledge-assets-import.ts b/packages/cli/src/daemon/routes/knowledge-assets-import.ts index ee89e6856..5f3fc71ba 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets-import.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets-import.ts @@ -43,7 +43,7 @@ import { normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, oversizedRdfLiteralResponseBody, - preparePublicWriteQuads, + prepareValidatedPublicWriteQuads, SMALL_BODY_BYTES, MAX_UPLOAD_BYTES, type ImportFileExtractionPayload, @@ -639,7 +639,7 @@ export async function handleKaSemanticEnrichmentWrite(ctx: RequestContext): Prom generationMethod, semanticQuads, }); - const normalizedQuads = preparePublicWriteQuads("semanticQuads", [...semanticQuads, ...provenanceQuads]); + const normalizedQuads = prepareValidatedPublicWriteQuads("semanticQuads", [...semanticQuads, ...provenanceQuads]); if (!normalizedQuads.ok) { return jsonResponse(res, 400, normalizedQuads.body); } diff --git a/packages/cli/src/daemon/routes/knowledge-assets.ts b/packages/cli/src/daemon/routes/knowledge-assets.ts index a22dd88e7..fc2a09944 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets.ts @@ -37,11 +37,10 @@ import { validateRequiredContextGraphId, parsePublishRequestBody, isWritableQuad, - validateQuadObjectTerms, respondIfReconcileUnavailable, respondIfChainRpcTransportError, sanitizeRpcMessage, - preparePublicWriteQuads, + prepareValidatedPublicWriteQuads, normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, isNoFundedPublisherWalletLike, @@ -826,9 +825,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< if (!quads.every(isWritableQuad)) { return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); } - const objErr = validateQuadObjectTerms("quads", quads); - if (objErr) return jsonResponse(res, 400, { error: objErr }); - const normalizedQuads = preparePublicWriteQuads("quads", quads); + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); quadsToWrite = normalizedQuads.value.quads; } @@ -1145,11 +1142,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< if (!parsed.quads.every(isWritableQuad)) { return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); } - // GH #306/#787 (follow-up) — reject objects that are neither a quoted - // literal nor an absolute IRI before they reach (and crash) the parser. - const wmObjErr = validateQuadObjectTerms("quads", parsed.quads); - if (wmObjErr) return jsonResponse(res, 400, { error: wmObjErr }); - const normalizedQuads = preparePublicWriteQuads("quads", parsed.quads); + // Validate object terms and chunk oversized schema:text literals before + // the write path sees these public-write quads. + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", parsed.quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); // A bare write to a name that was never created used to fall through to // the legacy `/assertion/{addr}/{name}` graph and produce a KA that is diff --git a/packages/cli/src/daemon/routes/memory.ts b/packages/cli/src/daemon/routes/memory.ts index 172727f0d..0989d4951 100644 --- a/packages/cli/src/daemon/routes/memory.ts +++ b/packages/cli/src/daemon/routes/memory.ts @@ -199,8 +199,7 @@ import { resolveNameToPeerId, isPublishQuad, isWritableQuad, - validateQuadObjectTerms, - preparePublicWriteQuads, + prepareValidatedPublicWriteQuads, oversizedRdfLiteralResponseBody, parsePublishRequestBody, jsonResponse, @@ -670,7 +669,7 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { }; }); - const normalizedQuads = preparePublicWriteQuads("quads", normalized); + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", normalized); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); normalized = normalizedQuads.value.quads; await agent.store.insert(normalized); @@ -1655,16 +1654,13 @@ WHERE { // of crashing the SWM write path with a TypeError (HTTP 500). if (!Array.isArray(quads) || !quads.every(isWritableQuad)) return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); - // GH #306/#787 (follow-up) — also reject objects that are neither a quoted - // literal nor an absolute IRI; otherwise they slip past the shape guard and - // crash the RDF parser ("No scheme found in an absolute IRI") with HTTP 500. + // Validate object terms and chunk oversized schema:text literals before + // storage/share sees these public-write quads. { - const objErr = validateQuadObjectTerms("quads", quads); - if (objErr) return jsonResponse(res, 400, { error: objErr }); + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quads = normalizedQuads.value.quads; } - const normalizedQuads = preparePublicWriteQuads("quads", quads); - if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - quads = normalizedQuads.value.quads; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2260,16 +2256,13 @@ WHERE { // GH #787 / #306 — reject string-shaped / malformed quads (4xx, not a 500 crash). if (!Array.isArray(quads) || !quads.every(isWritableQuad)) return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); - // GH #306/#787 (follow-up) — also reject objects that are neither a quoted - // literal nor an absolute IRI; otherwise they slip past the shape guard and - // crash the RDF parser ("No scheme found in an absolute IRI") with HTTP 500. + // Validate object terms and chunk oversized schema:text literals before + // storage/share sees these public-write quads. { - const objErr = validateQuadObjectTerms("quads", quads); - if (objErr) return jsonResponse(res, 400, { error: objErr }); + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quads = normalizedQuads.value.quads; } - const normalizedQuads = preparePublicWriteQuads("quads", quads); - if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); - quads = normalizedQuads.value.quads; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2465,7 +2458,7 @@ WHERE { }); } - const normalizedQuads = preparePublicWriteQuads("quads", quads); + const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); const quadsToWrite = normalizedQuads.value.quads; diff --git a/packages/cli/test/daemon-memory-turn-literal-size.test.ts b/packages/cli/test/daemon-memory-turn-literal-size.test.ts new file mode 100644 index 000000000..6f1fa7777 --- /dev/null +++ b/packages/cli/test/daemon-memory-turn-literal-size.test.ts @@ -0,0 +1,118 @@ +import { describe, expect, it, vi } from 'vitest'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, +} from '@origintrail-official/dkg-core'; +import { handleMemoryRoutes } from '../src/daemon/routes/memory.js'; +import type { RequestContext } from '../src/daemon/routes/context.js'; + +function fakeRes() { + const res: any = { statusCode: 0, body: '', headers: {} as Record, writableEnded: false }; + res.writeHead = (status: number, headers?: Record) => { + res.statusCode = status; + if (headers) Object.assign(res.headers, headers); + return res; + }; + res.setHeader = (key: string, value: string) => { + res.headers[key] = value; + }; + res.end = (body?: string) => { + res.body = body ?? ''; + res.writableEnded = true; + }; + return res; +} + +function fakeReq(body: unknown) { + return { + method: 'POST', + headers: {}, + __dkgPrebufferedBody: Buffer.from(JSON.stringify(body)), + } as any; +} + +const noopTracker = { + start() {}, + complete() {}, + fail() {}, + setCost() {}, + setTxHash() {}, + trackPhase: (_ctx: unknown, _phase: unknown, fn: () => unknown) => fn(), +}; + +const ACCEPT_PROBE = { + exists: true, + hasLocalContent: true, + declarationFound: true, + accessPolicy: 'public', + callerAuthorized: true, +}; + +describe('POST /api/memory/turn RDF literal normalization', () => { + it('chunks oversized schema:text emitted from markdown frontmatter before WM insert', async () => { + const inserted: Array<{ subject: string; predicate: string; object: string; graph: string }> = []; + const events: unknown[] = []; + const agent = { + peerId: 'peer-memory-turn', + probeContextGraphWritePreflight: vi.fn(async () => ACCEPT_PROBE), + store: { + insert: vi.fn(async (quads: typeof inserted) => { + inserted.push(...quads); + }), + }, + }; + const res = fakeRes(); + const url = new URL('http://127.0.0.1/api/memory/turn'); + const largeBody = 'computer history '.repeat(4_000); + const markdown = [ + '---', + 'id: http://example.org/turn/frontmatter-text', + 'text: |-', + ` ${largeBody}`, + '---', + '# Conversation turn', + ].join('\n'); + + await handleMemoryRoutes({ + req: fakeReq({ + contextGraphId: 'memory-turn-cg', + layer: 'wm', + markdown, + }), + res, + agent, + tracker: noopTracker, + config: {}, + fileStore: { + put: vi.fn(async (bytes: Buffer, contentType: string) => ({ + keccak256: 'abc123', + size: bytes.length, + contentType, + })), + }, + vectorStore: { insert: vi.fn() }, + embeddingProvider: null, + path: url.pathname, + url, + requestAgentAddress: `0x${'12'.repeat(20)}`, + emitMemoryGraphChanged: (event: unknown) => { + events.push(event); + }, + } as unknown as RequestContext); + + expect(res.statusCode, res.body).toBe(200); + const response = JSON.parse(res.body); + expect(agent.store.insert).toHaveBeenCalledTimes(1); + expect(inserted.some((quad) => + quad.subject === response.turnUri && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(inserted.some((quad) => + quad.subject === response.turnUri && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(inserted.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(response.totalQuads).toBe(inserted.length); + expect(events).toHaveLength(1); + }); +}); diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 37cb1cff5..dae9fe14d 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -8,6 +8,7 @@ import { buildImportFileResponse, parsePublishRequestBody, preparePublicWriteQuads, + prepareValidatedPublicWriteQuads, validateWritableQuadLiteralSizes, } from '../src/daemon/http-utils.js'; @@ -193,6 +194,52 @@ describe('HTTP RDF literal size validation', () => { } }); + it('prepares validated public-write quads with explicit normalized count', () => { + const prepared = prepareValidatedPublicWriteQuads('quads', [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: 'http://example.org/g', + }]); + + expect(prepared.ok).toBe(true); + if (prepared.ok) { + expect(prepared.value.rewrites).toHaveLength(1); + expect(prepared.value.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(prepared.value.totalQuads).toBe(prepared.value.quads.length); + } + }); + + it('returns structured object-term validation failures from the validated public-write helper', () => { + const prepared = prepareValidatedPublicWriteQuads('quads', [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + object: 'not-a-valid-rdf-object', + graph: 'http://example.org/g', + }]); + + expect(prepared.ok).toBe(false); + if (!prepared.ok) { + expect(prepared.body.error).toContain('Invalid "quads[0].object"'); + } + }); + + it('rejects small malformed quoted literal terms before storage routes see them', () => { + for (const object of ['"bad\nliteral"', ' "ok"', '"ok"\n']) { + const prepared = prepareValidatedPublicWriteQuads('quads', [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + object, + graph: 'http://example.org/g', + }]); + + expect(prepared.ok).toBe(false); + if (!prepared.ok) { + expect(prepared.body.error).toContain('well-formed quoted literal term'); + } + } + }); + it('keeps import-file chunk quads in their source data or metadata graph', () => { const dataGraph = 'did:dkg:context-graph:test/_working_memory/0xabc/1'; const metaGraph = 'did:dkg:context-graph:test/_meta'; diff --git a/packages/cli/vitest.unit.config.ts b/packages/cli/vitest.unit.config.ts index c21b046e4..f10898288 100644 --- a/packages/cli/vitest.unit.config.ts +++ b/packages/cli/vitest.unit.config.ts @@ -65,6 +65,7 @@ export default defineConfig({ // #761 — context graph write-target validation (from main). 'test/context-graph-write-path-validation.test.ts', 'test/http-literal-size-validation.test.ts', + 'test/daemon-memory-turn-literal-size.test.ts', 'test/no-funded-publisher-wallet-helpers.test.ts', 'test/daemon-ka-transport.test.ts', 'test/epcis-route-readiness.test.ts', diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 8fd987c8f..94728bd2b 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -235,6 +235,7 @@ export { DKG_CHUNK_INDEX, DKG_CHUNK_VALUE, parseRdfLiteralTerm, + rdfLiteralTerm, normalizeLargeRdfLiteralsForBlazegraph, type ParsedRdfLiteralTerm, type RdfTextLiteralRewrite, diff --git a/packages/core/src/rdf-literal-codec.ts b/packages/core/src/rdf-literal-codec.ts new file mode 100644 index 000000000..f3580cbdf --- /dev/null +++ b/packages/core/src/rdf-literal-codec.ts @@ -0,0 +1,108 @@ +export interface ParsedRdfLiteralTerm { + readonly lexical: string; + readonly suffix: string; + readonly language?: string; + readonly datatype?: string; +} + +export function parseRdfLiteralTerm(term: string): ParsedRdfLiteralTerm | null { + if (!term.startsWith('"')) return null; + let escaped = false; + for (let i = 1; i < term.length; i++) { + const ch = term[i]!; + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (isRawLiteralControlCharacter(ch)) return null; + if (ch !== '"') continue; + + try { + const body = term.slice(1, i); + const suffix = term.slice(i + 1); + const metadata = parseLiteralSuffix(suffix); + return { + lexical: decodeRdfLiteralBody(body), + suffix, + ...metadata, + }; + } catch { + return null; + } + } + return null; +} + +export function rdfLiteralTerm(lexical: string, suffix = ''): string { + return `${JSON.stringify(lexical)}${suffix}`; +} + +function isRawLiteralControlCharacter(value: string): boolean { + return value.charCodeAt(0) < 0x20; +} + +function parseLiteralSuffix(suffix: string): { language?: string; datatype?: string } { + if (suffix === '') return {}; + const language = /^@([A-Za-z]+(?:-[A-Za-z0-9]+)*)$/.exec(suffix); + if (language) return { language: language[1] }; + const datatype = /^\^\^<([^<>"{}|\\^`\x00-\x20>]+)>$/.exec(suffix); + if (datatype) return { datatype: datatype[1] }; + throw new Error(`Invalid RDF literal suffix: ${suffix.slice(0, 80)}`); +} + +function decodeRdfLiteralBody(body: string): string { + let out = ''; + for (let i = 0; i < body.length; i++) { + const ch = body[i]!; + if (ch !== '\\') { + out += ch; + continue; + } + i += 1; + if (i >= body.length) throw new Error('Invalid trailing RDF literal escape'); + const escaped = body[i]!; + switch (escaped) { + case 't': + out += '\t'; + break; + case 'b': + out += '\b'; + break; + case 'n': + out += '\n'; + break; + case 'r': + out += '\r'; + break; + case 'f': + out += '\f'; + break; + case '"': + case "'": + case '\\': + out += escaped; + break; + case 'u': { + const hex = body.slice(i + 1, i + 5); + if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error('Invalid RDF \\u escape'); + out += String.fromCharCode(parseInt(hex, 16)); + i += 4; + break; + } + case 'U': { + const hex = body.slice(i + 1, i + 9); + if (!/^[0-9a-fA-F]{8}$/.test(hex)) throw new Error('Invalid RDF \\U escape'); + out += String.fromCodePoint(parseInt(hex, 16)); + i += 8; + break; + } + default: + throw new Error(`Invalid RDF literal escape: \\${escaped}`); + } + } + return out; +} diff --git a/packages/core/src/rdf-text-literal-normalization.ts b/packages/core/src/rdf-text-literal-normalization.ts index 183e3781c..6bf502a45 100644 --- a/packages/core/src/rdf-text-literal-normalization.ts +++ b/packages/core/src/rdf-text-literal-normalization.ts @@ -8,6 +8,11 @@ import { rdfLiteralTermMutf8ByteLength, type QuadLiteralLike, } from './rdf-literal-size.js'; +import { + parseRdfLiteralTerm, + rdfLiteralTerm, + type ParsedRdfLiteralTerm, +} from './rdf-literal-codec.js'; import { isSafeIri } from './sparql-safe.js'; export const RDF_TYPE_IRI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; @@ -34,13 +39,6 @@ export const DKG_TEXT_DATATYPE = 'http://dkg.io/ontology/textDatatype'; export const DKG_CHUNK_INDEX = 'http://dkg.io/ontology/chunkIndex'; export const DKG_CHUNK_VALUE = 'http://dkg.io/ontology/chunkValue'; -export interface ParsedRdfLiteralTerm { - readonly lexical: string; - readonly suffix: string; - readonly language?: string; - readonly datatype?: string; -} - export interface RdfTextLiteralRewrite { readonly subject: string; readonly predicate: string; @@ -65,37 +63,6 @@ export interface RdfLiteralNormalizationOptions { readonly label?: string; } -export function parseRdfLiteralTerm(term: string): ParsedRdfLiteralTerm | null { - if (!term.startsWith('"')) return null; - let escaped = false; - for (let i = 1; i < term.length; i++) { - const ch = term[i]!; - if (escaped) { - escaped = false; - continue; - } - if (ch === '\\') { - escaped = true; - continue; - } - if (ch !== '"') continue; - - try { - const body = term.slice(1, i); - const suffix = term.slice(i + 1); - const metadata = parseLiteralSuffix(suffix); - return { - lexical: decodeRdfLiteralBody(body), - suffix, - ...metadata, - }; - } catch { - return null; - } - } - return null; -} - export function normalizeLargeRdfLiteralsForBlazegraph( quads: readonly QuadLiteralLike[], options: RdfLiteralNormalizationOptions = {}, @@ -290,72 +257,6 @@ function serializedLiteralBodyMutf8ByteLength(value: string): number { return javaModifiedUtf8ByteLength(serialized.slice(1, -1)); } -function parseLiteralSuffix(suffix: string): { language?: string; datatype?: string } { - if (suffix === '') return {}; - const language = /^@([A-Za-z]+(?:-[A-Za-z0-9]+)*)$/.exec(suffix); - if (language) return { language: language[1] }; - const datatype = /^\^\^<([^<>"{}|\\^`\x00-\x20>]+)>$/.exec(suffix); - if (datatype) return { datatype: datatype[1] }; - throw new Error(`Invalid RDF literal suffix: ${suffix.slice(0, 80)}`); -} - -function decodeRdfLiteralBody(body: string): string { - let out = ''; - for (let i = 0; i < body.length; i++) { - const ch = body[i]!; - if (ch !== '\\') { - out += ch; - continue; - } - i += 1; - if (i >= body.length) throw new Error('Invalid trailing RDF literal escape'); - const escaped = body[i]!; - switch (escaped) { - case 't': - out += '\t'; - break; - case 'b': - out += '\b'; - break; - case 'n': - out += '\n'; - break; - case 'r': - out += '\r'; - break; - case 'f': - out += '\f'; - break; - case '"': - case "'": - case '\\': - out += escaped; - break; - case 'u': { - const hex = body.slice(i + 1, i + 5); - if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error('Invalid RDF \\u escape'); - out += String.fromCharCode(parseInt(hex, 16)); - i += 4; - break; - } - case 'U': { - const hex = body.slice(i + 1, i + 9); - if (!/^[0-9a-fA-F]{8}$/.test(hex)) throw new Error('Invalid RDF \\U escape'); - out += String.fromCodePoint(parseInt(hex, 16)); - i += 8; - break; - } - default: - throw new Error(`Invalid RDF literal escape: \\${escaped}`); - } - } - return out; -} - -function rdfLiteralTerm(lexical: string, suffix = ''): string { - return `${JSON.stringify(lexical)}${suffix}`; -} - function xsdInteger(value: number): string { return `"${value}"^^<${XSD_INTEGER_IRI}>`; } @@ -367,3 +268,9 @@ function sha256Hex(value: string): string { function utf8ByteLength(value: string): number { return new TextEncoder().encode(value).length; } + +export { + parseRdfLiteralTerm, + rdfLiteralTerm, + type ParsedRdfLiteralTerm, +} from './rdf-literal-codec.js'; diff --git a/packages/core/test/rdf-literal-size.test.ts b/packages/core/test/rdf-literal-size.test.ts index 6193dd2f9..a42f688f3 100644 --- a/packages/core/test/rdf-literal-size.test.ts +++ b/packages/core/test/rdf-literal-size.test.ts @@ -281,6 +281,47 @@ describe('rdf literal Java MUTF-8 sizing', () => { }); }); + it('rejects raw control characters in quoted RDF literal source', () => { + expect(parseRdfLiteralTerm('"bad\nbody"')).toBeNull(); + expect(parseRdfLiteralTerm('"bad\rbody"')).toBeNull(); + expect(parseRdfLiteralTerm(`"bad${String.fromCharCode(1)}body"`)).toBeNull(); + + const rawNewlineLiteral = `"${'x'.repeat(250)}\n${'y'.repeat(250)}"`; + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/raw-control', + predicate: 'http://schema.org/text', + object: rawNewlineLiteral, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200, chunkMaxBytes: 100 }), + ).toThrow(/Blazegraph-compatible safe limit/); + }); + + it('chunks escaped control sequences and reconstructs their lexical value', () => { + const escapedLiteral = `"${'line\\nbreak '.repeat(80)}"@en`; + expect(parseRdfLiteralTerm('"line\\nbreak"')?.lexical).toBe('line\nbreak'); + + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/escaped-controls', + predicate: 'http://schema.org/text', + object: escapedLiteral, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 300, chunkMaxBytes: 120 }); + + const reconstructed = reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/escaped-controls', + }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0]).toMatchObject({ + lexical: 'line\nbreak '.repeat(80), + language: 'en', + }); + }); + it('normalizes idempotently and keeps generated literals below the safe limit', () => { const oversized = JSON.stringify('x'.repeat(1_000)); const first = normalizeLargeRdfLiteralsForBlazegraph([ diff --git a/packages/publisher/src/auto-partition.ts b/packages/publisher/src/auto-partition.ts index e788a97ab..e2ae9b70b 100644 --- a/packages/publisher/src/auto-partition.ts +++ b/packages/publisher/src/auto-partition.ts @@ -1,5 +1,7 @@ import type { Quad } from '@origintrail-official/dkg-storage'; -import { skolemize, isSkolemizedUri, rootEntityFromSkolemized, isBlankNode } from './skolemize.js'; +import { isSkolemizedUri, rootEntityFromSkolemized, isBlankNode } from './skolemize.js'; + +const GENID_SEGMENT = '/.well-known/genid/'; /** * Skolemizes blank nodes under their parent entity and INDEXES the result by @@ -18,77 +20,103 @@ import { skolemize, isSkolemizedUri, rootEntityFromSkolemized, isBlankNode } fro * is just an index. Returns a Map of entity → Quad[]. */ export function skolemizeByEntity(quads: Quad[]): Map { - // Phase 1: Find root entities (non-blank, non-skolemized unique subjects) - const rootEntities = new Set(); - for (const q of quads) { - if (isSkolemizedUri(q.subject)) { - const root = rootEntityFromSkolemized(q.subject); - if (root) rootEntities.add(root); - } else if (!isBlankNode(q.subject)) { - rootEntities.add(q.subject); - } + const skolemized = skolemizeFlatQuads(quads); + const rootQuadsMap = new Map(); + for (const quad of skolemized) { + const root = rootForNonBlankSubject(quad.subject); + if (!root) continue; + const existing = rootQuadsMap.get(root); + if (existing) existing.push(quad); + else rootQuadsMap.set(root, [quad]); } + return rootQuadsMap; +} - // Phase 2: Skolemize blank nodes under their parent root entity. - // For each blank node, we need to determine which root entity it belongs to. - // Heuristic: a blank node belongs to the root entity that references it as an object. - const blankToRoot = new Map(); - for (const q of quads) { - const subjectRoot = isSkolemizedUri(q.subject) - ? rootEntityFromSkolemized(q.subject) - : rootEntities.has(q.subject) ? q.subject : null; - if (subjectRoot && isBlankNode(q.object)) { - blankToRoot.set(q.object, subjectRoot); +/** + * Losslessly skolemizes blank-node subjects/objects using the root inferred from + * the blank node itself, not from the currently visited edge. If a blank node is + * shared by multiple roots, every object reference points at the deterministic + * canonical generated IRI instead of manufacturing empty per-root generated nodes. + */ +export function skolemizeFlatQuads(quads: readonly Quad[]): Quad[] { + const blankToRoot = inferBlankNodeRoots(quads); + return quads.map((quad) => { + const subjectRoot = rootForQuadSubject(quad.subject, blankToRoot); + const objectRoot = isBlankNode(quad.object) + ? blankToRoot.get(quad.object) ?? subjectRoot + : undefined; + return { + subject: subjectRoot ? skolemizeTermForRoot(quad.subject, subjectRoot) : quad.subject, + predicate: quad.predicate, + object: objectRoot ? skolemizeTermForRoot(quad.object, objectRoot) : quad.object, + graph: quad.graph, + }; + }); +} + +function inferBlankNodeRoots(quads: readonly Quad[]): Map { + const blankRootCandidates = new Map>(); + for (const quad of quads) { + const root = rootForNonBlankSubject(quad.subject); + if (root && isBlankNode(quad.object)) { + addRootCandidate(blankRootCandidates, quad.object, root); } } - // Propagate: blank nodes referenced by other blank nodes let changed = true; while (changed) { changed = false; - for (const q of quads) { - if ( - isBlankNode(q.subject) && - blankToRoot.has(q.subject) && - isBlankNode(q.object) && - !blankToRoot.has(q.object) - ) { - blankToRoot.set(q.object, blankToRoot.get(q.subject)!); - changed = true; + for (const quad of quads) { + if (!isBlankNode(quad.subject) || !isBlankNode(quad.object)) continue; + const roots = blankRootCandidates.get(quad.subject); + if (!roots) continue; + for (const root of roots) { + if (addRootCandidate(blankRootCandidates, quad.object, root)) { + changed = true; + } } } } - // Skolemize per root entity - const skolemized: Quad[] = []; - const perRoot = new Map(); - for (const root of rootEntities) { - perRoot.set(root, []); + const blankToRoot = new Map(); + for (const [blankNode, roots] of blankRootCandidates) { + const canonicalRoot = [...roots].sort()[0]; + if (canonicalRoot) blankToRoot.set(blankNode, canonicalRoot); } + return blankToRoot; +} - // Collect which quads belong to which root, skolemizing as we go - const rootQuadsMap = new Map(); - for (const root of rootEntities) { - const rootQuads = quads.filter( - (q) => - q.subject === root || - (isBlankNode(q.subject) && blankToRoot.get(q.subject) === root), - ); - const sk = skolemize(root, rootQuads); - rootQuadsMap.set(root, sk); +function addRootCandidate( + candidates: Map>, + blankNode: string, + root: string, +): boolean { + const roots = candidates.get(blankNode); + if (roots) { + const size = roots.size; + roots.add(root); + return roots.size !== size; } + candidates.set(blankNode, new Set([root])); + return true; +} - // Also handle already-skolemized quads (no blank nodes) - for (const q of quads) { - if (isSkolemizedUri(q.subject)) { - const root = rootEntityFromSkolemized(q.subject); - if (root && rootQuadsMap.has(root)) { - rootQuadsMap.get(root)!.push(...skolemize(root, [q])); - } - } - } +function rootForQuadSubject( + subject: string, + blankToRoot: ReadonlyMap, +): string | undefined { + if (isBlankNode(subject)) return blankToRoot.get(subject); + return rootForNonBlankSubject(subject); +} - return rootQuadsMap; +function rootForNonBlankSubject(subject: string): string | undefined { + if (isBlankNode(subject)) return undefined; + if (isSkolemizedUri(subject)) return rootEntityFromSkolemized(subject) ?? undefined; + return subject; +} + +function skolemizeTermForRoot(term: string, root: string): string { + return isBlankNode(term) ? `${root}${GENID_SEGMENT}${term.slice(2)}` : term; } /** diff --git a/packages/publisher/src/public-write-normalization.ts b/packages/publisher/src/public-write-normalization.ts index 419eb5605..558932344 100644 --- a/packages/publisher/src/public-write-normalization.ts +++ b/packages/publisher/src/public-write-normalization.ts @@ -5,9 +5,7 @@ import { type RdfTextLiteralRewrite, } from '@origintrail-official/dkg-core'; import type { Quad } from '@origintrail-official/dkg-storage'; -import { isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; - -const GENID_SEGMENT = '/.well-known/genid/'; +import { skolemizeFlatQuads } from './auto-partition.js'; export interface PreparedPublicWriteQuads { readonly quads: Quad[]; @@ -25,7 +23,7 @@ export function preparePublicWriteQuads( const concreteQuads = quads.map(toPublicWriteQuad); const input = options.skolemize === false ? concreteQuads - : skolemizePublicWriteQuads(concreteQuads); + : skolemizeFlatQuads(concreteQuads); const result = normalizeLargeRdfLiteralsForBlazegraph(input, options); return { quads: result.quads.map(toPublicWriteQuad), @@ -33,58 +31,6 @@ export function preparePublicWriteQuads( }; } -function skolemizePublicWriteQuads(quads: readonly Quad[]): Quad[] { - const blankToRoot = inferBlankNodeRoots(quads); - return quads.map((quad) => { - const root = rootForQuad(quad, blankToRoot); - if (!root) return { ...quad }; - return { - subject: skolemizeTermForRoot(quad.subject, root), - predicate: quad.predicate, - object: skolemizeTermForRoot(quad.object, root), - graph: quad.graph, - }; - }); -} - -function inferBlankNodeRoots(quads: readonly Quad[]): Map { - const blankToRoot = new Map(); - for (const quad of quads) { - const root = rootForNonBlankSubject(quad.subject); - if (root && isBlankNode(quad.object) && !blankToRoot.has(quad.object)) { - blankToRoot.set(quad.object, root); - } - } - - let changed = true; - while (changed) { - changed = false; - for (const quad of quads) { - const root = isBlankNode(quad.subject) ? blankToRoot.get(quad.subject) : undefined; - if (root && isBlankNode(quad.object) && !blankToRoot.has(quad.object)) { - blankToRoot.set(quad.object, root); - changed = true; - } - } - } - return blankToRoot; -} - -function rootForQuad(quad: Quad, blankToRoot: ReadonlyMap): string | undefined { - if (isBlankNode(quad.subject)) return blankToRoot.get(quad.subject); - return rootForNonBlankSubject(quad.subject); -} - -function rootForNonBlankSubject(subject: string): string | undefined { - if (isBlankNode(subject)) return undefined; - if (isSkolemizedUri(subject)) return rootEntityFromSkolemized(subject) ?? undefined; - return subject; -} - -function skolemizeTermForRoot(term: string, root: string): string { - return isBlankNode(term) ? `${root}${GENID_SEGMENT}${term.slice(2)}` : term; -} - export function toPublicWriteQuad(quad: QuadLiteralLike): Quad { return { subject: quad.subject, diff --git a/packages/publisher/test/dkg-publisher-compat.test.ts b/packages/publisher/test/dkg-publisher-compat.test.ts index d99db3173..b5667d84f 100644 --- a/packages/publisher/test/dkg-publisher-compat.test.ts +++ b/packages/publisher/test/dkg-publisher-compat.test.ts @@ -356,6 +356,35 @@ describe('DKGPublisher compatibility aliases', () => { expect(prepared).toHaveLength(2); }); + it('uses a blank node inferred root when skolemizing shared blank-node object links', () => { + const rootA = 'http://example.org/root-a'; + const rootB = 'http://example.org/root-b'; + const shared = `${rootA}/.well-known/genid/shared`; + const wrongShared = `${rootB}/.well-known/genid/shared`; + + const prepared = preparePublicWriteQuads([ + q(rootA, 'http://schema.org/hasPart', '_:shared'), + q(rootB, 'http://schema.org/hasPart', '_:shared'), + q('_:shared', 'http://schema.org/name', '"Shared"'), + ]).quads; + + expect(prepared).toContainEqual({ + ...q(rootA, 'http://schema.org/hasPart', '_:shared'), + object: shared, + }); + expect(prepared).toContainEqual({ + ...q(rootB, 'http://schema.org/hasPart', '_:shared'), + object: shared, + }); + expect(prepared).toContainEqual({ + ...q('_:shared', 'http://schema.org/name', '"Shared"'), + subject: shared, + }); + expect(prepared.some((quad) => + quad.subject === wrongShared || quad.object === wrongShared + )).toBe(false); + }); + it('indexes already-skolemized subjects even when their root subject is absent', () => { const root = 'http://example.org/external'; const externalSkolemized = q( diff --git a/packages/publisher/test/pure-functions.test.ts b/packages/publisher/test/pure-functions.test.ts index 474c37e55..848fa1767 100644 --- a/packages/publisher/test/pure-functions.test.ts +++ b/packages/publisher/test/pure-functions.test.ts @@ -100,6 +100,34 @@ describe('autoPartition', () => { const kaQuads = result.get(ENTITY)!; expect(kaQuads.length).toBe(3); }); + + it('uses the inferred blank-node root when multiple roots link to one blank object', () => { + const rootA = 'http://example.org/root-a'; + const rootB = 'http://example.org/root-b'; + const shared = `${rootA}/.well-known/genid/shared`; + const wrongShared = `${rootB}/.well-known/genid/shared`; + const quads: Quad[] = [ + q(rootA, 'http://example.org/hasPart', '_:shared'), + q(rootB, 'http://example.org/mentions', '_:shared'), + q('_:shared', 'http://schema.org/name', '"Shared"'), + ]; + + const result = autoPartition(quads); + const reordered = autoPartition([...quads].reverse()); + const allQuads = [...result.values()].flat(); + const allReorderedQuads = [...reordered.values()].flat(); + + expect(result.get(rootA)).toEqual(expect.arrayContaining([ + q(rootA, 'http://example.org/hasPart', shared), + q(shared, 'http://schema.org/name', '"Shared"'), + ])); + expect(result.get(rootB)).toEqual(expect.arrayContaining([ + q(rootB, 'http://example.org/mentions', shared), + ])); + expect(allQuads.some((quad) => quad.subject === wrongShared || quad.object === wrongShared)).toBe(false); + expect(allReorderedQuads).toEqual(expect.arrayContaining(allQuads)); + expect(allReorderedQuads).toHaveLength(allQuads.length); + }); }); describe('merkle', () => { From eb581049a2a98e07de3f3fe4e4eeeb3f33dc970a Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 26 Jun 2026 00:54:39 +0200 Subject: [PATCH 11/12] Preserve CLI RDF object validation wording --- packages/cli/src/daemon/http-utils.ts | 4 ++-- packages/cli/test/http-literal-size-validation.test.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index ba76308ef..65f974db4 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -304,8 +304,8 @@ export function validateQuadObjectTerms( }); if (badIndex === -1) return null; const expected = allowBlankNodes - ? "a well-formed quoted literal term, blank node, or absolute IRI" - : "a well-formed quoted literal term or absolute IRI"; + ? "a quoted literal term, blank node, or absolute IRI" + : "a quoted literal term or absolute IRI"; return `Invalid "${label}[${badIndex}].object": RDF object must be ${expected}`; } diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index dae9fe14d..f374d3869 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -235,7 +235,7 @@ describe('HTTP RDF literal size validation', () => { expect(prepared.ok).toBe(false); if (!prepared.ok) { - expect(prepared.body.error).toContain('well-formed quoted literal term'); + expect(prepared.body.error).toContain('quoted literal term'); } } }); From fce9b1447b44d57764d551bd6c0b21455e789aec Mon Sep 17 00:00:00 2001 From: Jurij Skornik Date: Fri, 26 Jun 2026 02:48:37 +0200 Subject: [PATCH 12/12] Address public write review boundary cleanup --- packages/cli/src/daemon/http-utils.ts | 15 +- .../daemon/routes/knowledge-assets-import.ts | 4 +- .../cli/src/daemon/routes/knowledge-assets.ts | 6 +- packages/cli/src/daemon/routes/memory.ts | 18 +- .../test/http-literal-size-validation.test.ts | 12 +- .../cli/test/import-artifact-routes.test.ts | 22 +-- .../import-file-integration.part-01.test.ts | 22 +-- packages/core/test/helpers/chunked-text.ts | 171 ++++++++++++++++++ packages/core/test/rdf-literal-size.test.ts | 152 +--------------- packages/publisher/src/auto-partition.ts | 11 +- packages/publisher/src/index.ts | 2 +- packages/publisher/src/skolemize.ts | 14 +- .../test/dkg-publisher-compat.test.ts | 40 +--- 13 files changed, 230 insertions(+), 259 deletions(-) create mode 100644 packages/core/test/helpers/chunked-text.ts diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index 65f974db4..06253588e 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -232,7 +232,7 @@ export interface PreparedPublicWriteQuads { readonly rewrites: RdfTextLiteralRewrite[]; } -export interface PreparedValidatedPublicWriteQuads extends PreparedPublicWriteQuads { +export interface PreparedPublicWriteStorageQuads extends PreparedPublicWriteQuads { readonly totalQuads: number; } @@ -257,10 +257,17 @@ export function preparePublicWriteQuads( } } -export function prepareValidatedPublicWriteQuads( +/** + * Prepares already shape-checked route quads for public storage. This helper + * intentionally does not own route shape, subject/predicate IRI, blank-node + * subject, or graph-policy validation; those remain with the route parsers that + * know their input contract. It validates object terms because parsing/chunking + * consumes them, then returns explicit storage quads plus rewrite/count metadata. + */ +export function preparePublicWriteStorageQuads( label: string, quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, -): { ok: true; value: PreparedValidatedPublicWriteQuads } | { ok: false; body: Record } { +): { ok: true; value: PreparedPublicWriteStorageQuads } | { ok: false; body: Record } { const objectError = validateQuadObjectTerms(label, quads); if (objectError) return { ok: false, body: { error: objectError } }; const prepared = preparePublicWriteQuads(label, quads); @@ -487,7 +494,7 @@ export function parsePublishRequestBody( error: 'Missing or invalid "quads" (must be a non-empty quad array)', }; } - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); if (!normalizedQuads.ok) { return { ok: false, error: String(normalizedQuads.body.error ?? 'Oversized RDF literal'), body: normalizedQuads.body }; } diff --git a/packages/cli/src/daemon/routes/knowledge-assets-import.ts b/packages/cli/src/daemon/routes/knowledge-assets-import.ts index 5f3fc71ba..e8083ce33 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets-import.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets-import.ts @@ -43,7 +43,7 @@ import { normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, oversizedRdfLiteralResponseBody, - prepareValidatedPublicWriteQuads, + preparePublicWriteStorageQuads, SMALL_BODY_BYTES, MAX_UPLOAD_BYTES, type ImportFileExtractionPayload, @@ -639,7 +639,7 @@ export async function handleKaSemanticEnrichmentWrite(ctx: RequestContext): Prom generationMethod, semanticQuads, }); - const normalizedQuads = prepareValidatedPublicWriteQuads("semanticQuads", [...semanticQuads, ...provenanceQuads]); + const normalizedQuads = preparePublicWriteStorageQuads("semanticQuads", [...semanticQuads, ...provenanceQuads]); if (!normalizedQuads.ok) { return jsonResponse(res, 400, normalizedQuads.body); } diff --git a/packages/cli/src/daemon/routes/knowledge-assets.ts b/packages/cli/src/daemon/routes/knowledge-assets.ts index fc2a09944..edfb7982a 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets.ts @@ -40,7 +40,7 @@ import { respondIfReconcileUnavailable, respondIfChainRpcTransportError, sanitizeRpcMessage, - prepareValidatedPublicWriteQuads, + preparePublicWriteStorageQuads, normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, isNoFundedPublisherWalletLike, @@ -825,7 +825,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< if (!quads.every(isWritableQuad)) { return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); } - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); quadsToWrite = normalizedQuads.value.quads; } @@ -1144,7 +1144,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< } // Validate object terms and chunk oversized schema:text literals before // the write path sees these public-write quads. - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", parsed.quads); + const normalizedQuads = preparePublicWriteStorageQuads("quads", parsed.quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); // A bare write to a name that was never created used to fall through to // the legacy `/assertion/{addr}/{name}` graph and produce a KA that is diff --git a/packages/cli/src/daemon/routes/memory.ts b/packages/cli/src/daemon/routes/memory.ts index 0989d4951..dfc1147c5 100644 --- a/packages/cli/src/daemon/routes/memory.ts +++ b/packages/cli/src/daemon/routes/memory.ts @@ -64,7 +64,7 @@ import { loadOpWallets, } from '@origintrail-official/dkg-agent'; import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, assertSafeRdfTerm, sparqlIri, contextGraphSharedMemoryUri, sharedMemoryReadBothFilter, contextGraphAssertionUri, contextGraphMetaUri, escapeDkgRdfLiteral, escapeSparqlLiteral, PROTOCOL_SYNC } from '@origintrail-official/dkg-core'; -import { skolemizeByEntity, findReservedSubjectPrefix, isSkolemizedUri, type PublishOptions, type PublishResult } from '@origintrail-official/dkg-publisher'; +import { skolemizeByEntity, findReservedSubjectPrefix, isSkolemizedUri, SKOLEMIZED_BLANK_NODE_SEGMENT, type PublishOptions, type PublishResult } from '@origintrail-official/dkg-publisher'; import type { Quad } from '@origintrail-official/dkg-storage'; import { buildAutoRegisterFailureBody } from "./shared-assertion-helpers.js"; import { @@ -199,7 +199,7 @@ import { resolveNameToPeerId, isPublishQuad, isWritableQuad, - prepareValidatedPublicWriteQuads, + preparePublicWriteStorageQuads, oversizedRdfLiteralResponseBody, parsePublishRequestBody, jsonResponse, @@ -367,10 +367,8 @@ type PreSignedAuthorAttestation = { type SharedMemoryPublishSelection = "all" | { rootEntities: string[] }; const WORKSPACE_OWNER_PREDICATE = 'http://dkg.io/ontology/workspaceOwner'; -const SKOLEM_GENID_SEGMENT = '/.well-known/genid/'; - function subjectMatchesPublishRoot(subject: string, root: string): boolean { - return subject === root || (isSkolemizedUri(subject) && subject.startsWith(`${root}${SKOLEM_GENID_SEGMENT}`)); + return subject === root || (isSkolemizedUri(subject) && subject.startsWith(`${root}${SKOLEMIZED_BLANK_NODE_SEGMENT}`)); } // Exported for the OT-RFC-46 read-both regression test (the route mocks the @@ -402,7 +400,7 @@ export async function resolvePublishRootEntities( VALUES ?root { ${values} } ?s ?p ?o . FILTER(?p != <${WORKSPACE_OWNER_PREDICATE}>) - FILTER(?s = ?root || STRSTARTS(STR(?s), CONCAT(STR(?root), "${SKOLEM_GENID_SEGMENT}"))) + FILTER(?s = ?root || STRSTARTS(STR(?s), CONCAT(STR(?root), "${SKOLEMIZED_BLANK_NODE_SEGMENT}"))) } ${swmGraphScope} }`, @@ -669,7 +667,7 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { }; }); - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", normalized); + const normalizedQuads = preparePublicWriteStorageQuads("quads", normalized); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); normalized = normalizedQuads.value.quads; await agent.store.insert(normalized); @@ -1657,7 +1655,7 @@ WHERE { // Validate object terms and chunk oversized schema:text literals before // storage/share sees these public-write quads. { - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); quads = normalizedQuads.value.quads; } @@ -2259,7 +2257,7 @@ WHERE { // Validate object terms and chunk oversized schema:text literals before // storage/share sees these public-write quads. { - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); quads = normalizedQuads.value.quads; } @@ -2458,7 +2456,7 @@ WHERE { }); } - const normalizedQuads = prepareValidatedPublicWriteQuads("quads", quads); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); const quadsToWrite = normalizedQuads.value.quads; diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index f374d3869..3898860e1 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -8,7 +8,7 @@ import { buildImportFileResponse, parsePublishRequestBody, preparePublicWriteQuads, - prepareValidatedPublicWriteQuads, + preparePublicWriteStorageQuads, validateWritableQuadLiteralSizes, } from '../src/daemon/http-utils.js'; @@ -194,8 +194,8 @@ describe('HTTP RDF literal size validation', () => { } }); - it('prepares validated public-write quads with explicit normalized count', () => { - const prepared = prepareValidatedPublicWriteQuads('quads', [{ + it('prepares public-write storage quads with explicit normalized count', () => { + const prepared = preparePublicWriteStorageQuads('quads', [{ subject: 'http://example.org/s', predicate: 'http://schema.org/text', object: OVERSIZED_LITERAL, @@ -210,8 +210,8 @@ describe('HTTP RDF literal size validation', () => { } }); - it('returns structured object-term validation failures from the validated public-write helper', () => { - const prepared = prepareValidatedPublicWriteQuads('quads', [{ + it('returns structured object-term validation failures from the public-write storage helper', () => { + const prepared = preparePublicWriteStorageQuads('quads', [{ subject: 'http://example.org/s', predicate: 'http://schema.org/name', object: 'not-a-valid-rdf-object', @@ -226,7 +226,7 @@ describe('HTTP RDF literal size validation', () => { it('rejects small malformed quoted literal terms before storage routes see them', () => { for (const object of ['"bad\nliteral"', ' "ok"', '"ok"\n']) { - const prepared = prepareValidatedPublicWriteQuads('quads', [{ + const prepared = preparePublicWriteStorageQuads('quads', [{ subject: 'http://example.org/s', predicate: 'http://schema.org/name', object, diff --git a/packages/cli/test/import-artifact-routes.test.ts b/packages/cli/test/import-artifact-routes.test.ts index a12ac6009..20872866b 100644 --- a/packages/cli/test/import-artifact-routes.test.ts +++ b/packages/cli/test/import-artifact-routes.test.ts @@ -15,6 +15,7 @@ import { keccak256ContentHash, sharedMemoryReadBothFilter, } from '@origintrail-official/dkg-core'; +import { reconstructChunkedText } from '../../core/test/helpers/chunked-text.js'; import { FileStore } from '../src/file-store.js'; import type { ExtractionStatusRecord } from '../src/extraction-status.js'; import { handleKnowledgeAssetsRoutes } from '../src/daemon/routes/knowledge-assets.js'; @@ -24,27 +25,6 @@ const PROV = 'http://www.w3.org/ns/prov#'; type Quad = { subject: string; predicate: string; object: string; graph?: string }; -function reconstructChunkedText(quads: readonly Quad[], subject: string): string { - const bodySubject = quads.find((quad) => - quad.subject === subject && - quad.predicate === DKG_HAS_TEXT_BODY - )?.object; - if (!bodySubject) throw new Error(`Missing chunked text body for ${subject}`); - return quads - .filter((quad) => quad.subject === bodySubject && quad.predicate === DKG_HAS_TEXT_CHUNK) - .map((link) => { - const chunkQuads = quads.filter((quad) => quad.subject === link.object); - const indexTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_INDEX)?.object; - const valueTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_VALUE)?.object; - const index = Number(/^"(\d+)"/.exec(indexTerm ?? '')?.[1] ?? NaN); - if (!Number.isInteger(index) || !valueTerm) throw new Error(`Invalid chunk ${link.object}`); - return { index, value: JSON.parse(valueTerm) as string }; - }) - .sort((a, b) => a.index - b.index) - .map((chunk) => chunk.value) - .join(''); -} - function sha256Hash(bytes: Buffer): string { return `sha256:${createHash('sha256').update(bytes).digest('hex')}`; } diff --git a/packages/cli/test/import-file-integration.part-01.test.ts b/packages/cli/test/import-file-integration.part-01.test.ts index a93fe376e..70c04b35d 100644 --- a/packages/cli/test/import-file-integration.part-01.test.ts +++ b/packages/cli/test/import-file-integration.part-01.test.ts @@ -6,27 +6,7 @@ import { DKG_HAS_TEXT_BODY, DKG_HAS_TEXT_CHUNK, } from '@origintrail-official/dkg-core'; - -function reconstructChunkedText(quads: readonly CapturedQuad[], subject: string): string { - const bodySubject = quads.find((quad) => - quad.subject === subject && - quad.predicate === DKG_HAS_TEXT_BODY - )?.object; - if (!bodySubject) throw new Error(`Missing chunked text body for ${subject}`); - return quads - .filter((quad) => quad.subject === bodySubject && quad.predicate === DKG_HAS_TEXT_CHUNK) - .map((link) => { - const chunkQuads = quads.filter((quad) => quad.subject === link.object); - const indexTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_INDEX)?.object; - const valueTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_VALUE)?.object; - const index = Number(/^"(\d+)"/.exec(indexTerm ?? '')?.[1] ?? NaN); - if (!Number.isInteger(index) || !valueTerm) throw new Error(`Invalid chunk ${link.object}`); - return { index, value: JSON.parse(valueTerm) as string }; - }) - .sort((a, b) => a.index - b.index) - .map((chunk) => chunk.value) - .join(''); -} +import { reconstructChunkedText } from '../../core/test/helpers/chunked-text.js'; describe('import-file orchestration — happy paths', () => { diff --git a/packages/core/test/helpers/chunked-text.ts b/packages/core/test/helpers/chunked-text.ts new file mode 100644 index 000000000..bc943aa0f --- /dev/null +++ b/packages/core/test/helpers/chunked-text.ts @@ -0,0 +1,171 @@ +import { createHash } from 'node:crypto'; +import { + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, + DKG_TEXT_CHUNK_COUNT, + DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_DATATYPE, + DKG_TEXT_LANGUAGE, + DKG_TEXT_LITERAL_TERM_SHA256, + DKG_TEXT_SOURCE_PREDICATE, + parseRdfLiteralTerm, + rdfLiteralTerm, +} from '../../src/rdf-text-literal-normalization.js'; + +export interface ChunkedTextQuad { + readonly subject: string; + readonly predicate: string; + readonly object: string; + readonly graph?: string; +} + +export interface ReconstructedChunkedTextBody { + readonly subject: string; + readonly bodySubject: string; + readonly sourcePredicate: string; + readonly lexical: string; + readonly literalTerm: string; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; + readonly language?: string; + readonly datatype?: string; +} + +export function reconstructChunkedText( + quads: readonly ChunkedTextQuad[], + subject: string, +): string { + const reconstructed = reconstructChunkedTextBodies(quads, { subject }); + if (reconstructed.length === 0) throw new Error(`Missing chunked text body for ${subject}`); + return reconstructed[0]!.lexical; +} + +export function reconstructChunkedTextBodies( + quads: readonly ChunkedTextQuad[], + options: { subject?: string; bodySubject?: string; sourcePredicate?: string } = {}, +): ReconstructedChunkedTextBody[] { + const bySubject = indexQuadsBySubject(quads); + const bodyLinks = quads.filter((q) => + q.predicate === DKG_HAS_TEXT_BODY && + (!options.subject || q.subject === options.subject) && + (!options.bodySubject || q.object === options.bodySubject) + ); + const explicitBody = options.bodySubject && bodyLinks.length === 0 + ? [{ subject: findOwnerSubject(quads, options.bodySubject), object: options.bodySubject }] + : []; + const bodies = [...bodyLinks, ...explicitBody]; + const reconstructed: ReconstructedChunkedTextBody[] = []; + + for (const link of bodies) { + const currentBodySubject = link.object; + const bodyQuads = bySubject.get(currentBodySubject) ?? []; + const sourcePredicate = iriObject(bodyQuads, DKG_TEXT_SOURCE_PREDICATE); + if (!sourcePredicate) throw new Error(`Chunked text body ${currentBodySubject} is missing source predicate`); + if (options.sourcePredicate && sourcePredicate !== options.sourcePredicate) continue; + + const count = integerObject(bodyQuads, DKG_TEXT_CHUNK_COUNT); + if (count === undefined) throw new Error(`Chunked text body ${currentBodySubject} is missing chunk count`); + const lexicalSha256 = literalObject(bodyQuads, DKG_TEXT_CONTENT_SHA256); + if (!lexicalSha256) throw new Error(`Chunked text body ${currentBodySubject} is missing content hash`); + const literalTermSha256 = literalObject(bodyQuads, DKG_TEXT_LITERAL_TERM_SHA256); + if (!literalTermSha256) throw new Error(`Chunked text body ${currentBodySubject} is missing literal term hash`); + const language = literalObject(bodyQuads, DKG_TEXT_LANGUAGE); + const datatype = iriObject(bodyQuads, DKG_TEXT_DATATYPE); + const chunkSubjects = bodyQuads.filter((q) => q.predicate === DKG_HAS_TEXT_CHUNK).map((q) => q.object); + if (chunkSubjects.length !== count) { + throw new Error(`Chunked text body ${currentBodySubject} expected ${count} chunks but found ${chunkSubjects.length}`); + } + + const chunks = chunkSubjects.map((chunkSubject) => { + const chunkQuads = bySubject.get(chunkSubject) ?? []; + const index = integerObject(chunkQuads, DKG_CHUNK_INDEX); + if (index === undefined) throw new Error(`Chunk ${chunkSubject} is missing chunkIndex`); + const valueTerm = literalTermObject(chunkQuads, DKG_CHUNK_VALUE); + if (!valueTerm) throw new Error(`Chunk ${chunkSubject} is missing chunkValue`); + const parsed = parseRdfLiteralTerm(valueTerm); + if (!parsed) throw new Error(`Chunk ${chunkSubject} has invalid chunkValue`); + return { index, lexical: parsed.lexical }; + }).sort((a, b) => a.index - b.index); + + for (let i = 0; i < chunks.length; i++) { + if (chunks[i]!.index !== i) { + throw new Error(`Chunked text body ${currentBodySubject} has non-contiguous chunk index ${chunks[i]!.index}`); + } + } + + const lexical = chunks.map((chunk) => chunk.lexical).join(''); + const suffix = suffixFromMetadata(language, datatype); + const literalTerm = rdfLiteralTerm(lexical, suffix); + if (sha256Hex(lexical) !== lexicalSha256) { + throw new Error(`Chunked text body ${currentBodySubject} content hash mismatch`); + } + if (sha256Hex(literalTerm) !== literalTermSha256) { + throw new Error(`Chunked text body ${currentBodySubject} literal term hash mismatch`); + } + + reconstructed.push({ + subject: link.subject, + bodySubject: currentBodySubject, + sourcePredicate, + lexical, + literalTerm, + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + ...(language !== undefined ? { language } : {}), + ...(datatype !== undefined ? { datatype } : {}), + }); + } + + return reconstructed; +} + +function indexQuadsBySubject(quads: readonly ChunkedTextQuad[]): Map { + const map = new Map(); + for (const quad of quads) { + const list = map.get(quad.subject); + if (list) list.push(quad); + else map.set(quad.subject, [quad]); + } + return map; +} + +function findOwnerSubject(quads: readonly ChunkedTextQuad[], bodySubject: string): string { + return quads.find((q) => q.predicate === DKG_HAS_TEXT_BODY && q.object === bodySubject)?.subject ?? ''; +} + +function literalTermObject(quads: readonly ChunkedTextQuad[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object?.startsWith('"') ? object : undefined; +} + +function literalObject(quads: readonly ChunkedTextQuad[], predicate: string): string | undefined { + const term = literalTermObject(quads, predicate); + if (!term) return undefined; + const parsed = parseRdfLiteralTerm(term); + return parsed?.lexical; +} + +function iriObject(quads: readonly ChunkedTextQuad[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object && !object.startsWith('"') ? object : undefined; +} + +function integerObject(quads: readonly ChunkedTextQuad[], predicate: string): number | undefined { + const value = literalObject(quads, predicate); + if (value === undefined || !/^\d+$/.test(value)) return undefined; + return Number(value); +} + +function suffixFromMetadata(language?: string, datatype?: string): string { + if (language) return `@${language}`; + if (datatype) return `^^<${datatype}>`; + return ''; +} + +function sha256Hex(value: string): string { + return createHash('sha256').update(value, 'utf8').digest('hex'); +} diff --git a/packages/core/test/rdf-literal-size.test.ts b/packages/core/test/rdf-literal-size.test.ts index a42f688f3..4f675ee48 100644 --- a/packages/core/test/rdf-literal-size.test.ts +++ b/packages/core/test/rdf-literal-size.test.ts @@ -1,5 +1,5 @@ -import { createHash } from 'node:crypto'; import { describe, expect, it } from 'vitest'; +import { reconstructChunkedTextBodies } from './helpers/chunked-text.js'; import { DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, OVERSIZED_RDF_LITERAL_ERROR_CODE, @@ -7,168 +7,18 @@ import { assertRdfLiteralMutf8Safe, javaModifiedUtf8ByteLength, rdfLiteralTermMutf8ByteLength, - type QuadLiteralLike, } from '../src/rdf-literal-size.js'; import { - DKG_CHUNK_INDEX, DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, DKG_HAS_TEXT_CHUNK, - DKG_TEXT_CHUNK_COUNT, DKG_TEXT_CONTENT_SHA256, - DKG_TEXT_DATATYPE, - DKG_TEXT_LANGUAGE, DKG_TEXT_LITERAL_TERM_SHA256, - DKG_TEXT_SOURCE_PREDICATE, XSD_STRING_IRI, normalizeLargeRdfLiteralsForBlazegraph, parseRdfLiteralTerm, } from '../src/rdf-text-literal-normalization.js'; -interface ReconstructedChunkedTextBody { - readonly subject: string; - readonly bodySubject: string; - readonly sourcePredicate: string; - readonly lexical: string; - readonly literalTerm: string; - readonly chunkCount: number; - readonly lexicalSha256: string; - readonly literalTermSha256: string; - readonly language?: string; - readonly datatype?: string; -} - -function reconstructChunkedTextBodies( - quads: readonly QuadLiteralLike[], - options: { subject?: string; bodySubject?: string; sourcePredicate?: string } = {}, -): ReconstructedChunkedTextBody[] { - const bySubject = indexQuadsBySubject(quads); - const bodyLinks = quads.filter((q) => - q.predicate === DKG_HAS_TEXT_BODY && - (!options.subject || q.subject === options.subject) && - (!options.bodySubject || q.object === options.bodySubject) - ); - const explicitBody = options.bodySubject && bodyLinks.length === 0 - ? [{ subject: findOwnerSubject(quads, options.bodySubject), object: options.bodySubject }] - : []; - const bodies = [...bodyLinks, ...explicitBody]; - const reconstructed: ReconstructedChunkedTextBody[] = []; - - for (const link of bodies) { - const bodySubject = link.object; - const bodyQuads = bySubject.get(bodySubject) ?? []; - const sourcePredicate = iriObject(bodyQuads, DKG_TEXT_SOURCE_PREDICATE); - if (!sourcePredicate) throw new Error(`Chunked text body ${bodySubject} is missing source predicate`); - if (options.sourcePredicate && sourcePredicate !== options.sourcePredicate) continue; - - const count = integerObject(bodyQuads, DKG_TEXT_CHUNK_COUNT); - if (count === undefined) throw new Error(`Chunked text body ${bodySubject} is missing chunk count`); - const lexicalSha256 = literalObject(bodyQuads, DKG_TEXT_CONTENT_SHA256); - if (!lexicalSha256) throw new Error(`Chunked text body ${bodySubject} is missing content hash`); - const literalTermSha256 = literalObject(bodyQuads, DKG_TEXT_LITERAL_TERM_SHA256); - if (!literalTermSha256) throw new Error(`Chunked text body ${bodySubject} is missing literal term hash`); - const language = literalObject(bodyQuads, DKG_TEXT_LANGUAGE); - const datatype = iriObject(bodyQuads, DKG_TEXT_DATATYPE); - const chunkSubjects = bodyQuads.filter((q) => q.predicate === DKG_HAS_TEXT_CHUNK).map((q) => q.object); - if (chunkSubjects.length !== count) { - throw new Error(`Chunked text body ${bodySubject} expected ${count} chunks but found ${chunkSubjects.length}`); - } - - const chunks = chunkSubjects.map((chunkSubject) => { - const chunkQuads = bySubject.get(chunkSubject) ?? []; - const index = integerObject(chunkQuads, DKG_CHUNK_INDEX); - if (index === undefined) throw new Error(`Chunk ${chunkSubject} is missing chunkIndex`); - const valueTerm = literalTermObject(chunkQuads, DKG_CHUNK_VALUE); - if (!valueTerm) throw new Error(`Chunk ${chunkSubject} is missing chunkValue`); - const parsed = parseRdfLiteralTerm(valueTerm); - if (!parsed) throw new Error(`Chunk ${chunkSubject} has invalid chunkValue`); - return { index, lexical: parsed.lexical }; - }).sort((a, b) => a.index - b.index); - - for (let i = 0; i < chunks.length; i++) { - if (chunks[i]!.index !== i) { - throw new Error(`Chunked text body ${bodySubject} has non-contiguous chunk index ${chunks[i]!.index}`); - } - } - - const lexical = chunks.map((chunk) => chunk.lexical).join(''); - const suffix = suffixFromMetadata(language, datatype); - const literalTerm = rdfLiteralTerm(lexical, suffix); - if (sha256Hex(lexical) !== lexicalSha256) { - throw new Error(`Chunked text body ${bodySubject} content hash mismatch`); - } - if (sha256Hex(literalTerm) !== literalTermSha256) { - throw new Error(`Chunked text body ${bodySubject} literal term hash mismatch`); - } - - reconstructed.push({ - subject: link.subject, - bodySubject, - sourcePredicate, - lexical, - literalTerm, - chunkCount: chunks.length, - lexicalSha256, - literalTermSha256, - ...(language !== undefined ? { language } : {}), - ...(datatype !== undefined ? { datatype } : {}), - }); - } - - return reconstructed; -} - -function indexQuadsBySubject(quads: readonly QuadLiteralLike[]): Map { - const map = new Map(); - for (const quad of quads) { - const list = map.get(quad.subject); - if (list) list.push(quad); - else map.set(quad.subject, [quad]); - } - return map; -} - -function findOwnerSubject(quads: readonly QuadLiteralLike[], bodySubject: string): string { - return quads.find((q) => q.predicate === DKG_HAS_TEXT_BODY && q.object === bodySubject)?.subject ?? ''; -} - -function literalTermObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { - const object = quads.find((q) => q.predicate === predicate)?.object; - return object?.startsWith('"') ? object : undefined; -} - -function literalObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { - const term = literalTermObject(quads, predicate); - if (!term) return undefined; - const parsed = parseRdfLiteralTerm(term); - return parsed?.lexical; -} - -function iriObject(quads: readonly QuadLiteralLike[], predicate: string): string | undefined { - const object = quads.find((q) => q.predicate === predicate)?.object; - return object && !object.startsWith('"') ? object : undefined; -} - -function integerObject(quads: readonly QuadLiteralLike[], predicate: string): number | undefined { - const value = literalObject(quads, predicate); - if (value === undefined || !/^\d+$/.test(value)) return undefined; - return Number(value); -} - -function suffixFromMetadata(language?: string, datatype?: string): string { - if (language) return `@${language}`; - if (datatype) return `^^<${datatype}>`; - return ''; -} - -function rdfLiteralTerm(lexical: string, suffix = ''): string { - return `${JSON.stringify(lexical)}${suffix}`; -} - -function sha256Hex(value: string): string { - return createHash('sha256').update(value, 'utf8').digest('hex'); -} - describe('rdf literal Java MUTF-8 sizing', () => { it('counts Java Modified UTF-8 byte length by UTF-16 code unit', () => { expect(javaModifiedUtf8ByteLength('abc')).toBe(3); diff --git a/packages/publisher/src/auto-partition.ts b/packages/publisher/src/auto-partition.ts index e2ae9b70b..b8ce0a14e 100644 --- a/packages/publisher/src/auto-partition.ts +++ b/packages/publisher/src/auto-partition.ts @@ -1,7 +1,10 @@ import type { Quad } from '@origintrail-official/dkg-storage'; -import { isSkolemizedUri, rootEntityFromSkolemized, isBlankNode } from './skolemize.js'; - -const GENID_SEGMENT = '/.well-known/genid/'; +import { + isSkolemizedUri, + rootEntityFromSkolemized, + isBlankNode, + skolemizedBlankNodeIri, +} from './skolemize.js'; /** * Skolemizes blank nodes under their parent entity and INDEXES the result by @@ -116,7 +119,7 @@ function rootForNonBlankSubject(subject: string): string | undefined { } function skolemizeTermForRoot(term: string, root: string): string { - return isBlankNode(term) ? `${root}${GENID_SEGMENT}${term.slice(2)}` : term; + return isBlankNode(term) ? skolemizedBlankNodeIri(root, term) : term; } /** diff --git a/packages/publisher/src/index.ts b/packages/publisher/src/index.ts index bd1d4df7a..1c63cd7bf 100644 --- a/packages/publisher/src/index.ts +++ b/packages/publisher/src/index.ts @@ -1,5 +1,5 @@ export * from './publisher.js'; -export { skolemize, isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; +export { skolemize, skolemizedBlankNodeIri, SKOLEMIZED_BLANK_NODE_SEGMENT, isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; export { RESERVED_SUBJECT_PREFIXES, findReservedSubjectPrefix, isReservedSubject } from './reserved-subjects.js'; export { skolemizeByEntity, autoPartition } from './auto-partition.js'; export { diff --git a/packages/publisher/src/skolemize.ts b/packages/publisher/src/skolemize.ts index 7a61df671..3cc654154 100644 --- a/packages/publisher/src/skolemize.ts +++ b/packages/publisher/src/skolemize.ts @@ -1,6 +1,6 @@ import type { Quad } from '@origintrail-official/dkg-storage'; -const GENID_SEGMENT = '/.well-known/genid/'; +export const SKOLEMIZED_BLANK_NODE_SEGMENT = '/.well-known/genid/'; /** * Replaces blank node identifiers with deterministic URIs scoped under rootEntity. @@ -20,8 +20,7 @@ export function skolemize(rootEntity: string, quads: Quad[]): Quad[] { const mapping = new Map(); for (const bn of blankNodes) { - const label = bn.slice(2); // strip "_:" - mapping.set(bn, `${rootEntity}${GENID_SEGMENT}${label}`); + mapping.set(bn, skolemizedBlankNodeIri(rootEntity, bn)); } return quads.map((q) => ({ @@ -32,12 +31,17 @@ export function skolemize(rootEntity: string, quads: Quad[]): Quad[] { })); } +export function skolemizedBlankNodeIri(rootEntity: string, blankNode: string): string { + const label = blankNode.startsWith('_:') ? blankNode.slice(2) : blankNode; + return `${rootEntity}${SKOLEMIZED_BLANK_NODE_SEGMENT}${label}`; +} + export function isBlankNode(term: string): boolean { return term.startsWith('_:'); } export function isSkolemizedUri(uri: string): boolean { - return uri.includes(GENID_SEGMENT); + return uri.includes(SKOLEMIZED_BLANK_NODE_SEGMENT); } /** @@ -45,7 +49,7 @@ export function isSkolemizedUri(uri: string): boolean { * e.g., "did:dkg:agent:QmBot/.well-known/genid/o1" → "did:dkg:agent:QmBot" */ export function rootEntityFromSkolemized(uri: string): string | null { - const idx = uri.indexOf(GENID_SEGMENT); + const idx = uri.indexOf(SKOLEMIZED_BLANK_NODE_SEGMENT); if (idx === -1) return null; return uri.slice(0, idx); } diff --git a/packages/publisher/test/dkg-publisher-compat.test.ts b/packages/publisher/test/dkg-publisher-compat.test.ts index b5667d84f..162a80bfb 100644 --- a/packages/publisher/test/dkg-publisher-compat.test.ts +++ b/packages/publisher/test/dkg-publisher-compat.test.ts @@ -1,14 +1,13 @@ import { describe, expect, it } from 'vitest'; import { NoChainAdapter } from '@origintrail-official/dkg-chain'; import { - DKG_CHUNK_INDEX, DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY, - DKG_HAS_TEXT_CHUNK, TypedEventBus, generateEd25519Keypair, } from '@origintrail-official/dkg-core'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; +import { reconstructChunkedText } from '../../core/test/helpers/chunked-text.js'; import { DKGPublisher } from '../src/dkg-publisher.js'; import { skolemizeByEntity } from '../src/auto-partition.js'; import { preparePublicWriteQuads } from '../src/public-write-normalization.js'; @@ -22,27 +21,6 @@ function q(s: string, p: string, o: string): Quad { }; } -function reconstructPlainChunkedText(quads: readonly Quad[], subject: string): string { - const bodySubject = quads.find((quad) => - quad.subject === subject && - quad.predicate === DKG_HAS_TEXT_BODY - )?.object; - if (!bodySubject) throw new Error(`Missing text body for ${subject}`); - return quads - .filter((quad) => quad.subject === bodySubject && quad.predicate === DKG_HAS_TEXT_CHUNK) - .map((link) => { - const chunkQuads = quads.filter((quad) => quad.subject === link.object); - const indexTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_INDEX)?.object; - const valueTerm = chunkQuads.find((quad) => quad.predicate === DKG_CHUNK_VALUE)?.object; - const index = Number(/^"(\d+)"/.exec(indexTerm ?? '')?.[1] ?? NaN); - if (!Number.isInteger(index) || !valueTerm) throw new Error(`Invalid chunk ${link.object}`); - return { index, value: JSON.parse(valueTerm) as string }; - }) - .sort((a, b) => a.index - b.index) - .map((chunk) => chunk.value) - .join(''); -} - async function makePublisher(): Promise<{ publisher: DKGPublisher; store: OxigraphStore }> { const store = new OxigraphStore(); const publisher = new DKGPublisher({ @@ -89,7 +67,7 @@ describe('DKGPublisher compatibility aliases', () => { )).toBe(true); expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); - expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals on linked blank nodes before shared-memory writes', async () => { @@ -117,7 +95,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.predicate === 'http://schema.org/hasPart' && quad.object === child )).toBe(true); - expect(reconstructPlainChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); }); it('rejects oversized non-text RDF literals at shared-memory producer boundary', async () => { @@ -194,7 +172,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.predicate === 'http://schema.org/text' )).toBe(false); expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); - expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals during direct publish', async () => { @@ -218,7 +196,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.subject === root && quad.predicate === 'http://schema.org/text' )).toBe(false); - expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals on linked blank nodes during direct publish', async () => { @@ -251,7 +229,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.predicate === 'http://schema.org/hasPart' && quad.object === child )).toBe(true); - expect(reconstructPlainChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals during update', async () => { @@ -280,7 +258,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.subject === root && quad.predicate === 'http://schema.org/text' )).toBe(false); - expect(reconstructPlainChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals on linked blank nodes during update', async () => { @@ -318,7 +296,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.predicate === 'http://schema.org/hasPart' && quad.object === child )).toBe(true); - expect(reconstructPlainChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); }); it('chunks oversized schema:text literals during assertion write', async () => { @@ -337,7 +315,7 @@ describe('DKGPublisher compatibility aliases', () => { quad.subject === root && quad.predicate === 'http://schema.org/text' )).toBe(false); - expect(reconstructPlainChunkedText(quads, root)).toBe('x'.repeat(60_000)); + expect(reconstructChunkedText(quads, root)).toBe('x'.repeat(60_000)); }); it('preserves already-skolemized quads when normal roots are present', () => {