diff --git a/packages/agent/src/dkg-agent-publish.ts b/packages/agent/src/dkg-agent-publish.ts index 801994616..d28923768 100644 --- a/packages/agent/src/dkg-agent-publish.ts +++ b/packages/agent/src/dkg-agent-publish.ts @@ -109,6 +109,7 @@ import { resolveWorkspaceAgentRecipients, computeTripleHashV10 as computeTripleHash, computeFlatKCRootV10 as computeFlatKCRoot, skolemizeByEntity, isReservedSubject, canonicalPublishPayload, + preparePublicWriteQuads, generatedPrivateCatalogTripleKeys, resolveLiftWorkspaceSlice, validateLiftPublishPayload, @@ -426,6 +427,10 @@ function rejectOversizedRdfLiterals(quads: Quad[] | undefined, label: string): v assertQuadLiteralsMutf8Safe(quads, { label }); } +function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { + return preparePublicWriteQuads(quads, { label }).quads; +} + export class PublishMethods extends DKGAgentBase { async publishWorkspaceGossip(this: DKGAgent, contextGraphId: string, @@ -995,7 +1000,7 @@ export class PublishMethods extends DKGAgentBase { if (publicQuads.length === 0 && privateQuads.length === 0) { throw new InvalidContentError('Content must include at least one public or private payload'); } - rejectOversizedRdfLiterals(publicQuads, 'publishAsync.publicQuads'); + publicQuads = normalizePublicRdfLiterals(publicQuads, 'publishAsync.publicQuads'); rejectOversizedRdfLiterals(privateQuads, 'publishAsync.privateQuads'); const partitioned = partitionPublishAsyncQuads(publicQuads, privateQuads); @@ -1292,8 +1297,8 @@ export class PublishMethods extends DKGAgentBase { ): Promise { const ctx = opts?.operationCtx ?? createOperationContext('publish'); const onPhase = opts?.onPhase; + quads = normalizePublicRdfLiterals(quads, 'agent.publish.quads'); this.log.info(ctx, `Starting publish to context graph "${contextGraphId}" with ${quads.length} triples`); - rejectOversizedRdfLiterals(quads, 'agent.publish.quads'); rejectOversizedRdfLiterals(privateQuads, 'agent.publish.privateQuads'); const isSystem = contextGraphId === SYSTEM_CONTEXT_GRAPHS.AGENTS || contextGraphId === SYSTEM_CONTEXT_GRAPHS.ONTOLOGY; @@ -1528,8 +1533,8 @@ export class PublishMethods extends DKGAgentBase { ): Promise { const ctx = opts?.operationCtx ?? createOperationContext('update'); const onPhase = opts?.onPhase; + quads = normalizePublicRdfLiterals(quads, 'agent.update.quads'); this.log.info(ctx, `Starting update of kaId=${kaId} in context graph "${contextGraphId}" with ${quads.length} triples`); - rejectOversizedRdfLiterals(quads, 'agent.update.quads'); rejectOversizedRdfLiterals(privateQuads, 'agent.update.privateQuads'); // GH #842: thread the on-chain cgId so the publisher can promote the update // payload into the per-cgId partition the RS prover reads. Without it, @@ -2640,6 +2645,7 @@ export class PublishMethods extends DKGAgentBase { privateQuads?: Quad[]; }, ): Promise { + quads = normalizePublicRdfLiterals(quads, '_buildPrecomputedAttestationForSelection.quads'); if ( opts?.authorAgentAddress != null && opts?.preSignedAuthorAttestation != null diff --git a/packages/agent/test/publish-literal-size.test.ts b/packages/agent/test/publish-literal-size.test.ts index bc626dee6..b74b38fc9 100644 --- a/packages/agent/test/publish-literal-size.test.ts +++ b/packages/agent/test/publish-literal-size.test.ts @@ -1,6 +1,11 @@ import { describe, expect, it, vi } from 'vitest'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, +} from '@origintrail-official/dkg-core'; +import { canonicalPublishPayload, preparePublicWriteQuads } from '@origintrail-official/dkg-publisher'; import { PublishMethods } from '../src/dkg-agent-publish.js'; -import type { Quad } from '@origintrail-official/dkg-storage'; +import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; const OVERSIZED_TEXT_QUAD: Quad = { subject: 'http://example.org/root', @@ -9,6 +14,26 @@ const OVERSIZED_TEXT_QUAD: Quad = { graph: 'http://example.org/graph', }; +const OVERSIZED_NAME_QUAD: Quad = { + ...OVERSIZED_TEXT_QUAD, + predicate: 'http://schema.org/name', +}; + +const LINKED_BLANK_OVERSIZED_TEXT_QUADS: Quad[] = [ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:body', + graph: 'http://example.org/graph', + }, + { + subject: '_:body', + predicate: 'http://schema.org/text', + object: `"${'x'.repeat(60_000)}"`, + graph: 'http://example.org/graph', + }, +]; + describe('agent publish literal size validation', () => { it('rejects publishAsync private quads before workspace staging', async () => { const agentStub = { @@ -32,7 +57,48 @@ describe('agent publish literal size validation', () => { }); }); - it('rejects direct publish quads before chain or publisher work', async () => { + it('chunks publishAsync public schema:text before workspace staging', async () => { + const writeToWorkspace = vi.fn(async () => ({ + shareOperationId: 'swm-test', + message: new Uint8Array([1, 2, 3]), + })); + const agentStub = { + contextGraphExists: vi.fn(async () => true), + publisher: { + writeToWorkspace, + }, + peerId: 'peer-test', + store: new OxigraphStore(), + log: { + warn: vi.fn(), + }, + buildAsyncLiftSeal: vi.fn(async () => undefined), + }; + + const result = await PublishMethods.prototype.publishAsync.call( + agentStub as never, + 'computer-history', + { + publicQuads: [OVERSIZED_TEXT_QUAD], + privateQuads: [], + }, + { localOnly: true }, + ); + + expect(result.captureID).toEqual(expect.any(String)); + const stagedQuads = writeToWorkspace.mock.calls[0]?.[1] as Quad[]; + expect(stagedQuads.some((quad) => + quad.subject === OVERSIZED_TEXT_QUAD.subject && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(stagedQuads.some((quad) => + quad.subject === OVERSIZED_TEXT_QUAD.subject && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(stagedQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + }); + + it('rejects direct publish non-text quads before chain or publisher work', async () => { const agentStub = { log: { info: vi.fn() }, }; @@ -41,13 +107,100 @@ describe('agent publish literal size validation', () => { PublishMethods.prototype._publish.call( agentStub as never, 'computer-history', - [OVERSIZED_TEXT_QUAD], + [OVERSIZED_NAME_QUAD], ), ).rejects.toMatchObject({ code: 'OVERSIZED_RDF_LITERAL', actualBytes: 60_002, maxBytes: 60_000, - predicate: 'http://schema.org/text', + predicate: 'http://schema.org/name', }); }); + + it('builds selection precomputed attestations over chunked public text quads', async () => { + const authorAddress = '0x000000000000000000000000000000000000dEaD'; + const agentStub = { + chain: { + getEvmChainId: vi.fn(async () => 31337n), + getKnowledgeAssetsLifecycleAddress: vi.fn(async () => '0x000000000000000000000000000000000000c0de'), + }, + getContextGraphOnChainId: vi.fn(async () => 42n), + isPrivateContextGraph: vi.fn(async () => false), + publisher: { + publisherFallbackAuthorAddress: vi.fn(async () => authorAddress), + signAuthorAttestationAsPublisher: vi.fn(async () => ({ + r: new Uint8Array(32).fill(1), + vs: new Uint8Array(32).fill(2), + })), + }, + kaNumberAllocator: { + reconcile: vi.fn(), + markReconciled: vi.fn(), + allocate: vi.fn(() => ({ number: 7n })), + }, + reconciledKaAuthors: new Set(), + }; + + const attestation = + await PublishMethods.prototype._buildPrecomputedAttestationForSelection.call( + agentStub as never, + 'computer-history', + [OVERSIZED_TEXT_QUAD], + ); + + const chunkedQuads = preparePublicWriteQuads( + [OVERSIZED_TEXT_QUAD], + { label: 'test.expected' }, + ).quads; + const expectedChunkedRoot = canonicalPublishPayload(chunkedQuads, []).kcMerkleRoot; + const unchunkedRoot = canonicalPublishPayload([OVERSIZED_TEXT_QUAD], []).kcMerkleRoot; + + expect(Array.from(attestation?.expectedMerkleRoot ?? [])).toEqual(Array.from(expectedChunkedRoot)); + expect(Array.from(attestation?.expectedMerkleRoot ?? [])).not.toEqual(Array.from(unchunkedRoot)); + }); + + it('builds selection precomputed attestations over skolemized linked blank-node text chunks', async () => { + const authorAddress = '0x000000000000000000000000000000000000dEaD'; + const agentStub = { + chain: { + getEvmChainId: vi.fn(async () => 31337n), + getKnowledgeAssetsLifecycleAddress: vi.fn(async () => '0x000000000000000000000000000000000000c0de'), + }, + getContextGraphOnChainId: vi.fn(async () => 42n), + isPrivateContextGraph: vi.fn(async () => false), + publisher: { + publisherFallbackAuthorAddress: vi.fn(async () => authorAddress), + signAuthorAttestationAsPublisher: vi.fn(async () => ({ + r: new Uint8Array(32).fill(1), + vs: new Uint8Array(32).fill(2), + })), + }, + kaNumberAllocator: { + reconcile: vi.fn(), + markReconciled: vi.fn(), + allocate: vi.fn(() => ({ number: 7n })), + }, + reconciledKaAuthors: new Set(), + }; + + const attestation = + await PublishMethods.prototype._buildPrecomputedAttestationForSelection.call( + agentStub as never, + 'computer-history', + LINKED_BLANK_OVERSIZED_TEXT_QUADS, + ); + + const prepared = preparePublicWriteQuads(LINKED_BLANK_OVERSIZED_TEXT_QUADS, { label: 'test.expected' }).quads; + const child = 'http://example.org/root/.well-known/genid/body'; + expect(prepared.some((quad) => + quad.subject === child && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(prepared.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + const expectedChunkedRoot = canonicalPublishPayload(prepared, []).kcMerkleRoot; + expect(Array.from(attestation?.expectedMerkleRoot ?? [])).toEqual(Array.from(expectedChunkedRoot)); + }); }); diff --git a/packages/cli/src/daemon/http-utils.ts b/packages/cli/src/daemon/http-utils.ts index d77338ad1..06253588e 100644 --- a/packages/cli/src/daemon/http-utils.ts +++ b/packages/cli/src/daemon/http-utils.ts @@ -9,6 +9,8 @@ import { PayloadTooLargeError, assertQuadLiteralsMutf8Safe, isOversizedRdfLiteralError, + parseRdfLiteralTerm, + type RdfTextLiteralRewrite, validateContextGraphId, validateSubGraphName, isSafeIri, @@ -17,6 +19,7 @@ import { } from '@origintrail-official/dkg-core'; import { enrichEvmError, isChainRpcTransportError } from '@origintrail-official/dkg-chain'; import type { DKGAgent, ContextGraphWritePreflightProbe } from '@origintrail-official/dkg-agent'; +import { preparePublicWriteQuads as prepareCanonicalPublicWriteQuads } from '@origintrail-official/dkg-publisher'; import type { DkgConfig } from '../config.js'; import { enforceSignedRequestPostBody } from '../auth.js'; @@ -42,6 +45,11 @@ export interface PublishRequestBody { onChainContextGraphId?: string; } +export interface ParsedPublishRequest { + readonly body: PublishRequestBody; + readonly literalRewrites: RdfTextLiteralRewrite[]; +} + import type { CorsAllowlist } from './state.js'; export function isPayloadTooLargeError(err: unknown): err is PayloadTooLargeError { @@ -113,6 +121,8 @@ export function respondWithDaemonError(res: ServerResponse, err: any): void { (typeof err?.message === 'string' && err.message.includes('reserved namespace')) ) { jsonResponse(res, 400, { error: err.message }); + } else if (isOversizedRdfLiteralError(err)) { + jsonResponse(res, 400, oversizedRdfLiteralResponseBody(err)); } else if (isNoFundedPublisherWalletLike(err)) { // Funded-wallet selection found no operational wallet with gas + TRAC — a // user-actionable funding condition (4xx), not a server bug. @@ -217,6 +227,60 @@ export function validateWritableQuadLiteralSizes( } } +export interface PreparedPublicWriteQuads { + readonly quads: PublishQuad[]; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export interface PreparedPublicWriteStorageQuads extends PreparedPublicWriteQuads { + readonly totalQuads: number; +} + +/** + * Canonical HTTP-facing public-write preparation. Route handlers that must know + * the normalized quad count or return rewrite metadata call this once and pass + * its explicit storage quads onward; writer boundaries still run the same + * publisher-owned normalizer as an idempotent SDK/backstop path. + */ +export function preparePublicWriteQuads( + label: string, + quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, +): { ok: true; value: PreparedPublicWriteQuads } | { ok: false; body: Record } { + try { + const result = prepareCanonicalPublicWriteQuads(quads, { label }); + return { ok: true, value: { quads: result.quads, rewrites: result.rewrites } }; + } catch (err) { + if (isOversizedRdfLiteralError(err)) { + return { ok: false, body: oversizedRdfLiteralResponseBody(err) }; + } + throw err; + } +} + +/** + * Prepares already shape-checked route quads for public storage. This helper + * intentionally does not own route shape, subject/predicate IRI, blank-node + * subject, or graph-policy validation; those remain with the route parsers that + * know their input contract. It validates object terms because parsing/chunking + * consumes them, then returns explicit storage quads plus rewrite/count metadata. + */ +export function preparePublicWriteStorageQuads( + label: string, + quads: Array<{ subject: string; predicate: string; object: string; graph?: string }>, +): { ok: true; value: PreparedPublicWriteStorageQuads } | { ok: false; body: Record } { + const objectError = validateQuadObjectTerms(label, quads); + if (objectError) return { ok: false, body: { error: objectError } }; + const prepared = preparePublicWriteQuads(label, quads); + if (!prepared.ok) return prepared; + return { + ok: true, + value: { + ...prepared.value, + totalQuads: prepared.value.quads.length, + }, + }; +} + /** * GH #306 / #787 (follow-up) — validate each quad's `object` term is either a * quoted RDF literal (`"…"`) or an absolute IRI. Shared by the publish path AND @@ -226,18 +290,43 @@ export function validateWritableQuadLiteralSizes( * word `hello` or a number `123`) slips past them and crashes the RDF parser * with an uncaught "No scheme found in an absolute IRI" → HTTP 500 instead of an * actionable 400. Operates on any `{ object: string }` (PublishQuad or writable - * quad alike). + * quad alike). Blank-node object terms are valid only for public write paths that + * subsequently skolemize them; callers that persist quads without normalization + * should set `allowBlankNodes: false`. */ export function validateQuadObjectTerms( label: string, quads: ReadonlyArray<{ object: string }>, + options: { allowBlankNodes?: boolean } = {}, ): string | null { + const allowBlankNodes = options.allowBlankNodes ?? true; const badIndex = quads.findIndex((q) => { - const object = q.object.trim(); - return !object.startsWith('"') && !isSafeIri(object); + const object = q.object; + if (object.trim() !== object) return true; + if (object.startsWith('"')) return parseRdfLiteralTerm(object) === null; + return ( + !(allowBlankNodes && isSafeBlankNode(object)) && + !isSafeIri(object) + ); }); if (badIndex === -1) return null; - return `Invalid "${label}[${badIndex}].object": RDF object must be a quoted literal term or absolute IRI`; + const expected = allowBlankNodes + ? "a quoted literal term, blank node, or absolute IRI" + : "a quoted literal term or absolute IRI"; + return `Invalid "${label}[${badIndex}].object": RDF object must be ${expected}`; +} + +function validateNoBlankNodeSubjects( + label: string, + quads: ReadonlyArray<{ subject: string }>, +): string | null { + const badIndex = quads.findIndex((q) => q.subject.trim().startsWith("_:")); + if (badIndex === -1) return null; + return `Invalid "${label}[${badIndex}].subject": RDF subject must not be a blank node`; +} + +function isSafeBlankNode(term: string): boolean { + return /^_:[A-Za-z][A-Za-z0-9_-]*$/.test(term); } /** @@ -371,7 +460,7 @@ export function respondIfChainRpcTransportError( export function parsePublishRequestBody( body: string, -): { ok: true; value: PublishRequestBody } | { ok: false; error: string; body?: Record } { +): { ok: true; value: ParsedPublishRequest } | { ok: false; error: string; body?: Record } { let parsed: unknown; try { parsed = JSON.parse(body); @@ -405,12 +494,16 @@ export function parsePublishRequestBody( error: 'Missing or invalid "quads" (must be a non-empty quad array)', }; } - const quadObjectError = validateQuadObjectTerms("quads", quads); - if (quadObjectError) return { ok: false, error: quadObjectError }; - const quadSize = validateWritableQuadLiteralSizes("quads", quads); - if (!quadSize.ok) { - return { ok: false, error: String(quadSize.body.error ?? 'Oversized RDF literal'), body: quadSize.body }; + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); + if (!normalizedQuads.ok) { + return { ok: false, error: String(normalizedQuads.body.error ?? 'Oversized RDF literal'), body: normalizedQuads.body }; } + const publishQuads = normalizedQuads.value.quads.map((quad) => ({ + subject: quad.subject, + predicate: quad.predicate, + object: quad.object, + graph: quad.graph, + })); if ( privateQuads !== undefined && @@ -422,7 +515,11 @@ export function parsePublishRequestBody( }; } if (privateQuads !== undefined) { - const privateQuadObjectError = validateQuadObjectTerms("privateQuads", privateQuads); + const privateQuadSubjectError = validateNoBlankNodeSubjects("privateQuads", privateQuads); + if (privateQuadSubjectError) return { ok: false, error: privateQuadSubjectError }; + const privateQuadObjectError = validateQuadObjectTerms("privateQuads", privateQuads, { + allowBlankNodes: false, + }); if (privateQuadObjectError) return { ok: false, error: privateQuadObjectError }; const privateQuadSize = validateWritableQuadLiteralSizes("privateQuads", privateQuads); if (!privateQuadSize.ok) { @@ -500,13 +597,16 @@ export function parsePublishRequestBody( return { ok: true, value: { - contextGraphId, - quads, - privateQuads, - accessPolicy, - allowedPeers, - subGraphName: subGraphName as string | undefined, - onChainContextGraphId: normalizedOnChainContextGraphId, + body: { + contextGraphId, + quads: publishQuads, + privateQuads, + accessPolicy, + allowedPeers, + subGraphName: subGraphName as string | undefined, + onChainContextGraphId: normalizedOnChainContextGraphId, + }, + literalRewrites: normalizedQuads.value.rewrites, }, }; } diff --git a/packages/cli/src/daemon/routes/knowledge-assets-import.ts b/packages/cli/src/daemon/routes/knowledge-assets-import.ts index d31e4d03d..e8083ce33 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets-import.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets-import.ts @@ -25,6 +25,7 @@ import { validateAssertionName, PayloadTooLargeError, assertQuadLiteralsMutf8Safe, + normalizeLargeRdfLiteralsForBlazegraph, IMPORTED_ARTIFACT_MAX_PAGE_BYTES, isDkgContentHash, verifyDkgContentHash, @@ -42,6 +43,7 @@ import { normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, oversizedRdfLiteralResponseBody, + preparePublicWriteStorageQuads, SMALL_BODY_BYTES, MAX_UPLOAD_BYTES, type ImportFileExtractionPayload, @@ -637,7 +639,11 @@ export async function handleKaSemanticEnrichmentWrite(ctx: RequestContext): Prom generationMethod, semanticQuads, }); - const quads = [...semanticQuads, ...provenanceQuads]; + const normalizedQuads = preparePublicWriteStorageQuads("semanticQuads", [...semanticQuads, ...provenanceQuads]); + if (!normalizedQuads.ok) { + return jsonResponse(res, 400, normalizedQuads.body); + } + const quads = normalizedQuads.value.quads; const targetAssertionUri = contextGraphAssertionUri( artifact.contextGraphId, artifact.assertionAgentAddress, @@ -1840,9 +1846,22 @@ export async function handleKaImportFile(ctx: RequestContext, name: string): Pro }); } try { - assertQuadLiteralsMutf8Safe([...dataGraphQuads, ...metaQuads], { + const normalizedImportQuads = normalizeLargeRdfLiteralsForBlazegraph([...dataGraphQuads, ...metaQuads], { label: 'import-file.quads', }); + const normalizedGraphQuads = normalizedImportQuads.quads.map((q) => { + if (q.graph === undefined) { + throw new Error('import-file.quads normalization produced a quad without graph'); + } + return { + subject: q.subject, + predicate: q.predicate, + object: q.object, + graph: q.graph, + }; + }); + dataGraphQuads = normalizedGraphQuads.filter((q) => q.graph === assertionGraph); + metaQuads.splice(0, metaQuads.length, ...normalizedGraphQuads.filter((q) => q.graph === metaGraph)); } catch (err: any) { if (err?.code === "OVERSIZED_RDF_LITERAL") { const oversizedBody = oversizedRdfLiteralResponseBody(err); diff --git a/packages/cli/src/daemon/routes/knowledge-assets.ts b/packages/cli/src/daemon/routes/knowledge-assets.ts index 8962b2369..edfb7982a 100644 --- a/packages/cli/src/daemon/routes/knowledge-assets.ts +++ b/packages/cli/src/daemon/routes/knowledge-assets.ts @@ -37,11 +37,10 @@ import { validateRequiredContextGraphId, parsePublishRequestBody, isWritableQuad, - validateQuadObjectTerms, respondIfReconcileUnavailable, respondIfChainRpcTransportError, sanitizeRpcMessage, - validateWritableQuadLiteralSizes, + preparePublicWriteStorageQuads, normalizeContextGraphIdOrUri, resolveRequiredWriteContextGraphId, isNoFundedPublisherWalletLike, @@ -673,6 +672,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< const parsed = parsePublishRequestBody(rawBody); if (!parsed.ok) return jsonResponse(res, 400, parsed.body ?? { error: parsed.error }); const raw = JSON.parse(rawBody) as Record; + const { body: publishBody, literalRewrites } = parsed.value; const { contextGraphId, quads, @@ -681,7 +681,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< allowedPeers, subGraphName, onChainContextGraphId, - } = parsed.value; + } = publishBody; const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -736,6 +736,7 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< ...(chain?.blockNumber !== undefined ? { blockNumber: chain.blockNumber } : {}), ...(chain?.batchId !== undefined ? { batchId: String(chain.batchId) } : {}), ...(chain?.publisherAddress ? { publisherAddress: chain.publisherAddress } : {}), + ...(literalRewrites && literalRewrites.length > 0 ? { literalRewrites } : {}), ...(typeof pub?.contextGraphError === "string" ? { contextGraphError: pub.contextGraphError } : {}), ...(reason ? { error: reason } : {}), }); @@ -819,12 +820,14 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< // author attestation and reserves the on-chain identity, so it requires the // CG to be registered). OT-RFC-43 §10.5.5. const hasQuads = Array.isArray(quads) && quads.length > 0; + let quadsToWrite = quads; if (hasQuads) { if (!quads.every(isWritableQuad)) { return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quadsToWrite = normalizedQuads.value.quads; } const shouldFinalize = hasQuads && finalize !== false; // #1116 D5: the create ROUTE stays a primitive — create+write+seal, with @@ -897,9 +900,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< // explicit finalize:false leaves an editable WM draft and never touches // the chain (OT-RFC-43 §10.5.5). `also*` are opt-in transitions on top. if (hasQuads) { - await agent.assertion.write(resolvedContextGraphId, name, quads, { subGraphName, ...atomicAuthorLane }); - result.written = quads.length; - emitMemoryGraphChanged?.({ contextGraphId: resolvedContextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: quads.length } }); + await agent.assertion.write(resolvedContextGraphId, name, quadsToWrite, { subGraphName, ...atomicAuthorLane }); + result.written = quadsToWrite.length; + emitMemoryGraphChanged?.({ contextGraphId: resolvedContextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: quadsToWrite.length } }); } if (shouldFinalize) { const seal = await agent.assertion.finalize(resolvedContextGraphId, name, { @@ -1139,12 +1142,10 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< if (!parsed.quads.every(isWritableQuad)) { return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); } - // GH #306/#787 (follow-up) — reject objects that are neither a quoted - // literal nor an absolute IRI before they reach (and crash) the parser. - const wmObjErr = validateQuadObjectTerms("quads", parsed.quads); - if (wmObjErr) return jsonResponse(res, 400, { error: wmObjErr }); - const literalSize = validateWritableQuadLiteralSizes("quads", parsed.quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + // Validate object terms and chunk oversized schema:text literals before + // the write path sees these public-write quads. + const normalizedQuads = preparePublicWriteStorageQuads("quads", parsed.quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); // A bare write to a name that was never created used to fall through to // the legacy `/assertion/{addr}/{name}` graph and produce a KA that is // permanently 404 in the descriptor API (no `_meta` lifecycle record, @@ -1170,9 +1171,9 @@ export async function handleKnowledgeAssetsRoutes(ctx: RequestContext): Promise< ...writeAuthorLane, }); } - await agent.assertion.write(contextGraphId, name, parsed.quads, { subGraphName, ...writeAuthorLane }); - emitMemoryGraphChanged?.({ contextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: parsed.quads.length } }); - return jsonResponse(res, 200, { written: parsed.quads.length }); + await agent.assertion.write(contextGraphId, name, normalizedQuads.value.quads, { subGraphName, ...writeAuthorLane }); + emitMemoryGraphChanged?.({ contextGraphId, layers: ["wm"], subGraphName, operation: "assertion_written", source: "api", counts: { triples: normalizedQuads.value.quads.length } }); + return jsonResponse(res, 200, { written: normalizedQuads.value.quads.length }); } if (verb === "finalize") { const finalizeOptions = resolveFinalizeOptions(parsed, res, writePreflightCallerAgentAddress); diff --git a/packages/cli/src/daemon/routes/memory.ts b/packages/cli/src/daemon/routes/memory.ts index adeef15fc..dfc1147c5 100644 --- a/packages/cli/src/daemon/routes/memory.ts +++ b/packages/cli/src/daemon/routes/memory.ts @@ -64,7 +64,7 @@ import { loadOpWallets, } from '@origintrail-official/dkg-agent'; import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, assertSafeRdfTerm, sparqlIri, contextGraphSharedMemoryUri, sharedMemoryReadBothFilter, contextGraphAssertionUri, contextGraphMetaUri, escapeDkgRdfLiteral, escapeSparqlLiteral, PROTOCOL_SYNC } from '@origintrail-official/dkg-core'; -import { skolemizeByEntity, findReservedSubjectPrefix, isSkolemizedUri, type PublishOptions, type PublishResult } from '@origintrail-official/dkg-publisher'; +import { skolemizeByEntity, findReservedSubjectPrefix, isSkolemizedUri, SKOLEMIZED_BLANK_NODE_SEGMENT, type PublishOptions, type PublishResult } from '@origintrail-official/dkg-publisher'; import type { Quad } from '@origintrail-official/dkg-storage'; import { buildAutoRegisterFailureBody } from "./shared-assertion-helpers.js"; import { @@ -199,8 +199,7 @@ import { resolveNameToPeerId, isPublishQuad, isWritableQuad, - validateQuadObjectTerms, - validateWritableQuadLiteralSizes, + preparePublicWriteStorageQuads, oversizedRdfLiteralResponseBody, parsePublishRequestBody, jsonResponse, @@ -368,10 +367,8 @@ type PreSignedAuthorAttestation = { type SharedMemoryPublishSelection = "all" | { rootEntities: string[] }; const WORKSPACE_OWNER_PREDICATE = 'http://dkg.io/ontology/workspaceOwner'; -const SKOLEM_GENID_SEGMENT = '/.well-known/genid/'; - function subjectMatchesPublishRoot(subject: string, root: string): boolean { - return subject === root || (isSkolemizedUri(subject) && subject.startsWith(`${root}${SKOLEM_GENID_SEGMENT}`)); + return subject === root || (isSkolemizedUri(subject) && subject.startsWith(`${root}${SKOLEMIZED_BLANK_NODE_SEGMENT}`)); } // Exported for the OT-RFC-46 read-both regression test (the route mocks the @@ -403,7 +400,7 @@ export async function resolvePublishRootEntities( VALUES ?root { ${values} } ?s ?p ?o . FILTER(?p != <${WORKSPACE_OWNER_PREDICATE}>) - FILTER(?s = ?root || STRSTARTS(STR(?s), CONCAT(STR(?root), "${SKOLEM_GENID_SEGMENT}"))) + FILTER(?s = ?root || STRSTARTS(STR(?s), CONCAT(STR(?root), "${SKOLEMIZED_BLANK_NODE_SEGMENT}"))) } ${swmGraphScope} }`, @@ -639,7 +636,7 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { const graph = `did:dkg:context-graph:${resolvedContextGraphId}/meta/query-catalog`; try { assertSafeIri(graph); - const normalized = quads.map((quad: unknown, index: number) => { + let normalized = quads.map((quad: unknown, index: number) => { if (!quad || typeof quad !== "object" || Array.isArray(quad)) { throw new Error(`quads[${index}] must be an object`); } @@ -670,8 +667,9 @@ export async function handleMemoryRoutes(ctx: RequestContext): Promise { }; }); - const literalSize = validateWritableQuadLiteralSizes("quads", normalized); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = preparePublicWriteStorageQuads("quads", normalized); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + normalized = normalizedQuads.value.quads; await agent.store.insert(normalized); return jsonResponse(res, 200, { ok: true, @@ -1629,7 +1627,7 @@ WHERE { const body = await readBody(req); const parsed = safeParseJson(body, res); if (!parsed) return; - const { quads, subGraphName } = parsed; + let { quads, subGraphName } = parsed; const localOnly = parsed.localOnly === true; if ( parsed.localOnly !== undefined && @@ -1654,15 +1652,13 @@ WHERE { // of crashing the SWM write path with a TypeError (HTTP 500). if (!Array.isArray(quads) || !quads.every(isWritableQuad)) return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); - // GH #306/#787 (follow-up) — also reject objects that are neither a quoted - // literal nor an absolute IRI; otherwise they slip past the shape guard and - // crash the RDF parser ("No scheme found in an absolute IRI") with HTTP 500. + // Validate object terms and chunk oversized schema:text literals before + // storage/share sees these public-write quads. { - const objErr = validateQuadObjectTerms("quads", quads); - if (objErr) return jsonResponse(res, 400, { error: objErr }); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quads = normalizedQuads.value.quads; } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2251,22 +2247,20 @@ WHERE { const body = await readBody(req); const parsed = safeParseJson(body, res); if (!parsed) return; - const { quads, conditions, subGraphName } = parsed; + let { quads, conditions, subGraphName } = parsed; const contextGraphId = parsed.contextGraphId; if (!quads?.length) return jsonResponse(res, 400, { error: 'Missing "quads"' }); // GH #787 / #306 — reject string-shaped / malformed quads (4xx, not a 500 crash). if (!Array.isArray(quads) || !quads.every(isWritableQuad)) return jsonResponse(res, 400, { error: '"quads" must be an array of { subject, predicate, object } objects (graph optional); string-shaped quads are not accepted' }); - // GH #306/#787 (follow-up) — also reject objects that are neither a quoted - // literal nor an absolute IRI; otherwise they slip past the shape guard and - // crash the RDF parser ("No scheme found in an absolute IRI") with HTTP 500. + // Validate object terms and chunk oversized schema:text literals before + // storage/share sees these public-write quads. { - const objErr = validateQuadObjectTerms("quads", quads); - if (objErr) return jsonResponse(res, 400, { error: objErr }); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + quads = normalizedQuads.value.quads; } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); const resolvedContextGraphId = await resolveRequiredWriteContextGraphId( agent, contextGraphId, @@ -2462,14 +2456,15 @@ WHERE { }); } - const literalSize = validateWritableQuadLiteralSizes("quads", quads); - if (!literalSize.ok) return jsonResponse(res, 400, literalSize.body); + const normalizedQuads = preparePublicWriteStorageQuads("quads", quads); + if (!normalizedQuads.ok) return jsonResponse(res, 400, normalizedQuads.body); + const quadsToWrite = normalizedQuads.value.quads; // 5. Write to target layer try { if (targetLayer === 'swm') { // agent.share sets the graph field itself — pass quads with empty graph - const shareQuads = quads.map(({ subject, predicate, object }) => ({ subject, predicate, object, graph: '' })); + const shareQuads = quadsToWrite.map(({ subject, predicate, object }) => ({ subject, predicate, object, graph: '' })); const ctx = createOperationContext('share'); tracker.start(ctx, { contextGraphId: resolvedContextGraphId, details: { tripleCount: shareQuads.length, source: 'memory-turn', subGraphName } }); try { @@ -2487,7 +2482,7 @@ WHERE { throw err; } } else { - await agent.store.insert(quads); + await agent.store.insert(quadsToWrite); } } catch (err: any) { if (err?.code === "OVERSIZED_RDF_LITERAL") { @@ -2501,7 +2496,7 @@ WHERE { subGraphName, operation: "memory_turn_written", source: "memory-turn", - counts: { triples: quads.length }, + counts: { triples: quadsToWrite.length }, }); // 6. Generate embedding (best-effort, non-blocking for response) @@ -2532,7 +2527,7 @@ WHERE { graph: targetGraph, structuralTripleCount: extractResult.triples.length, semanticTripleCount: semanticTriples.length, - totalQuads: quads.length, + totalQuads: quadsToWrite.length, embeddingId, sessionUri: sessionUri ?? null, }); diff --git a/packages/cli/src/daemon/routes/shared-assertion-helpers.ts b/packages/cli/src/daemon/routes/shared-assertion-helpers.ts index 967ef1fe3..2a13f60d0 100644 --- a/packages/cli/src/daemon/routes/shared-assertion-helpers.ts +++ b/packages/cli/src/daemon/routes/shared-assertion-helpers.ts @@ -18,7 +18,6 @@ import { ImportedArtifactMetadataError, isDkgContentHash, resolveImportedArtifactMetadata, - assertRdfLiteralMutf8Safe, } from '@origintrail-official/dkg-core'; import { type PromoteJob, type PromoteJobState } from '@origintrail-official/dkg-publisher'; import { daemonState } from '../state.js'; @@ -325,11 +324,6 @@ export function normalizeSemanticQuads(raw: unknown): Array<{ subject: string; p } else { normalizedObject = rdfLiteral(object); } - assertRdfLiteralMutf8Safe(normalizedObject, { - label: `semanticQuads[${index}].object`, - subject, - predicate, - }); return { subject, predicate, object: normalizedObject }; }); } diff --git a/packages/cli/test/daemon-ka-transport.test.ts b/packages/cli/test/daemon-ka-transport.test.ts index 5ea3785ff..e2e250c28 100644 --- a/packages/cli/test/daemon-ka-transport.test.ts +++ b/packages/cli/test/daemon-ka-transport.test.ts @@ -8,6 +8,7 @@ // daemon / storage / native deps. import { describe, it, expect } from 'vitest'; import { ChainRpcTransportError } from '@origintrail-official/dkg-chain'; +import { DKG_CHUNK_VALUE, DKG_HAS_TEXT_BODY } from '@origintrail-official/dkg-core'; import { handleKnowledgeAssetsRoutes } from '../src/daemon/routes/knowledge-assets.js'; import type { RequestContext } from '../src/daemon/routes/context.js'; @@ -66,6 +67,43 @@ function revert() { describe('knowledge-assets publish routes — transport-status mapping (#1329)', () => { describe('POST /api/knowledge-assets/publish (direct explicit-quads mint)', () => { + it('returns literalRewrites after oversized schema:text normalization on success', async () => { + const root = 'http://example.org/direct-oversized'; + const oversized = `"${'x'.repeat(60_000)}"`; + let publishedQuads: any[] = []; + const agent = publishAgent({ + publish: async (_cg: string, quads: any[]) => { + publishedQuads = quads; + return { + status: 'confirmed', + kaId: '42', + kaManifest: [{ tokenId: '42', rootEntity: root }], + }; + }, + }); + + const { res, done } = runKaCtx('POST', '/api/knowledge-assets/publish', agent, { + contextGraphId: 'cg-1', + quads: [{ subject: root, predicate: 'http://schema.org/text', object: oversized, graph: '' }], + }); + + await done; + const body = JSON.parse(res.body); + + expect(res.statusCode).toBe(200); + expect(body.literalRewrites).toHaveLength(1); + expect(body.literalRewrites[0]).toMatchObject({ + subject: root, + predicate: 'http://schema.org/text', + graph: '', + originalMutf8Bytes: 60_002, + }); + expect(publishedQuads.some((quad) => quad.predicate === 'http://schema.org/text')).toBe(false); + expect(publishedQuads.some((quad) => quad.predicate === DKG_HAS_TEXT_BODY)).toBe(true); + expect(publishedQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(publishedQuads.every((quad) => quad.graph === '')).toBe(true); + }); + it('→ 503 on RPC_ENDPOINTS_EXHAUSTED, with a sanitized body (no URL/key leak)', async () => { const agent = publishAgent({ publish: async () => { throw exhaustion(); } }); const { res, done } = runKaCtx('POST', '/api/knowledge-assets/publish', agent, { contextGraphId: 'cg-1', quads: QUADS }); diff --git a/packages/cli/test/daemon-memory-turn-literal-size.test.ts b/packages/cli/test/daemon-memory-turn-literal-size.test.ts new file mode 100644 index 000000000..6f1fa7777 --- /dev/null +++ b/packages/cli/test/daemon-memory-turn-literal-size.test.ts @@ -0,0 +1,118 @@ +import { describe, expect, it, vi } from 'vitest'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, +} from '@origintrail-official/dkg-core'; +import { handleMemoryRoutes } from '../src/daemon/routes/memory.js'; +import type { RequestContext } from '../src/daemon/routes/context.js'; + +function fakeRes() { + const res: any = { statusCode: 0, body: '', headers: {} as Record, writableEnded: false }; + res.writeHead = (status: number, headers?: Record) => { + res.statusCode = status; + if (headers) Object.assign(res.headers, headers); + return res; + }; + res.setHeader = (key: string, value: string) => { + res.headers[key] = value; + }; + res.end = (body?: string) => { + res.body = body ?? ''; + res.writableEnded = true; + }; + return res; +} + +function fakeReq(body: unknown) { + return { + method: 'POST', + headers: {}, + __dkgPrebufferedBody: Buffer.from(JSON.stringify(body)), + } as any; +} + +const noopTracker = { + start() {}, + complete() {}, + fail() {}, + setCost() {}, + setTxHash() {}, + trackPhase: (_ctx: unknown, _phase: unknown, fn: () => unknown) => fn(), +}; + +const ACCEPT_PROBE = { + exists: true, + hasLocalContent: true, + declarationFound: true, + accessPolicy: 'public', + callerAuthorized: true, +}; + +describe('POST /api/memory/turn RDF literal normalization', () => { + it('chunks oversized schema:text emitted from markdown frontmatter before WM insert', async () => { + const inserted: Array<{ subject: string; predicate: string; object: string; graph: string }> = []; + const events: unknown[] = []; + const agent = { + peerId: 'peer-memory-turn', + probeContextGraphWritePreflight: vi.fn(async () => ACCEPT_PROBE), + store: { + insert: vi.fn(async (quads: typeof inserted) => { + inserted.push(...quads); + }), + }, + }; + const res = fakeRes(); + const url = new URL('http://127.0.0.1/api/memory/turn'); + const largeBody = 'computer history '.repeat(4_000); + const markdown = [ + '---', + 'id: http://example.org/turn/frontmatter-text', + 'text: |-', + ` ${largeBody}`, + '---', + '# Conversation turn', + ].join('\n'); + + await handleMemoryRoutes({ + req: fakeReq({ + contextGraphId: 'memory-turn-cg', + layer: 'wm', + markdown, + }), + res, + agent, + tracker: noopTracker, + config: {}, + fileStore: { + put: vi.fn(async (bytes: Buffer, contentType: string) => ({ + keccak256: 'abc123', + size: bytes.length, + contentType, + })), + }, + vectorStore: { insert: vi.fn() }, + embeddingProvider: null, + path: url.pathname, + url, + requestAgentAddress: `0x${'12'.repeat(20)}`, + emitMemoryGraphChanged: (event: unknown) => { + events.push(event); + }, + } as unknown as RequestContext); + + expect(res.statusCode, res.body).toBe(200); + const response = JSON.parse(res.body); + expect(agent.store.insert).toHaveBeenCalledTimes(1); + expect(inserted.some((quad) => + quad.subject === response.turnUri && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(inserted.some((quad) => + quad.subject === response.turnUri && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(inserted.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(response.totalQuads).toBe(inserted.length); + expect(events).toHaveLength(1); + }); +}); diff --git a/packages/cli/test/http-literal-size-validation.test.ts b/packages/cli/test/http-literal-size-validation.test.ts index 190daf3f0..3898860e1 100644 --- a/packages/cli/test/http-literal-size-validation.test.ts +++ b/packages/cli/test/http-literal-size-validation.test.ts @@ -1,14 +1,21 @@ import { describe, expect, it } from 'vitest'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + normalizeLargeRdfLiteralsForBlazegraph, +} from '@origintrail-official/dkg-core'; import { buildImportFileResponse, parsePublishRequestBody, + preparePublicWriteQuads, + preparePublicWriteStorageQuads, validateWritableQuadLiteralSizes, } from '../src/daemon/http-utils.js'; const OVERSIZED_LITERAL = `"${'x'.repeat(60_000)}"`; describe('HTTP RDF literal size validation', () => { - it('returns structured parse failure for oversized direct publish literals', () => { + it('normalizes oversized direct publish schema:text literals', () => { const parsed = parsePublishRequestBody(JSON.stringify({ contextGraphId: 'literal-size-cg', quads: [{ @@ -19,6 +26,90 @@ describe('HTTP RDF literal size validation', () => { }], })); + expect(parsed.ok).toBe(true); + if (parsed.ok) { + expect(parsed.value.literalRewrites).toHaveLength(1); + expect(parsed.value.literalRewrites?.[0]).toMatchObject({ + subject: 'http://example.org/s', + predicate: 'http://schema.org/text', + originalMutf8Bytes: 60_002, + }); + expect(parsed.value.body.quads.some((quad) => + quad.subject === 'http://example.org/s' && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(parsed.value.body.quads.some((quad) => + quad.subject === 'http://example.org/s' && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(parsed.value.body.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + } + }); + + it('normalizes linked blank-node schema:text literals and derives rewrite metadata server-side', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + literalRewrites: [{ subject: 'client-supplied', predicate: 'ignored' }], + quads: [ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:body', + graph: 'http://example.org/g', + }, + { + subject: '_:body', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: 'http://example.org/g', + }, + ], + })); + + expect(parsed.ok).toBe(true); + if (parsed.ok) { + const child = 'http://example.org/root/.well-known/genid/body'; + expect(parsed.value.literalRewrites).toHaveLength(1); + expect(parsed.value.literalRewrites[0]).toMatchObject({ + subject: child, + predicate: 'http://schema.org/text', + originalMutf8Bytes: 60_002, + }); + expect(parsed.value.literalRewrites[0]?.subject).not.toBe('client-supplied'); + expect(parsed.value.body.quads.some((quad) => + quad.subject === 'http://example.org/root' && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(parsed.value.body.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(parsed.value.body.quads.some((quad) => + quad.subject === child && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(parsed.value.body.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + } + }); + + it('returns structured parse failure for oversized private publish literals', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + quads: [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + object: '"safe"', + graph: 'http://example.org/g', + }], + privateQuads: [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: 'http://example.org/g', + }], + })); + expect(parsed.ok).toBe(false); if (!parsed.ok) { expect(parsed.body).toMatchObject({ @@ -30,10 +121,66 @@ describe('HTTP RDF literal size validation', () => { } }); - it('returns structured validation failure for writable route quads', () => { + it('rejects blank-node private publish object terms because private quads are not skolemized', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + quads: [{ + subject: 'http://example.org/root', + predicate: 'http://schema.org/name', + object: '"safe"', + graph: 'http://example.org/g', + }], + privateQuads: [ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:secret', + graph: 'http://example.org/g', + }, + { + subject: 'http://example.org/root/secret', + predicate: 'http://schema.org/name', + object: '"hidden"', + graph: 'http://example.org/g', + }, + ], + })); + + expect(parsed.ok).toBe(false); + if (!parsed.ok) { + expect(parsed.error).toContain('Invalid "privateQuads[0].object"'); + expect(parsed.error).toContain('quoted literal term or absolute IRI'); + } + }); + + it('rejects blank-node private publish subjects even when public quads link to the same blank node', () => { + const parsed = parsePublishRequestBody(JSON.stringify({ + contextGraphId: 'literal-size-cg', + quads: [{ + subject: 'http://example.org/root', + predicate: 'http://schema.org/hasPart', + object: '_:secret', + graph: 'http://example.org/g', + }], + privateQuads: [{ + subject: '_:secret', + predicate: 'http://schema.org/name', + object: '"hidden"', + graph: 'http://example.org/g', + }], + })); + + expect(parsed.ok).toBe(false); + if (!parsed.ok) { + expect(parsed.error).toContain('Invalid "privateQuads[0].subject"'); + expect(parsed.error).toContain('must not be a blank node'); + } + }); + + it('returns structured validation failure for reject-only private/writable guards', () => { const result = validateWritableQuadLiteralSizes('quads', [{ subject: 'http://example.org/s', - predicate: 'http://schema.org/text', + predicate: 'http://schema.org/name', object: OVERSIZED_LITERAL, }]); @@ -47,6 +194,133 @@ describe('HTTP RDF literal size validation', () => { } }); + it('prepares public-write storage quads with explicit normalized count', () => { + const prepared = preparePublicWriteStorageQuads('quads', [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: 'http://example.org/g', + }]); + + expect(prepared.ok).toBe(true); + if (prepared.ok) { + expect(prepared.value.rewrites).toHaveLength(1); + expect(prepared.value.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(prepared.value.totalQuads).toBe(prepared.value.quads.length); + } + }); + + it('returns structured object-term validation failures from the public-write storage helper', () => { + const prepared = preparePublicWriteStorageQuads('quads', [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + object: 'not-a-valid-rdf-object', + graph: 'http://example.org/g', + }]); + + expect(prepared.ok).toBe(false); + if (!prepared.ok) { + expect(prepared.body.error).toContain('Invalid "quads[0].object"'); + } + }); + + it('rejects small malformed quoted literal terms before storage routes see them', () => { + for (const object of ['"bad\nliteral"', ' "ok"', '"ok"\n']) { + const prepared = preparePublicWriteStorageQuads('quads', [{ + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + object, + graph: 'http://example.org/g', + }]); + + expect(prepared.ok).toBe(false); + if (!prepared.ok) { + expect(prepared.body.error).toContain('quoted literal term'); + } + } + }); + + it('keeps import-file chunk quads in their source data or metadata graph', () => { + const dataGraph = 'did:dkg:context-graph:test/_working_memory/0xabc/1'; + const metaGraph = 'did:dkg:context-graph:test/_meta'; + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/import-root', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: dataGraph, + }, + { + subject: 'http://example.org/import-meta', + predicate: 'http://schema.org/name', + object: '"metadata"', + graph: metaGraph, + }, + ], { label: 'import-file.quads' }); + + const dataQuads = normalized.quads.filter((quad) => quad.graph === dataGraph); + const metaQuads = normalized.quads.filter((quad) => quad.graph === metaGraph); + expect(dataQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(dataQuads.some((quad) => quad.predicate === 'http://schema.org/text')).toBe(false); + expect(metaQuads).toEqual([{ + subject: 'http://example.org/import-meta', + predicate: 'http://schema.org/name', + object: '"metadata"', + graph: metaGraph, + }]); + }); + + it('prepares semantic enrichment quads without moving provenance quads', () => { + const graph = 'did:dkg:context-graph:test/_working_memory/0xabc/semantic'; + const prepared = preparePublicWriteQuads('semanticQuads', [ + { + subject: 'http://example.org/semantic', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph, + }, + { + subject: 'http://example.org/provenance', + predicate: 'http://dkg.io/ontology/generatedBy', + object: '"semantic-enrichment"', + graph, + }, + ]); + + expect(prepared.ok).toBe(true); + if (prepared.ok) { + expect(prepared.value.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(prepared.value.quads.some((quad) => + quad.subject === 'http://example.org/provenance' && + quad.predicate === 'http://dkg.io/ontology/generatedBy' + )).toBe(true); + expect(prepared.value.rewrites).toHaveLength(1); + } + }); + + it('preserves empty graph on normalized public-write quads', () => { + const prepared = preparePublicWriteQuads('quads', [{ + subject: 'http://example.org/empty-graph', + predicate: 'http://schema.org/text', + object: OVERSIZED_LITERAL, + graph: '', + }]); + + expect(prepared.ok).toBe(true); + if (prepared.ok) { + expect(prepared.value.quads.length).toBeGreaterThan(1); + expect(prepared.value.quads.every((quad) => + Object.prototype.hasOwnProperty.call(quad, 'graph') + )).toBe(true); + expect(prepared.value.quads.every((quad) => quad.graph === '')).toBe(true); + expect(prepared.value.rewrites[0]).toMatchObject({ + subject: 'http://example.org/empty-graph', + predicate: 'http://schema.org/text', + graph: '', + }); + } + }); + it('preserves oversized literal fields in import-file extraction responses', () => { expect(buildImportFileResponse({ assertionUri: 'http://example.org/assertion', diff --git a/packages/cli/test/import-artifact-routes.test.ts b/packages/cli/test/import-artifact-routes.test.ts index 6a86d8458..20872866b 100644 --- a/packages/cli/test/import-artifact-routes.test.ts +++ b/packages/cli/test/import-artifact-routes.test.ts @@ -5,12 +5,17 @@ import { mkdtemp, rm } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; import { + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, contextGraphAssertionUri, contextGraphMetaUri, contextGraphSharedMemoryUri, keccak256ContentHash, sharedMemoryReadBothFilter, } from '@origintrail-official/dkg-core'; +import { reconstructChunkedText } from '../../core/test/helpers/chunked-text.js'; import { FileStore } from '../src/file-store.js'; import type { ExtractionStatusRecord } from '../src/extraction-status.js'; import { handleKnowledgeAssetsRoutes } from '../src/daemon/routes/knowledge-assets.js'; @@ -18,7 +23,7 @@ import { handleKnowledgeAssetsRoutes } from '../src/daemon/routes/knowledge-asse const DKG = 'http://dkg.io/ontology/'; const PROV = 'http://www.w3.org/ns/prov#'; -type Quad = { subject: string; predicate: string; object: string }; +type Quad = { subject: string; predicate: string; object: string; graph?: string }; function sha256Hash(bytes: Buffer): string { return `sha256:${createHash('sha256').update(bytes).digest('hex')}`; @@ -111,6 +116,26 @@ describe('import artifact daemon routes', () => { return { status: res.status, body: await res.json() }; } + async function postMultipart( + path: string, + parts: Array< + | { name: string; value: string } + | { name: string; filename: string; contentType: string; value: string | Uint8Array } + >, + ) { + const form = new FormData(); + for (const part of parts) { + if ('filename' in part) { + const bytes = typeof part.value === 'string' ? part.value : new Uint8Array(part.value); + form.append(part.name, new Blob([bytes], { type: part.contentType }), part.filename); + } else { + form.append(part.name, part.value); + } + } + const res = await fetch(`${baseUrl}${path}`, { method: 'POST', body: form }); + return { status: res.status, body: await res.json() }; + } + async function get(path: string) { const res = await fetch(`${baseUrl}${path}`); return { status: res.status, body: await res.json() }; @@ -168,6 +193,7 @@ describe('import artifact daemon routes', () => { agentAddress?: string; subGraphName?: string; }> = []; + const insertedQuads: Quad[] = []; const queries: string[] = []; const queryQuads = args.queryQuads ?? [ { subject: 'urn:z', predicate: 'urn:p', object: 'urn:o' }, @@ -228,6 +254,14 @@ describe('import artifact daemon routes', () => { if (args.publisherDiscardError) throw args.publisherDiscardError; discards.push({ contextGraphId, name, agentAddress, subGraphName }); }, + async wmGraphUri( + contextGraphId: string, + agentAddress: string, + name: string, + subGraphName?: string, + ) { + return contextGraphAssertionUri(contextGraphId, agentAddress, name, subGraphName); + }, }, ...(args.onChainPolicy ? { @@ -252,8 +286,20 @@ describe('import artifact daemon routes', () => { async hasGraph() { return Boolean(args.targetGraphExists); }, + async insert(quads: Quad[]) { + insertedQuads.push(...quads); + }, + async deleteByPattern() { + return 0; + }, + async dropGraph() { + return undefined; + }, async query(sparql: string) { queries.push(sparql); + if (sparql.includes('CONSTRUCT')) { + return { type: 'quads', quads: [] }; + } if (sparql.includes('SELECT ?p ?o')) { return { type: 'bindings', bindings: [] }; } @@ -309,7 +355,7 @@ describe('import artifact daemon routes', () => { }, }, }; - return { agent, created, writes, discards, queries }; + return { agent, created, writes, discards, queries, insertedQuads }; } it('resolves and safely reads a completed Markdown import artifact by content hash', async () => { @@ -361,6 +407,61 @@ describe('import artifact daemon routes', () => { }); }); + it('wm/import-file chunks oversized schema:text through handleKaImportFile', async () => { + const contextGraphId = 'cg-import-file-large-text'; + const assertionName = 'large-text-import'; + const assertionUri = contextGraphAssertionUri(contextGraphId, 'did:dkg:agent:test', assertionName); + const { agent, insertedQuads } = makeAgent({ + contextGraphId, + assertionName, + assertionUri, + fileHash: `keccak256:${'0'.repeat(64)}`, + markdownHash: `keccak256:${'1'.repeat(64)}`, + markdownForm: `urn:dkg:file:keccak256:${'1'.repeat(64)}`, + }); + await startRoutes({ agent }); + + const largeText = 'x'.repeat(60_000); + const markdown = [ + '---', + 'id: route-large-text', + `text: "${largeText}"`, + '---', + '', + '# Large Text', + '', + ].join('\n'); + + const result = await postMultipart(`/api/knowledge-assets/${assertionName}/wm/import-file`, [ + { name: 'contextGraphId', value: contextGraphId }, + { name: 'file', filename: 'large.md', contentType: 'text/markdown', value: markdown }, + ]); + + expect(result.status).toBe(200); + expect(result.body.assertionUri).toBe(assertionUri); + const assertionGraph = contextGraphAssertionUri(contextGraphId, 'did:dkg:agent:test', assertionName); + const dataQuads = insertedQuads.filter((quad) => quad.graph === assertionGraph); + expect(dataQuads.some((quad) => + quad.subject === assertionUri && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(dataQuads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(dataQuads, assertionUri)).toBe(largeText); + + const chunkQuads = insertedQuads.filter((quad) => + quad.predicate === DKG_HAS_TEXT_BODY || + quad.predicate === DKG_HAS_TEXT_CHUNK || + quad.predicate === DKG_CHUNK_INDEX || + quad.predicate === DKG_CHUNK_VALUE + ); + expect(chunkQuads.length).toBeGreaterThan(0); + expect(chunkQuads.every((quad) => quad.graph === assertionGraph)).toBe(true); + expect(insertedQuads.some((quad) => + quad.graph === contextGraphMetaUri(contextGraphId) && + quad.predicate === DKG_CHUNK_VALUE + )).toBe(false); + }); + it('reads a generic source artifact as bounded base64 bytes', async () => { const entry = await fileStore.put(Buffer.from('source bytes'), 'text/markdown'); const contextGraphId = 'cg-generic-source-artifact'; @@ -1872,15 +1973,15 @@ describe('import artifact daemon routes', () => { expect(writes[0]!.name).toBe(assertionName); expect(writes[0]!.agentAddress).toBe('did:dkg:agent:test'); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Semantic topic"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceAssertion`, object: assertionUri }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceFileHash`, object: `"${entry.keccak256}"` }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownHash`, object: `"${entry.keccak256}"` }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownForm`, object: markdownForm }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"unit-test-model"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:reviewer' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:reviewer' }, - { subject: 'urn:doc:imported', predicate: `${PROV}wasDerivedFrom`, object: assertionUri }, + expect.objectContaining({ subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Semantic topic"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceAssertion`, object: assertionUri }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}sourceFileHash`, object: `"${entry.keccak256}"` }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownHash`, object: `"${entry.keccak256}"` }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}markdownForm`, object: markdownForm }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"unit-test-model"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:reviewer' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:reviewer' }), + expect.objectContaining({ subject: 'urn:doc:imported', predicate: `${PROV}wasDerivedFrom`, object: assertionUri }), ])); expect(events).toEqual(expect.arrayContaining([ expect.objectContaining({ operation: 'semantic_enrichment_written', layers: ['wm'] }), @@ -1890,6 +1991,41 @@ describe('import artifact daemon routes', () => { ])); }); + it('chunks oversized semantic enrichment schema:text before assertion write', async () => { + const entry = await fileStore.put(Buffer.from('# Imported\n'), 'text/markdown'); + const contextGraphId = 'cg-semantic-enrichment-large-text'; + const assertionName = 'imported-large-text'; + const assertionUri = contextGraphAssertionUri(contextGraphId, 'did:dkg:agent:test', assertionName); + const largeText = 'x'.repeat(60_000); + const { agent, writes } = makeAgent({ + contextGraphId, + assertionName, + assertionUri, + fileHash: entry.keccak256, + markdownHash: entry.keccak256, + markdownForm: `urn:dkg:file:${entry.keccak256}`, + }); + await startRoutes({ agent }); + + const result = await post('/api/knowledge-assets/semantic-enrichment/write', { + contextGraphId, + assertionUri, + semanticQuads: [ + { subject: 'urn:doc:semantic-large', predicate: 'http://schema.org/text', object: `"${largeText}"` }, + ], + }); + + expect(result.status).toBe(200); + expect(writes).toHaveLength(1); + const written = writes[0]!.quads; + expect(written.some((quad) => + quad.subject === 'urn:doc:semantic-large' && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(written.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(written, 'urn:doc:semantic-large')).toBe(largeText); + }); + it('rejects semantic enrichment of imported assertions owned by another agent', async () => { const entry = await fileStore.put(Buffer.from('# Imported\n'), 'text/markdown'); const contextGraphId = 'cg-semantic-enrichment-cross-agent'; @@ -1960,7 +2096,7 @@ describe('import artifact daemon routes', () => { expect(result.status).toBe(200); expect(writes).toHaveLength(1); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer Bot"' }, + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer Bot"' }), ])); expect(writes[0]!.quads.filter((quad) => quad.predicate === `${PROV}wasAttributedTo`)).toHaveLength(0); }); @@ -1997,9 +2133,9 @@ describe('import artifact daemon routes', () => { expect(result.status).toBe(200); expect(writes).toHaveLength(1); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Topic\\u0000vertical\\u000Bdel\\u007F"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"model\\u0000unit\\u007F"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer\\u000BBot"' }, + expect.objectContaining({ subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Topic\\u0000vertical\\u000Bdel\\u007F"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generationMethod`, object: '"model\\u0000unit\\u007F"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: '"Reviewer\\u000BBot"' }), ])); expect(writes[0]!.quads.filter((quad) => quad.predicate === `${PROV}wasAttributedTo`)).toHaveLength(0); }); @@ -2030,9 +2166,9 @@ describe('import artifact daemon routes', () => { expect(result.status).toBe(200); expect(writes).toHaveLength(1); expect(writes[0]!.quads).toEqual(expect.arrayContaining([ - { subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Fallback topic"' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:test' }, - { subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:test' }, + expect.objectContaining({ subject: 'urn:doc:imported', predicate: 'http://schema.org/about', object: '"Fallback topic"' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${DKG}generatedBy`, object: 'did:dkg:agent:test' }), + expect.objectContaining({ subject: expect.stringMatching(/^urn:dkg:semantic-enrichment:/), predicate: `${PROV}wasAttributedTo`, object: 'did:dkg:agent:test' }), ])); expect(writes[0]!.quads).not.toEqual(expect.arrayContaining([ expect.objectContaining({ object: 'did:dkg:agent:did:dkg:agent:test' }), diff --git a/packages/cli/test/import-file-integration.part-01.test.ts b/packages/cli/test/import-file-integration.part-01.test.ts index 45cd7bb30..70c04b35d 100644 --- a/packages/cli/test/import-file-integration.part-01.test.ts +++ b/packages/cli/test/import-file-integration.part-01.test.ts @@ -1,5 +1,12 @@ import { describe, it, expect, beforeEach, afterEach, mkdtemp, rm, readFile, tmpdir, join, existsSync, ExtractionPipelineRegistry, autoPartition, findReservedSubjectPrefix, FileStore, parseBoundary, extractFromMarkdown, contextGraphAssertionUri, contextGraphMetaUri, assertionLifecycleUri, ImportFileRouteError, makeMockAgent, getDataGraphQuads, BOUNDARY, CRLF, buildMultipart, type ExtractionPipeline, type ExtractionInput, type ConverterOutput, type ExtractionStatusRecord, type CapturedQuad, type MockAgent } from './import-file-test-helpers'; import { runImportFileOrchestration } from './import-file-orchestration.shared'; +import { + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, +} from '@origintrail-official/dkg-core'; +import { reconstructChunkedText } from '../../core/test/helpers/chunked-text.js'; describe('import-file orchestration — happy paths', () => { @@ -143,6 +150,52 @@ describe('import-file orchestration — happy paths', () => { expect(record.tripleCount).toBe(result.extraction.tripleCount); }); + it('text/markdown upload chunks oversized schema:text frontmatter literals in the assertion graph', async () => { + const largeText = 'x'.repeat(60_000); + const markdown = [ + '---', + 'id: large-literal-note', + `text: "${largeText}"`, + '---', + '', + '# Large Literal', + '', + ].join('\n'); + + const body = buildMultipart([ + { kind: 'text', name: 'contextGraphId', value: 'large-literal-cg' }, + { kind: 'file', name: 'file', filename: 'large.md', contentType: 'text/markdown', content: Buffer.from(markdown, 'utf-8') }, + ]); + + const result = await runImportFileOrchestration({ + agent, fileStore, extractionRegistry: registry, extractionStatus: status, + multipartBody: body, boundary: BOUNDARY, assertionName: 'large-literal', + }); + + expect(result.extraction.status).toBe('completed'); + const assertionGraph = contextGraphAssertionUri('large-literal-cg', agent.peerId, 'large-literal'); + const writtenTriples = getDataGraphQuads(agent, 'large-literal-cg', 'large-literal'); + expect(writtenTriples.some((quad) => + quad.subject === result.assertionUri && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(writtenTriples.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(writtenTriples, result.assertionUri)).toBe(largeText); + + const chunkQuads = agent.insertedQuads.filter((quad) => + quad.predicate === DKG_HAS_TEXT_BODY || + quad.predicate === DKG_HAS_TEXT_CHUNK || + quad.predicate === DKG_CHUNK_INDEX || + quad.predicate === DKG_CHUNK_VALUE + ); + expect(chunkQuads.length).toBeGreaterThan(0); + expect(chunkQuads.every((quad) => quad.graph === assertionGraph)).toBe(true); + expect(agent.insertedQuads.some((quad) => + quad.graph === contextGraphMetaUri('large-literal-cg') && + quad.predicate === DKG_CHUNK_VALUE + )).toBe(false); + }); + it('text/markdown upload uses filePart content type when contentType field is not provided', async () => { const body = buildMultipart([ diff --git a/packages/cli/test/import-file-orchestration.shared.ts b/packages/cli/test/import-file-orchestration.shared.ts index b4224e2ba..957131904 100644 --- a/packages/cli/test/import-file-orchestration.shared.ts +++ b/packages/cli/test/import-file-orchestration.shared.ts @@ -1,4 +1,5 @@ import { contextGraphAssertionUri, contextGraphMetaUri, assertionLifecycleUri, parseMultipart, findReservedSubjectPrefix, isSkolemizedUri, FileStore, ExtractionPipelineRegistry, extractFromMarkdown, randomUUID, buildImportFileResponse, normalizeDetectedContentType, ImportFileRouteError, type CapturedQuad, type MockAgent, type ImportFileResult, type ExtractionStatusRecord } from './import-file-test-helpers'; +import { normalizeLargeRdfLiteralsForBlazegraph } from '@origintrail-official/dkg-core'; import { inferContentTypeFromFilename } from '../src/daemon/manifest.js'; @@ -655,7 +656,20 @@ export async function runImportFileOrchestration(params: { metaCleanupSucceeded = true; await agent.store.dropGraph(assertionGraph); dataDropSucceeded = true; - await agent.store.insert([...dataGraphQuads, ...metaQuads]); + const normalizedImportQuads = normalizeLargeRdfLiteralsForBlazegraph([...dataGraphQuads, ...metaQuads], { + label: 'import-file.quads', + }).quads.map((q) => { + if (q.graph === undefined) { + throw new Error('import-file.quads normalization produced a quad without graph'); + } + return { + subject: q.subject, + predicate: q.predicate, + object: q.object, + graph: q.graph, + }; + }); + await agent.store.insert(normalizedImportQuads); } catch (writeErr: any) { const rollbackErrors: string[] = []; if (dataDropSucceeded && dataSnapshot.length > 0) { diff --git a/packages/cli/test/issue-306-787-write-quad-validation.test.ts b/packages/cli/test/issue-306-787-write-quad-validation.test.ts index c59003532..1ff9f9916 100644 --- a/packages/cli/test/issue-306-787-write-quad-validation.test.ts +++ b/packages/cli/test/issue-306-787-write-quad-validation.test.ts @@ -54,14 +54,13 @@ describe('GH #787 — POST /api/shared-memory/write quad-shape validation', () = expect(status, JSON.stringify(body)).toBe(200); }); - it('returns 400 for oversized RDF literals before SWM write', async () => { + it('chunks oversized schema:text literals before SWM write', async () => { const { status, body } = await postJson(daemon!, '/api/shared-memory/write', { contextGraphId: CG, quads: [{ subject: 'urn:wq:oversized-swm', predicate: 'http://schema.org/text', object: OVERSIZED_LITERAL }], }); - expect(status, JSON.stringify(body)).toBe(400); - expect(body.code).toBe('OVERSIZED_RDF_LITERAL'); - expect(body.actualBytes).toBeGreaterThan(60_000); + expect(status, JSON.stringify(body)).toBe(200); + expect(body.triplesWritten).toBeGreaterThan(1); }); }); @@ -86,16 +85,15 @@ describe('GH #306 — POST /api/knowledge-assets/{name}/wm/write quad-shape vali expect(status, JSON.stringify(body)).toBe(200); }); - it('returns 400 for oversized RDF literals before WM write', async () => { + it('chunks oversized schema:text literals before WM write', async () => { const created = await postJson(daemon!, '/api/knowledge-assets', { contextGraphId: CG, name: 'ka-306-oversized' }); expect(created.status).toBeLessThan(300); const { status, body } = await postJson(daemon!, '/api/knowledge-assets/ka-306-oversized/wm/write', { contextGraphId: CG, quads: [{ subject: 'urn:wq:oversized-wm', predicate: 'http://schema.org/text', object: OVERSIZED_LITERAL }], }); - expect(status, JSON.stringify(body)).toBe(400); - expect(body.code).toBe('OVERSIZED_RDF_LITERAL'); - expect(body.actualBytes).toBeGreaterThan(60_000); + expect(status, JSON.stringify(body)).toBe(200); + expect(body.written).toBeGreaterThan(1); }); }); diff --git a/packages/cli/test/knowledge-assets-route.test.ts b/packages/cli/test/knowledge-assets-route.test.ts index a2a84572f..edfdbda97 100644 --- a/packages/cli/test/knowledge-assets-route.test.ts +++ b/packages/cli/test/knowledge-assets-route.test.ts @@ -1089,7 +1089,7 @@ describe('/api/knowledge-assets routes (real daemon, real chain)', () => { quads: [{ subject: 'urn:s', predicate: 'urn:p', object: 'bare string', graph: '' }], }); expect(res.status).toBe(400); - expect(String(res.body.error)).toMatch(/RDF object must be a quoted literal term or absolute IRI/); + expect(String(res.body.error)).toMatch(/RDF object must be a quoted literal term, blank node, or absolute IRI/); }); it('rejects a malformed onChainContextGraphId with 400 before direct publish', async () => { diff --git a/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts b/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts index 4840e78f6..abcf1a7b1 100644 --- a/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts +++ b/packages/cli/test/no-funded-publisher-wallet-helpers.test.ts @@ -10,6 +10,7 @@ */ import { describe, expect, it } from 'vitest'; +import { OversizedRdfLiteralError } from '@origintrail-official/dkg-core'; import { isNoFundedPublisherWalletLike, noFundedPublisherWalletBody, @@ -91,6 +92,27 @@ describe('respondWithDaemonError', () => { expect(JSON.parse(res.body).code).toBeUndefined(); }); + it('maps oversized RDF literal errors to HTTP 400 with structured details', () => { + const res = mockRes(); + respondWithDaemonError(res, new OversizedRdfLiteralError({ + actualBytes: 60_002, + maxBytes: 60_000, + label: 'quads[0].object', + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + graph: 'http://example.org/g', + })); + expect(res.statusCode).toBe(400); + expect(JSON.parse(res.body)).toMatchObject({ + code: 'OVERSIZED_RDF_LITERAL', + actualBytes: 60_002, + limitBytes: 60_000, + subject: 'http://example.org/s', + predicate: 'http://schema.org/name', + graph: 'http://example.org/g', + }); + }); + it('is a no-op once the response has already been sent', () => { const res = mockRes(); res.writableEnded = true; diff --git a/packages/cli/vitest.unit.config.ts b/packages/cli/vitest.unit.config.ts index 4426b7b51..f10898288 100644 --- a/packages/cli/vitest.unit.config.ts +++ b/packages/cli/vitest.unit.config.ts @@ -65,6 +65,9 @@ export default defineConfig({ // #761 — context graph write-target validation (from main). 'test/context-graph-write-path-validation.test.ts', 'test/http-literal-size-validation.test.ts', + 'test/daemon-memory-turn-literal-size.test.ts', + 'test/no-funded-publisher-wallet-helpers.test.ts', + 'test/daemon-ka-transport.test.ts', 'test/epcis-route-readiness.test.ts', // Notifications-pane redesign (A3) — assertion_activity emitter // helper. Pure logic + a tmp SQLite DashboardDB, no hardhat. diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 03cbb02b5..94728bd2b 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -214,6 +214,34 @@ export { type RdfLiteralSizeContext, type QuadLiteralLike, } from './rdf-literal-size.js'; +export { + RDF_TYPE_IRI, + XSD_INTEGER_IRI, + XSD_STRING_IRI, + SCHEMA_TEXT_PREDICATES, + DKG_TEXT_BODY_CLASS, + DKG_TEXT_CHUNK_CLASS, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, + DKG_TEXT_SOURCE_PREDICATE, + DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_LITERAL_TERM_SHA256, + DKG_TEXT_LITERAL_MUTF8_BYTES, + DKG_TEXT_UTF8_BYTES, + DKG_TEXT_CHUNK_COUNT, + DKG_TEXT_CHUNK_LIMIT, + DKG_TEXT_LANGUAGE, + DKG_TEXT_DATATYPE, + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + parseRdfLiteralTerm, + rdfLiteralTerm, + normalizeLargeRdfLiteralsForBlazegraph, + type ParsedRdfLiteralTerm, + type RdfTextLiteralRewrite, + type RdfLiteralNormalizationResult, + type RdfLiteralNormalizationOptions, +} from './rdf-text-literal-normalization.js'; export { DKGError, DKGUserError, diff --git a/packages/core/src/rdf-literal-codec.ts b/packages/core/src/rdf-literal-codec.ts new file mode 100644 index 000000000..f3580cbdf --- /dev/null +++ b/packages/core/src/rdf-literal-codec.ts @@ -0,0 +1,108 @@ +export interface ParsedRdfLiteralTerm { + readonly lexical: string; + readonly suffix: string; + readonly language?: string; + readonly datatype?: string; +} + +export function parseRdfLiteralTerm(term: string): ParsedRdfLiteralTerm | null { + if (!term.startsWith('"')) return null; + let escaped = false; + for (let i = 1; i < term.length; i++) { + const ch = term[i]!; + if (escaped) { + escaped = false; + continue; + } + if (ch === '\\') { + escaped = true; + continue; + } + if (isRawLiteralControlCharacter(ch)) return null; + if (ch !== '"') continue; + + try { + const body = term.slice(1, i); + const suffix = term.slice(i + 1); + const metadata = parseLiteralSuffix(suffix); + return { + lexical: decodeRdfLiteralBody(body), + suffix, + ...metadata, + }; + } catch { + return null; + } + } + return null; +} + +export function rdfLiteralTerm(lexical: string, suffix = ''): string { + return `${JSON.stringify(lexical)}${suffix}`; +} + +function isRawLiteralControlCharacter(value: string): boolean { + return value.charCodeAt(0) < 0x20; +} + +function parseLiteralSuffix(suffix: string): { language?: string; datatype?: string } { + if (suffix === '') return {}; + const language = /^@([A-Za-z]+(?:-[A-Za-z0-9]+)*)$/.exec(suffix); + if (language) return { language: language[1] }; + const datatype = /^\^\^<([^<>"{}|\\^`\x00-\x20>]+)>$/.exec(suffix); + if (datatype) return { datatype: datatype[1] }; + throw new Error(`Invalid RDF literal suffix: ${suffix.slice(0, 80)}`); +} + +function decodeRdfLiteralBody(body: string): string { + let out = ''; + for (let i = 0; i < body.length; i++) { + const ch = body[i]!; + if (ch !== '\\') { + out += ch; + continue; + } + i += 1; + if (i >= body.length) throw new Error('Invalid trailing RDF literal escape'); + const escaped = body[i]!; + switch (escaped) { + case 't': + out += '\t'; + break; + case 'b': + out += '\b'; + break; + case 'n': + out += '\n'; + break; + case 'r': + out += '\r'; + break; + case 'f': + out += '\f'; + break; + case '"': + case "'": + case '\\': + out += escaped; + break; + case 'u': { + const hex = body.slice(i + 1, i + 5); + if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error('Invalid RDF \\u escape'); + out += String.fromCharCode(parseInt(hex, 16)); + i += 4; + break; + } + case 'U': { + const hex = body.slice(i + 1, i + 9); + if (!/^[0-9a-fA-F]{8}$/.test(hex)) throw new Error('Invalid RDF \\U escape'); + out += String.fromCodePoint(parseInt(hex, 16)); + i += 8; + break; + } + default: + throw new Error(`Invalid RDF literal escape: \\${escaped}`); + } + } + return out; +} diff --git a/packages/core/src/rdf-text-literal-normalization.ts b/packages/core/src/rdf-text-literal-normalization.ts new file mode 100644 index 000000000..6bf502a45 --- /dev/null +++ b/packages/core/src/rdf-text-literal-normalization.ts @@ -0,0 +1,276 @@ +import { createHash } from 'node:crypto'; +import { + DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, + JAVA_WRITE_UTF_MAX_BYTES, + OversizedRdfLiteralError, + assertQuadLiteralsMutf8Safe, + javaModifiedUtf8ByteLength, + rdfLiteralTermMutf8ByteLength, + type QuadLiteralLike, +} from './rdf-literal-size.js'; +import { + parseRdfLiteralTerm, + rdfLiteralTerm, + type ParsedRdfLiteralTerm, +} from './rdf-literal-codec.js'; +import { isSafeIri } from './sparql-safe.js'; + +export const RDF_TYPE_IRI = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; +export const XSD_INTEGER_IRI = 'http://www.w3.org/2001/XMLSchema#integer'; +export const XSD_STRING_IRI = 'http://www.w3.org/2001/XMLSchema#string'; +export const SCHEMA_TEXT_PREDICATES = [ + 'http://schema.org/text', + 'https://schema.org/text', +] as const; + +export const DKG_TEXT_BODY_CLASS = 'http://dkg.io/ontology/TextBody'; +export const DKG_TEXT_CHUNK_CLASS = 'http://dkg.io/ontology/TextChunk'; +export const DKG_HAS_TEXT_BODY = 'http://dkg.io/ontology/hasTextBody'; +export const DKG_HAS_TEXT_CHUNK = 'http://dkg.io/ontology/hasTextChunk'; +export const DKG_TEXT_SOURCE_PREDICATE = 'http://dkg.io/ontology/textSourcePredicate'; +export const DKG_TEXT_CONTENT_SHA256 = 'http://dkg.io/ontology/textContentSha256'; +export const DKG_TEXT_LITERAL_TERM_SHA256 = 'http://dkg.io/ontology/textLiteralTermSha256'; +export const DKG_TEXT_LITERAL_MUTF8_BYTES = 'http://dkg.io/ontology/textLiteralMutf8Bytes'; +export const DKG_TEXT_UTF8_BYTES = 'http://dkg.io/ontology/textUtf8Bytes'; +export const DKG_TEXT_CHUNK_COUNT = 'http://dkg.io/ontology/textChunkCount'; +export const DKG_TEXT_CHUNK_LIMIT = 'http://dkg.io/ontology/textChunkMutf8Limit'; +export const DKG_TEXT_LANGUAGE = 'http://dkg.io/ontology/textLanguage'; +export const DKG_TEXT_DATATYPE = 'http://dkg.io/ontology/textDatatype'; +export const DKG_CHUNK_INDEX = 'http://dkg.io/ontology/chunkIndex'; +export const DKG_CHUNK_VALUE = 'http://dkg.io/ontology/chunkValue'; + +export interface RdfTextLiteralRewrite { + readonly subject: string; + readonly predicate: string; + readonly graph?: string; + readonly bodySubject: string; + readonly originalMutf8Bytes: number; + readonly originalUtf8Bytes: number; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; +} + +export interface RdfLiteralNormalizationResult { + readonly quads: QuadLiteralLike[]; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export interface RdfLiteralNormalizationOptions { + readonly maxBytes?: number; + readonly chunkMaxBytes?: number; + readonly textPredicates?: Iterable; + readonly label?: string; +} + +export function normalizeLargeRdfLiteralsForBlazegraph( + quads: readonly QuadLiteralLike[], + options: RdfLiteralNormalizationOptions = {}, +): RdfLiteralNormalizationResult { + const maxBytes = options.maxBytes ?? DKG_RDF_LITERAL_SAFE_MUTF8_BYTES; + const chunkMaxBytes = options.chunkMaxBytes ?? maxBytes; + validateNormalizationLimits(maxBytes, chunkMaxBytes); + + const textPredicates = new Set(options.textPredicates ?? SCHEMA_TEXT_PREDICATES); + const normalized: QuadLiteralLike[] = []; + const rewrites: RdfTextLiteralRewrite[] = []; + + for (let i = 0; i < quads.length; i++) { + const quad = quads[i]!; + const literalBytes = rdfLiteralTermMutf8ByteLength(quad.object); + if (literalBytes === undefined || literalBytes <= maxBytes) { + normalized.push({ ...quad }); + continue; + } + + if (!textPredicates.has(quad.predicate)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const parsed = parseRdfLiteralTerm(quad.object); + if (!parsed || !isChunkableTextLiteral(parsed)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + if (!isSafeIri(quad.subject)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const canonicalLiteralTerm = rdfLiteralTerm(parsed.lexical, parsed.suffix); + const literalTermSha256 = sha256Hex(canonicalLiteralTerm); + const lexicalSha256 = sha256Hex(parsed.lexical); + const bodyIdentitySha256 = sha256Hex(`${quad.predicate}\u0000${canonicalLiteralTerm}`); + const bodySubject = `${quad.subject}/.well-known/genid/dkg-text-body-${bodyIdentitySha256}`; + if (!isSafeIri(bodySubject)) { + throwOversizedForQuad(quad, literalBytes, maxBytes, labelFor(options.label, i)); + } + + const chunks = splitLexicalIntoSafeChunks(parsed.lexical, chunkMaxBytes); + const graph = quad.graph ?? ''; + const bodyQuads = buildTextBodyQuads({ + source: quad, + graph, + bodySubject, + parsed, + chunks, + maxBytes, + chunkMaxBytes, + originalMutf8Bytes: literalBytes, + lexicalSha256, + literalTermSha256, + }); + normalized.push(...bodyQuads); + rewrites.push({ + subject: quad.subject, + predicate: quad.predicate, + graph: quad.graph, + bodySubject, + originalMutf8Bytes: literalBytes, + originalUtf8Bytes: utf8ByteLength(parsed.lexical), + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + }); + } + + assertQuadLiteralsMutf8Safe(normalized, { + label: options.label ? `${options.label}.normalized` : 'normalizedQuads', + maxBytes, + }); + return { quads: normalized, rewrites }; +} + +function validateNormalizationLimits(maxBytes: number, chunkMaxBytes: number): void { + if (!Number.isInteger(maxBytes) || maxBytes <= 0 || maxBytes > JAVA_WRITE_UTF_MAX_BYTES) { + throw new Error(`Invalid maxBytes: ${maxBytes}`); + } + if (!Number.isInteger(chunkMaxBytes) || chunkMaxBytes <= 0 || chunkMaxBytes > maxBytes) { + throw new Error(`Invalid chunkMaxBytes: ${chunkMaxBytes}`); + } +} + +function labelFor(label: string | undefined, index: number): string { + return label ? `${label}[${index}].object` : `quads[${index}].object`; +} + +function throwOversizedForQuad( + quad: QuadLiteralLike, + actualBytes: number, + maxBytes: number, + label: string, +): never { + throw new OversizedRdfLiteralError({ + actualBytes, + maxBytes, + label, + subject: quad.subject, + predicate: quad.predicate, + graph: quad.graph, + }); +} + +function buildTextBodyQuads(args: { + source: QuadLiteralLike; + graph: string; + bodySubject: string; + parsed: ParsedRdfLiteralTerm; + chunks: readonly string[]; + maxBytes: number; + chunkMaxBytes: number; + originalMutf8Bytes: number; + lexicalSha256: string; + literalTermSha256: string; +}): QuadLiteralLike[] { + const quads: QuadLiteralLike[] = [ + { subject: args.source.subject, predicate: DKG_HAS_TEXT_BODY, object: args.bodySubject, graph: args.graph }, + { subject: args.bodySubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_BODY_CLASS, graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_SOURCE_PREDICATE, object: args.source.predicate, graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CONTENT_SHA256, object: rdfLiteralTerm(args.lexicalSha256), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_TERM_SHA256, object: rdfLiteralTerm(args.literalTermSha256), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_LITERAL_MUTF8_BYTES, object: xsdInteger(args.originalMutf8Bytes), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_UTF8_BYTES, object: xsdInteger(utf8ByteLength(args.parsed.lexical)), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_COUNT, object: xsdInteger(args.chunks.length), graph: args.graph }, + { subject: args.bodySubject, predicate: DKG_TEXT_CHUNK_LIMIT, object: xsdInteger(args.chunkMaxBytes), graph: args.graph }, + ]; + + if (args.parsed.language) { + quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_LANGUAGE, object: rdfLiteralTerm(args.parsed.language), graph: args.graph }); + } + if (args.parsed.datatype) { + quads.push({ subject: args.bodySubject, predicate: DKG_TEXT_DATATYPE, object: args.parsed.datatype, graph: args.graph }); + } + + args.chunks.forEach((chunk, index) => { + const chunkSubject = `${args.bodySubject}/chunk-${index}`; + if (!isSafeIri(chunkSubject)) { + throw new OversizedRdfLiteralError({ + actualBytes: args.originalMutf8Bytes, + maxBytes: args.maxBytes, + subject: args.source.subject, + predicate: args.source.predicate, + graph: args.source.graph, + }); + } + quads.push( + { subject: args.bodySubject, predicate: DKG_HAS_TEXT_CHUNK, object: chunkSubject, graph: args.graph }, + { subject: chunkSubject, predicate: RDF_TYPE_IRI, object: DKG_TEXT_CHUNK_CLASS, graph: args.graph }, + { subject: chunkSubject, predicate: DKG_CHUNK_INDEX, object: xsdInteger(index), graph: args.graph }, + { subject: chunkSubject, predicate: DKG_CHUNK_VALUE, object: rdfLiteralTerm(chunk), graph: args.graph }, + ); + }); + + return quads; +} + +function isChunkableTextLiteral(parsed: ParsedRdfLiteralTerm): boolean { + return parsed.suffix === '' || + parsed.language !== undefined || + parsed.datatype === XSD_STRING_IRI; +} + +function splitLexicalIntoSafeChunks(lexical: string, maxBytes: number): string[] { + const chunks: string[] = []; + let current = ''; + let currentBytes = javaModifiedUtf8ByteLength('""'); + + for (const ch of lexical) { + const chBytes = serializedLiteralBodyMutf8ByteLength(ch); + if (current.length > 0 && currentBytes + chBytes > maxBytes) { + chunks.push(current); + current = ch; + currentBytes = javaModifiedUtf8ByteLength('""') + chBytes; + } else { + current += ch; + currentBytes += chBytes; + } + + if (currentBytes > maxBytes) { + throw new Error('A single literal character exceeds the MUTF-8 chunk budget'); + } + } + + if (current.length > 0 || lexical.length === 0) chunks.push(current); + return chunks; +} + +function serializedLiteralBodyMutf8ByteLength(value: string): number { + const serialized = JSON.stringify(value); + return javaModifiedUtf8ByteLength(serialized.slice(1, -1)); +} + +function xsdInteger(value: number): string { + return `"${value}"^^<${XSD_INTEGER_IRI}>`; +} + +function sha256Hex(value: string): string { + return createHash('sha256').update(value, 'utf8').digest('hex'); +} + +function utf8ByteLength(value: string): number { + return new TextEncoder().encode(value).length; +} + +export { + parseRdfLiteralTerm, + rdfLiteralTerm, + type ParsedRdfLiteralTerm, +} from './rdf-literal-codec.js'; diff --git a/packages/core/test/helpers/chunked-text.ts b/packages/core/test/helpers/chunked-text.ts new file mode 100644 index 000000000..bc943aa0f --- /dev/null +++ b/packages/core/test/helpers/chunked-text.ts @@ -0,0 +1,171 @@ +import { createHash } from 'node:crypto'; +import { + DKG_CHUNK_INDEX, + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, + DKG_TEXT_CHUNK_COUNT, + DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_DATATYPE, + DKG_TEXT_LANGUAGE, + DKG_TEXT_LITERAL_TERM_SHA256, + DKG_TEXT_SOURCE_PREDICATE, + parseRdfLiteralTerm, + rdfLiteralTerm, +} from '../../src/rdf-text-literal-normalization.js'; + +export interface ChunkedTextQuad { + readonly subject: string; + readonly predicate: string; + readonly object: string; + readonly graph?: string; +} + +export interface ReconstructedChunkedTextBody { + readonly subject: string; + readonly bodySubject: string; + readonly sourcePredicate: string; + readonly lexical: string; + readonly literalTerm: string; + readonly chunkCount: number; + readonly lexicalSha256: string; + readonly literalTermSha256: string; + readonly language?: string; + readonly datatype?: string; +} + +export function reconstructChunkedText( + quads: readonly ChunkedTextQuad[], + subject: string, +): string { + const reconstructed = reconstructChunkedTextBodies(quads, { subject }); + if (reconstructed.length === 0) throw new Error(`Missing chunked text body for ${subject}`); + return reconstructed[0]!.lexical; +} + +export function reconstructChunkedTextBodies( + quads: readonly ChunkedTextQuad[], + options: { subject?: string; bodySubject?: string; sourcePredicate?: string } = {}, +): ReconstructedChunkedTextBody[] { + const bySubject = indexQuadsBySubject(quads); + const bodyLinks = quads.filter((q) => + q.predicate === DKG_HAS_TEXT_BODY && + (!options.subject || q.subject === options.subject) && + (!options.bodySubject || q.object === options.bodySubject) + ); + const explicitBody = options.bodySubject && bodyLinks.length === 0 + ? [{ subject: findOwnerSubject(quads, options.bodySubject), object: options.bodySubject }] + : []; + const bodies = [...bodyLinks, ...explicitBody]; + const reconstructed: ReconstructedChunkedTextBody[] = []; + + for (const link of bodies) { + const currentBodySubject = link.object; + const bodyQuads = bySubject.get(currentBodySubject) ?? []; + const sourcePredicate = iriObject(bodyQuads, DKG_TEXT_SOURCE_PREDICATE); + if (!sourcePredicate) throw new Error(`Chunked text body ${currentBodySubject} is missing source predicate`); + if (options.sourcePredicate && sourcePredicate !== options.sourcePredicate) continue; + + const count = integerObject(bodyQuads, DKG_TEXT_CHUNK_COUNT); + if (count === undefined) throw new Error(`Chunked text body ${currentBodySubject} is missing chunk count`); + const lexicalSha256 = literalObject(bodyQuads, DKG_TEXT_CONTENT_SHA256); + if (!lexicalSha256) throw new Error(`Chunked text body ${currentBodySubject} is missing content hash`); + const literalTermSha256 = literalObject(bodyQuads, DKG_TEXT_LITERAL_TERM_SHA256); + if (!literalTermSha256) throw new Error(`Chunked text body ${currentBodySubject} is missing literal term hash`); + const language = literalObject(bodyQuads, DKG_TEXT_LANGUAGE); + const datatype = iriObject(bodyQuads, DKG_TEXT_DATATYPE); + const chunkSubjects = bodyQuads.filter((q) => q.predicate === DKG_HAS_TEXT_CHUNK).map((q) => q.object); + if (chunkSubjects.length !== count) { + throw new Error(`Chunked text body ${currentBodySubject} expected ${count} chunks but found ${chunkSubjects.length}`); + } + + const chunks = chunkSubjects.map((chunkSubject) => { + const chunkQuads = bySubject.get(chunkSubject) ?? []; + const index = integerObject(chunkQuads, DKG_CHUNK_INDEX); + if (index === undefined) throw new Error(`Chunk ${chunkSubject} is missing chunkIndex`); + const valueTerm = literalTermObject(chunkQuads, DKG_CHUNK_VALUE); + if (!valueTerm) throw new Error(`Chunk ${chunkSubject} is missing chunkValue`); + const parsed = parseRdfLiteralTerm(valueTerm); + if (!parsed) throw new Error(`Chunk ${chunkSubject} has invalid chunkValue`); + return { index, lexical: parsed.lexical }; + }).sort((a, b) => a.index - b.index); + + for (let i = 0; i < chunks.length; i++) { + if (chunks[i]!.index !== i) { + throw new Error(`Chunked text body ${currentBodySubject} has non-contiguous chunk index ${chunks[i]!.index}`); + } + } + + const lexical = chunks.map((chunk) => chunk.lexical).join(''); + const suffix = suffixFromMetadata(language, datatype); + const literalTerm = rdfLiteralTerm(lexical, suffix); + if (sha256Hex(lexical) !== lexicalSha256) { + throw new Error(`Chunked text body ${currentBodySubject} content hash mismatch`); + } + if (sha256Hex(literalTerm) !== literalTermSha256) { + throw new Error(`Chunked text body ${currentBodySubject} literal term hash mismatch`); + } + + reconstructed.push({ + subject: link.subject, + bodySubject: currentBodySubject, + sourcePredicate, + lexical, + literalTerm, + chunkCount: chunks.length, + lexicalSha256, + literalTermSha256, + ...(language !== undefined ? { language } : {}), + ...(datatype !== undefined ? { datatype } : {}), + }); + } + + return reconstructed; +} + +function indexQuadsBySubject(quads: readonly ChunkedTextQuad[]): Map { + const map = new Map(); + for (const quad of quads) { + const list = map.get(quad.subject); + if (list) list.push(quad); + else map.set(quad.subject, [quad]); + } + return map; +} + +function findOwnerSubject(quads: readonly ChunkedTextQuad[], bodySubject: string): string { + return quads.find((q) => q.predicate === DKG_HAS_TEXT_BODY && q.object === bodySubject)?.subject ?? ''; +} + +function literalTermObject(quads: readonly ChunkedTextQuad[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object?.startsWith('"') ? object : undefined; +} + +function literalObject(quads: readonly ChunkedTextQuad[], predicate: string): string | undefined { + const term = literalTermObject(quads, predicate); + if (!term) return undefined; + const parsed = parseRdfLiteralTerm(term); + return parsed?.lexical; +} + +function iriObject(quads: readonly ChunkedTextQuad[], predicate: string): string | undefined { + const object = quads.find((q) => q.predicate === predicate)?.object; + return object && !object.startsWith('"') ? object : undefined; +} + +function integerObject(quads: readonly ChunkedTextQuad[], predicate: string): number | undefined { + const value = literalObject(quads, predicate); + if (value === undefined || !/^\d+$/.test(value)) return undefined; + return Number(value); +} + +function suffixFromMetadata(language?: string, datatype?: string): string { + if (language) return `@${language}`; + if (datatype) return `^^<${datatype}>`; + return ''; +} + +function sha256Hex(value: string): string { + return createHash('sha256').update(value, 'utf8').digest('hex'); +} diff --git a/packages/core/test/rdf-literal-size.test.ts b/packages/core/test/rdf-literal-size.test.ts index 87b4fea06..4f675ee48 100644 --- a/packages/core/test/rdf-literal-size.test.ts +++ b/packages/core/test/rdf-literal-size.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'vitest'; +import { reconstructChunkedTextBodies } from './helpers/chunked-text.js'; import { DKG_RDF_LITERAL_SAFE_MUTF8_BYTES, OVERSIZED_RDF_LITERAL_ERROR_CODE, @@ -7,6 +8,16 @@ import { javaModifiedUtf8ByteLength, rdfLiteralTermMutf8ByteLength, } from '../src/rdf-literal-size.js'; +import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, + DKG_HAS_TEXT_CHUNK, + DKG_TEXT_CONTENT_SHA256, + DKG_TEXT_LITERAL_TERM_SHA256, + XSD_STRING_IRI, + normalizeLargeRdfLiteralsForBlazegraph, + parseRdfLiteralTerm, +} from '../src/rdf-text-literal-normalization.js'; describe('rdf literal Java MUTF-8 sizing', () => { it('counts Java Modified UTF-8 byte length by UTF-16 code unit', () => { @@ -69,4 +80,249 @@ describe('rdf literal Java MUTF-8 sizing', () => { ], { label: 'publish.quads' }), ).toThrow(/publish\.quads\[1\]\.object/); }); + + it('chunks oversized schema text into DKG chunk values and reconstructs exactly', () => { + const lexical = `${'computer history '.repeat(200)}"quoted"\n\\slash 😀`; + const literal = `${JSON.stringify(lexical)}@en`; + const result = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/root', + predicate: 'http://schema.org/text', + object: literal, + graph: 'did:dkg:context-graph:test', + }, + ], { + maxBytes: 400, + chunkMaxBytes: 180, + label: 'test.quads', + }); + + expect(result.rewrites).toHaveLength(1); + expect(result.rewrites[0].chunkCount).toBeGreaterThan(1); + expect(result.quads.some((quad) => + quad.subject === 'http://example.org/root' && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + + const bodySubject = result.quads.find((quad) => quad.predicate === DKG_HAS_TEXT_BODY)?.object; + expect(bodySubject).toContain('/.well-known/genid/dkg-text-body-'); + expect(result.quads.some((quad) => + quad.predicate === 'http://schema.org/text' && + quad.subject.includes('/chunk-') + )).toBe(false); + + const chunkValueQuads = result.quads.filter((quad) => quad.predicate === DKG_CHUNK_VALUE); + expect(chunkValueQuads).toHaveLength(result.rewrites[0].chunkCount); + expect(chunkValueQuads.every((quad) => javaModifiedUtf8ByteLength(quad.object) <= 180)).toBe(true); + + const reconstructed = reconstructChunkedTextBodies(result.quads, { + subject: 'http://example.org/root', + sourcePredicate: 'http://schema.org/text', + }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0]).toMatchObject({ + subject: 'http://example.org/root', + bodySubject, + sourcePredicate: 'http://schema.org/text', + lexical, + literalTerm: literal, + language: 'en', + chunkCount: result.rewrites[0].chunkCount, + }); + }); + + it('rejects raw control characters in quoted RDF literal source', () => { + expect(parseRdfLiteralTerm('"bad\nbody"')).toBeNull(); + expect(parseRdfLiteralTerm('"bad\rbody"')).toBeNull(); + expect(parseRdfLiteralTerm(`"bad${String.fromCharCode(1)}body"`)).toBeNull(); + + const rawNewlineLiteral = `"${'x'.repeat(250)}\n${'y'.repeat(250)}"`; + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/raw-control', + predicate: 'http://schema.org/text', + object: rawNewlineLiteral, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200, chunkMaxBytes: 100 }), + ).toThrow(/Blazegraph-compatible safe limit/); + }); + + it('chunks escaped control sequences and reconstructs their lexical value', () => { + const escapedLiteral = `"${'line\\nbreak '.repeat(80)}"@en`; + expect(parseRdfLiteralTerm('"line\\nbreak"')?.lexical).toBe('line\nbreak'); + + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/escaped-controls', + predicate: 'http://schema.org/text', + object: escapedLiteral, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 300, chunkMaxBytes: 120 }); + + const reconstructed = reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/escaped-controls', + }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0]).toMatchObject({ + lexical: 'line\nbreak '.repeat(80), + language: 'en', + }); + }); + + it('normalizes idempotently and keeps generated literals below the safe limit', () => { + const oversized = JSON.stringify('x'.repeat(1_000)); + const first = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/idempotent', + predicate: 'https://schema.org/text', + object: oversized, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200, chunkMaxBytes: 100 }); + const second = normalizeLargeRdfLiteralsForBlazegraph(first.quads, { maxBytes: 200, chunkMaxBytes: 100 }); + + expect(second.rewrites).toHaveLength(0); + expect(second.quads).toEqual(first.quads); + expect(first.quads.every((quad) => { + const bytes = rdfLiteralTermMutf8ByteLength(quad.object); + return bytes === undefined || bytes <= 200; + })).toBe(true); + }); + + it('keeps same-subject http and https schema:text bodies distinct', () => { + const object = JSON.stringify('shared body '.repeat(120)); + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/dual', + predicate: 'http://schema.org/text', + object, + graph: 'did:dkg:context-graph:test', + }, + { + subject: 'http://example.org/dual', + predicate: 'https://schema.org/text', + object, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 400, chunkMaxBytes: 180 }); + + expect(normalized.rewrites).toHaveLength(2); + expect(new Set(normalized.rewrites.map((rewrite) => rewrite.bodySubject)).size).toBe(2); + expect(reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/dual', + sourcePredicate: 'http://schema.org/text', + })).toHaveLength(1); + expect(reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/dual', + sourcePredicate: 'https://schema.org/text', + })).toHaveLength(1); + }); + + it('uses canonical literal hashes so escaped source forms reconstruct', () => { + const escaped = `"${'\\u0061'.repeat(260)}"`; + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/escaped', + predicate: 'http://schema.org/text', + object: escaped, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 600, chunkMaxBytes: 120 }); + + const reconstructed = reconstructChunkedTextBodies(normalized.quads, { + subject: 'http://example.org/escaped', + }); + expect(reconstructed).toHaveLength(1); + expect(reconstructed[0].lexical).toBe('a'.repeat(260)); + expect(reconstructed[0].literalTerm).toBe(JSON.stringify('a'.repeat(260))); + }); + + it('preserves xsd:string metadata but rejects arbitrary typed oversized text', () => { + const body = 'typed text '.repeat(100); + const typedString = `${JSON.stringify(body)}^^<${XSD_STRING_IRI}>`; + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/typed', + predicate: 'http://schema.org/text', + object: typedString, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 250, chunkMaxBytes: 100 }); + expect(reconstructChunkedTextBodies(normalized.quads)[0]).toMatchObject({ + datatype: XSD_STRING_IRI, + literalTerm: typedString, + }); + + const arbitraryTyped = `${JSON.stringify(body)}^^`; + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/custom', + predicate: 'http://schema.org/text', + object: arbitraryTyped, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 250, chunkMaxBytes: 100 }), + ).toThrow(/Blazegraph-compatible safe limit/); + }); + + it('rejects oversized non-text literals and unsafe chunk subjects', () => { + const oversized = JSON.stringify('x'.repeat(500)); + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/name', + predicate: 'http://schema.org/name', + object: oversized, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200 }), + ).toThrow(/Blazegraph-compatible safe limit/); + + expect(() => + normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: '_:blank', + predicate: 'http://schema.org/text', + object: oversized, + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200 }), + ).toThrow(/Blazegraph-compatible safe limit/); + }); + + it('detects corrupt chunk metadata during reconstruction', () => { + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://example.org/corrupt', + predicate: 'http://schema.org/text', + object: JSON.stringify('z'.repeat(500)), + graph: 'did:dkg:context-graph:test', + }, + ], { maxBytes: 200, chunkMaxBytes: 100 }); + const withoutOneChunk = normalized.quads.filter((quad) => quad.predicate !== DKG_HAS_TEXT_CHUNK || !quad.object.endsWith('/chunk-1')); + expect(() => reconstructChunkedTextBodies(withoutOneChunk)).toThrow(/expected .* chunks/); + + const withoutCount = normalized.quads.filter((quad) => quad.predicate !== 'http://dkg.io/ontology/textChunkCount'); + expect(() => reconstructChunkedTextBodies(withoutCount)).toThrow(/missing chunk count/); + + const withoutHash = normalized.quads.filter((quad) => quad.predicate !== DKG_TEXT_CONTENT_SHA256); + expect(() => reconstructChunkedTextBodies(withoutHash)).toThrow(/missing content hash/); + + const withBadHash = normalized.quads.map((quad) => + quad.predicate === DKG_TEXT_CONTENT_SHA256 || quad.predicate === DKG_TEXT_LITERAL_TERM_SHA256 + ? { ...quad, object: '"bad"' } + : quad + ); + expect(() => reconstructChunkedTextBodies(withBadHash)).toThrow(/hash mismatch/); + }); + + it('parses RDF literal suffixes supported by the chunker', () => { + expect(parseRdfLiteralTerm('"hello"')?.lexical).toBe('hello'); + expect(parseRdfLiteralTerm('"hello"@en-US')?.language).toBe('en-US'); + expect(parseRdfLiteralTerm(`"hello"^^<${XSD_STRING_IRI}>`)?.datatype).toBe(XSD_STRING_IRI); + }); }); diff --git a/packages/publisher/src/auto-partition.ts b/packages/publisher/src/auto-partition.ts index e528634b4..b8ce0a14e 100644 --- a/packages/publisher/src/auto-partition.ts +++ b/packages/publisher/src/auto-partition.ts @@ -1,5 +1,10 @@ import type { Quad } from '@origintrail-official/dkg-storage'; -import { skolemize, isSkolemizedUri, rootEntityFromSkolemized, isBlankNode } from './skolemize.js'; +import { + isSkolemizedUri, + rootEntityFromSkolemized, + isBlankNode, + skolemizedBlankNodeIri, +} from './skolemize.js'; /** * Skolemizes blank nodes under their parent entity and INDEXES the result by @@ -18,71 +23,103 @@ import { skolemize, isSkolemizedUri, rootEntityFromSkolemized, isBlankNode } fro * is just an index. Returns a Map of entity → Quad[]. */ export function skolemizeByEntity(quads: Quad[]): Map { - // Phase 1: Find root entities (non-blank, non-skolemized unique subjects) - const rootEntities = new Set(); - for (const q of quads) { - if (!isBlankNode(q.subject) && !isSkolemizedUri(q.subject)) { - rootEntities.add(q.subject); - } + const skolemized = skolemizeFlatQuads(quads); + const rootQuadsMap = new Map(); + for (const quad of skolemized) { + const root = rootForNonBlankSubject(quad.subject); + if (!root) continue; + const existing = rootQuadsMap.get(root); + if (existing) existing.push(quad); + else rootQuadsMap.set(root, [quad]); } + return rootQuadsMap; +} - // Phase 2: Skolemize blank nodes under their parent root entity. - // For each blank node, we need to determine which root entity it belongs to. - // Heuristic: a blank node belongs to the root entity that references it as an object. - const blankToRoot = new Map(); - for (const q of quads) { - if (rootEntities.has(q.subject) && isBlankNode(q.object)) { - blankToRoot.set(q.object, q.subject); +/** + * Losslessly skolemizes blank-node subjects/objects using the root inferred from + * the blank node itself, not from the currently visited edge. If a blank node is + * shared by multiple roots, every object reference points at the deterministic + * canonical generated IRI instead of manufacturing empty per-root generated nodes. + */ +export function skolemizeFlatQuads(quads: readonly Quad[]): Quad[] { + const blankToRoot = inferBlankNodeRoots(quads); + return quads.map((quad) => { + const subjectRoot = rootForQuadSubject(quad.subject, blankToRoot); + const objectRoot = isBlankNode(quad.object) + ? blankToRoot.get(quad.object) ?? subjectRoot + : undefined; + return { + subject: subjectRoot ? skolemizeTermForRoot(quad.subject, subjectRoot) : quad.subject, + predicate: quad.predicate, + object: objectRoot ? skolemizeTermForRoot(quad.object, objectRoot) : quad.object, + graph: quad.graph, + }; + }); +} + +function inferBlankNodeRoots(quads: readonly Quad[]): Map { + const blankRootCandidates = new Map>(); + for (const quad of quads) { + const root = rootForNonBlankSubject(quad.subject); + if (root && isBlankNode(quad.object)) { + addRootCandidate(blankRootCandidates, quad.object, root); } } - // Propagate: blank nodes referenced by other blank nodes let changed = true; while (changed) { changed = false; - for (const q of quads) { - if ( - isBlankNode(q.subject) && - blankToRoot.has(q.subject) && - isBlankNode(q.object) && - !blankToRoot.has(q.object) - ) { - blankToRoot.set(q.object, blankToRoot.get(q.subject)!); - changed = true; + for (const quad of quads) { + if (!isBlankNode(quad.subject) || !isBlankNode(quad.object)) continue; + const roots = blankRootCandidates.get(quad.subject); + if (!roots) continue; + for (const root of roots) { + if (addRootCandidate(blankRootCandidates, quad.object, root)) { + changed = true; + } } } } - // Skolemize per root entity - const skolemized: Quad[] = []; - const perRoot = new Map(); - for (const root of rootEntities) { - perRoot.set(root, []); + const blankToRoot = new Map(); + for (const [blankNode, roots] of blankRootCandidates) { + const canonicalRoot = [...roots].sort()[0]; + if (canonicalRoot) blankToRoot.set(blankNode, canonicalRoot); } + return blankToRoot; +} - // Collect which quads belong to which root, skolemizing as we go - const rootQuadsMap = new Map(); - for (const root of rootEntities) { - const rootQuads = quads.filter( - (q) => - q.subject === root || - (isBlankNode(q.subject) && blankToRoot.get(q.subject) === root), - ); - const sk = skolemize(root, rootQuads); - rootQuadsMap.set(root, sk); +function addRootCandidate( + candidates: Map>, + blankNode: string, + root: string, +): boolean { + const roots = candidates.get(blankNode); + if (roots) { + const size = roots.size; + roots.add(root); + return roots.size !== size; } + candidates.set(blankNode, new Set([root])); + return true; +} - // Also handle already-skolemized quads (no blank nodes) - for (const q of quads) { - if (isSkolemizedUri(q.subject)) { - const root = rootEntityFromSkolemized(q.subject); - if (root && rootQuadsMap.has(root)) { - rootQuadsMap.get(root)!.push(q); - } - } - } +function rootForQuadSubject( + subject: string, + blankToRoot: ReadonlyMap, +): string | undefined { + if (isBlankNode(subject)) return blankToRoot.get(subject); + return rootForNonBlankSubject(subject); +} - return rootQuadsMap; +function rootForNonBlankSubject(subject: string): string | undefined { + if (isBlankNode(subject)) return undefined; + if (isSkolemizedUri(subject)) return rootEntityFromSkolemized(subject) ?? undefined; + return subject; +} + +function skolemizeTermForRoot(term: string, root: string): string { + return isBlankNode(term) ? skolemizedBlankNodeIri(root, term) : term; } /** diff --git a/packages/publisher/src/dkg-publisher.ts b/packages/publisher/src/dkg-publisher.ts index c1701798c..202d833a3 100644 --- a/packages/publisher/src/dkg-publisher.ts +++ b/packages/publisher/src/dkg-publisher.ts @@ -49,6 +49,7 @@ import { } from './metadata.js'; import { storeWorkspaceOperationPublicQuads } from './workspace-resolution.js'; import type { WorkspacePublicSnapshotStore } from './workspace-snapshot-store.js'; +import { preparePublicWriteQuads } from './public-write-normalization.js'; import { ethers } from 'ethers'; import type { WorkspaceAgentRecipientResolver } from './workspace-agent-recipients.js'; import { @@ -414,6 +415,10 @@ function rejectOversizedRdfLiterals(quads: Quad[], label: string): void { assertQuadLiteralsMutf8Safe(quads, { label }); } +function normalizePublicRdfLiterals(quads: Quad[], label: string): Quad[] { + return preparePublicWriteQuads(quads, { label }).quads; +} + async function stampTrustLevel( store: TripleStore, graph: string, @@ -1074,7 +1079,7 @@ export class DKGPublisher implements Publisher { // reserved-namespace violation cannot be masked by a lock timeout // or subject-level validation error downstream. rejectUserAuthoredProtocolMetadata(quads); - rejectOversizedRdfLiterals(quads, 'share.quads'); + quads = normalizePublicRdfLiterals(quads, 'share.quads'); const subjects = [...new Set(quads.map(q => q.subject))]; const lockPrefix = options.subGraphName ? `${contextGraphId}\0${options.subGraphName}` : contextGraphId; const lockKeys = subjects.map(s => `${lockPrefix}\0${s}`); @@ -1364,12 +1369,17 @@ export class DKGPublisher implements Publisher { // violation with a StaleWriteError). Short-circuit per // `19_MARKDOWN_CONTENT_TYPE.md §10.2`. rejectUserAuthoredProtocolMetadata(quads); - rejectOversizedRdfLiterals(quads, 'conditionalShare.quads'); + quads = normalizePublicRdfLiterals(quads, 'conditionalShare.quads'); for (const cond of options.conditions) { assertSafeIri(cond.subject); assertSafeIri(cond.predicate); if (cond.expectedValue !== null) { assertSafeRdfTerm(cond.expectedValue); + assertQuadLiteralsMutf8Safe([{ + subject: cond.subject, + predicate: cond.predicate, + object: cond.expectedValue, + }], { label: 'conditionalShare.conditions' }); } } @@ -1972,7 +1982,7 @@ export class DKGPublisher implements Publisher { }; } - const { + let { contextGraphId, quads, privateQuads = [], @@ -1996,7 +2006,7 @@ export class DKGPublisher implements Publisher { rejectUserAuthoredProtocolMetadata(quads); if (privateQuads.length > 0) rejectUserAuthoredProtocolMetadata(privateQuads); } - rejectOversizedRdfLiterals(quads, 'publish.quads'); + quads = normalizePublicRdfLiterals(quads, 'publish.quads'); if (privateQuads.length > 0) rejectOversizedRdfLiterals(privateQuads, 'publish.privateQuads'); const ctx: OperationContext = operationCtx ?? createOperationContext('publish'); const effectiveAccessPolicy = accessPolicy ?? (privateQuads.length > 0 ? 'ownerOnly' : 'public'); @@ -3452,7 +3462,7 @@ export class DKGPublisher implements Publisher { 'Publish a new KC instead, or remove and recreate the sub-graph.', ); } - const { contextGraphId, quads, privateQuads = [], operationCtx, onPhase } = options; + let { contextGraphId, quads, privateQuads = [], operationCtx, onPhase } = options; // Round 12 Bug 34: `update()` is a Bucket A public write entry // point (accepts user-authored quads) that Round 9 missed. Apply // the same reserved-namespace guard as `publish()` / `assertionWrite` @@ -3465,7 +3475,7 @@ export class DKGPublisher implements Publisher { rejectUserAuthoredProtocolMetadata(quads); if (privateQuads.length > 0) rejectUserAuthoredProtocolMetadata(privateQuads); } - rejectOversizedRdfLiterals(quads, 'update.quads'); + quads = normalizePublicRdfLiterals(quads, 'update.quads'); if (privateQuads.length > 0) rejectOversizedRdfLiterals(privateQuads, 'update.privateQuads'); const ctx: OperationContext = operationCtx ?? createOperationContext('publish'); let publisherContextGraphId: bigint | undefined; @@ -5354,8 +5364,8 @@ export class DKGPublisher implements Publisher { // Round 9 Bug 25: reject user-authored quads whose subject is in a // protocol-reserved URN namespace. See RESERVED_SUBJECT_PREFIXES above. rejectUserAuthoredProtocolMetadata(quads); - rejectOversizedRdfLiterals(quads, 'assertionWrite.quads'); - await this.store.insert(quads); + const normalizedQuads = normalizePublicRdfLiterals(quads, 'assertionWrite.quads'); + await this.store.insert(normalizedQuads); } async assertionQuery( diff --git a/packages/publisher/src/index.ts b/packages/publisher/src/index.ts index 4fb56c93e..1c63cd7bf 100644 --- a/packages/publisher/src/index.ts +++ b/packages/publisher/src/index.ts @@ -1,7 +1,13 @@ export * from './publisher.js'; -export { skolemize, isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; +export { skolemize, skolemizedBlankNodeIri, SKOLEMIZED_BLANK_NODE_SEGMENT, isBlankNode, isSkolemizedUri, rootEntityFromSkolemized } from './skolemize.js'; export { RESERVED_SUBJECT_PREFIXES, findReservedSubjectPrefix, isReservedSubject } from './reserved-subjects.js'; export { skolemizeByEntity, autoPartition } from './auto-partition.js'; +export { + preparePublicWriteQuads, + toPublicWriteQuad, + type PreparedPublicWriteQuads, + type PublicWriteNormalizationOptions, +} from './public-write-normalization.js'; export { canonicalPublishPayload, type CanonicalPublishPayload, diff --git a/packages/publisher/src/public-write-normalization.ts b/packages/publisher/src/public-write-normalization.ts new file mode 100644 index 000000000..558932344 --- /dev/null +++ b/packages/publisher/src/public-write-normalization.ts @@ -0,0 +1,41 @@ +import { + normalizeLargeRdfLiteralsForBlazegraph, + type QuadLiteralLike, + type RdfLiteralNormalizationOptions, + type RdfTextLiteralRewrite, +} from '@origintrail-official/dkg-core'; +import type { Quad } from '@origintrail-official/dkg-storage'; +import { skolemizeFlatQuads } from './auto-partition.js'; + +export interface PreparedPublicWriteQuads { + readonly quads: Quad[]; + readonly rewrites: RdfTextLiteralRewrite[]; +} + +export interface PublicWriteNormalizationOptions extends RdfLiteralNormalizationOptions { + readonly skolemize?: boolean; +} + +export function preparePublicWriteQuads( + quads: readonly QuadLiteralLike[], + options: PublicWriteNormalizationOptions = {}, +): PreparedPublicWriteQuads { + const concreteQuads = quads.map(toPublicWriteQuad); + const input = options.skolemize === false + ? concreteQuads + : skolemizeFlatQuads(concreteQuads); + const result = normalizeLargeRdfLiteralsForBlazegraph(input, options); + return { + quads: result.quads.map(toPublicWriteQuad), + rewrites: result.rewrites, + }; +} + +export function toPublicWriteQuad(quad: QuadLiteralLike): Quad { + return { + subject: quad.subject, + predicate: quad.predicate, + object: quad.object, + graph: quad.graph ?? '', + }; +} diff --git a/packages/publisher/src/skolemize.ts b/packages/publisher/src/skolemize.ts index 7a61df671..3cc654154 100644 --- a/packages/publisher/src/skolemize.ts +++ b/packages/publisher/src/skolemize.ts @@ -1,6 +1,6 @@ import type { Quad } from '@origintrail-official/dkg-storage'; -const GENID_SEGMENT = '/.well-known/genid/'; +export const SKOLEMIZED_BLANK_NODE_SEGMENT = '/.well-known/genid/'; /** * Replaces blank node identifiers with deterministic URIs scoped under rootEntity. @@ -20,8 +20,7 @@ export function skolemize(rootEntity: string, quads: Quad[]): Quad[] { const mapping = new Map(); for (const bn of blankNodes) { - const label = bn.slice(2); // strip "_:" - mapping.set(bn, `${rootEntity}${GENID_SEGMENT}${label}`); + mapping.set(bn, skolemizedBlankNodeIri(rootEntity, bn)); } return quads.map((q) => ({ @@ -32,12 +31,17 @@ export function skolemize(rootEntity: string, quads: Quad[]): Quad[] { })); } +export function skolemizedBlankNodeIri(rootEntity: string, blankNode: string): string { + const label = blankNode.startsWith('_:') ? blankNode.slice(2) : blankNode; + return `${rootEntity}${SKOLEMIZED_BLANK_NODE_SEGMENT}${label}`; +} + export function isBlankNode(term: string): boolean { return term.startsWith('_:'); } export function isSkolemizedUri(uri: string): boolean { - return uri.includes(GENID_SEGMENT); + return uri.includes(SKOLEMIZED_BLANK_NODE_SEGMENT); } /** @@ -45,7 +49,7 @@ export function isSkolemizedUri(uri: string): boolean { * e.g., "did:dkg:agent:QmBot/.well-known/genid/o1" → "did:dkg:agent:QmBot" */ export function rootEntityFromSkolemized(uri: string): string | null { - const idx = uri.indexOf(GENID_SEGMENT); + const idx = uri.indexOf(SKOLEMIZED_BLANK_NODE_SEGMENT); if (idx === -1) return null; return uri.slice(0, idx); } diff --git a/packages/publisher/test/dkg-publisher-compat.test.ts b/packages/publisher/test/dkg-publisher-compat.test.ts index b3f4d6db4..162a80bfb 100644 --- a/packages/publisher/test/dkg-publisher-compat.test.ts +++ b/packages/publisher/test/dkg-publisher-compat.test.ts @@ -1,11 +1,16 @@ import { describe, expect, it } from 'vitest'; import { NoChainAdapter } from '@origintrail-official/dkg-chain'; import { + DKG_CHUNK_VALUE, + DKG_HAS_TEXT_BODY, TypedEventBus, generateEd25519Keypair, } from '@origintrail-official/dkg-core'; import { OxigraphStore, type Quad } from '@origintrail-official/dkg-storage'; +import { reconstructChunkedText } from '../../core/test/helpers/chunked-text.js'; import { DKGPublisher } from '../src/dkg-publisher.js'; +import { skolemizeByEntity } from '../src/auto-partition.js'; +import { preparePublicWriteQuads } from '../src/public-write-normalization.js'; function q(s: string, p: string, o: string): Quad { return { @@ -16,18 +21,20 @@ function q(s: string, p: string, o: string): Quad { }; } -async function makePublisher(): Promise { - return new DKGPublisher({ - store: new OxigraphStore(), +async function makePublisher(): Promise<{ publisher: DKGPublisher; store: OxigraphStore }> { + const store = new OxigraphStore(); + const publisher = new DKGPublisher({ + store, chain: new NoChainAdapter(), eventBus: new TypedEventBus(), keypair: await generateEd25519Keypair(), }); + return { publisher, store }; } describe('DKGPublisher compatibility aliases', () => { it('keeps autoPartition as a deprecated alias for skolemizeByEntity', async () => { - const publisher = await makePublisher(); + const { publisher } = await makePublisher(); const quads = [ q('urn:compat:one', 'http://schema.org/name', '"One"'), q('urn:compat:two', 'http://schema.org/name', '"Two"'), @@ -36,22 +43,352 @@ describe('DKGPublisher compatibility aliases', () => { expect(publisher.autoPartition(quads)).toEqual(publisher.skolemizeByEntity(quads)); }); - it('rejects oversized RDF literals at shared-memory producer boundary', async () => { - const publisher = await makePublisher(); + it('chunks oversized schema:text literals at shared-memory producer boundary', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:oversized'; + + await publisher.share('test', [ + q(root, 'http://schema.org/text', oversized), + ], { publisherPeerId: 'test-peer', localOnly: true }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === DKG_HAS_TEXT_BODY + )).toBe(true); + expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals on linked blank nodes before shared-memory writes', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:share-blank-root'; + const child = `${root}/.well-known/genid/body`; + + await publisher.share('test', [ + q(root, 'http://schema.org/hasPart', '_:body'), + q('_:body', 'http://schema.org/text', oversized), + ], { publisherPeerId: 'test-peer', localOnly: true }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(reconstructChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + }); + + it('rejects oversized non-text RDF literals at shared-memory producer boundary', async () => { + const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; await expect( publisher.share('test', [ - q('urn:compat:oversized', 'http://schema.org/text', oversized), + q('urn:compat:oversized-name', 'http://schema.org/name', oversized), ], { publisherPeerId: 'test-peer' }), ).rejects.toMatchObject({ code: 'OVERSIZED_RDF_LITERAL', actualBytes: 60_002, + predicate: 'http://schema.org/name', + }); + }); + + it('keeps reserved-subject failures ahead of large-text chunking', async () => { + const { publisher } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + + await expect( + publisher.share('test', [ + q('urn:dkg:file:reserved', 'http://schema.org/text', oversized), + ], { publisherPeerId: 'test-peer' }), + ).rejects.toThrow(/reserved namespace/i); + }); + + it('rejects oversized conditional-write expected values instead of chunking conditions', async () => { + const { publisher } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + + await expect( + publisher.conditionalShare('test', [ + q('urn:compat:cas', 'http://schema.org/name', '"CAS"'), + ], { + publisherPeerId: 'test-peer', + conditions: [{ + subject: 'urn:compat:cas', + predicate: 'http://schema.org/text', + expectedValue: oversized, + }], + }), + ).rejects.toMatchObject({ + code: 'OVERSIZED_RDF_LITERAL', + predicate: 'http://schema.org/text', + }); + }); + + it('chunks oversized schema:text literals during conditionalShare writes', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:cas-chunked'; + + await publisher.conditionalShare('test', [ + q(root, 'http://schema.org/text', oversized), + ], { + publisherPeerId: 'test-peer', + localOnly: true, + conditions: [{ + subject: root, + predicate: 'http://schema.org/name', + expectedValue: null, + }], + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => quad.predicate === DKG_CHUNK_VALUE)).toBe(true); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals during direct publish', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:publish-oversized'; + + await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/text', oversized)], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals on linked blank nodes during direct publish', async () => { + const { publisher, store } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:publish-blank-root'; + const child = `${root}/.well-known/genid/body`; + + await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [ + q(root, 'http://schema.org/hasPart', '_:body'), + q('_:body', 'http://schema.org/text', oversized), + ], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(reconstructChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals during update', async () => { + const { publisher, store } = await makePublisher(); + const root = 'urn:compat:update-oversized'; + const original = await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/name', '"Before"')], + skipContextGraphEnsure: true, + }); + + await publisher.update(original.kaId, { + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/text', `"${'x'.repeat(60_000)}"`)], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH ?g { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(reconstructChunkedText(result.quads, root)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals on linked blank nodes during update', async () => { + const { publisher, store } = await makePublisher(); + const root = 'urn:compat:update-blank-root'; + const child = `${root}/.well-known/genid/body`; + const original = await publisher.publish({ + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [q(root, 'http://schema.org/name', '"Before"')], + skipContextGraphEnsure: true, + }); + + await publisher.update(original.kaId, { + contextGraphId: 'test', + publisherPeerId: 'test-peer', + quads: [ + q(root, 'http://schema.org/hasPart', '_:body'), + q('_:body', 'http://schema.org/text', `"${'x'.repeat(60_000)}"`), + ], + skipContextGraphEnsure: true, + }); + + const result = await store.query( + 'CONSTRUCT { ?s ?p ?o } WHERE { GRAPH ?g { ?s ?p ?o } }', + ); + expect(result.type).toBe('quads'); + if (result.type !== 'quads') return; + expect(result.quads.some((quad) => + quad.subject === child && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(result.quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/hasPart' && + quad.object === child + )).toBe(true); + expect(reconstructChunkedText(result.quads, child)).toBe('x'.repeat(60_000)); + }); + + it('chunks oversized schema:text literals during assertion write', async () => { + const { publisher } = await makePublisher(); + const oversized = `"${'x'.repeat(60_000)}"`; + const root = 'urn:compat:assertion-oversized'; + const agent = '0x1234567890abcdef1234567890abcdef12345678'; + + await publisher.assertionCreate('test', 'large-text', agent); + await publisher.assertionWrite('test', 'large-text', agent, [ + q(root, 'http://schema.org/text', oversized), + ]); + + const quads = await publisher.assertionQuery('test', 'large-text', agent); + expect(quads.some((quad) => + quad.subject === root && + quad.predicate === 'http://schema.org/text' + )).toBe(false); + expect(reconstructChunkedText(quads, root)).toBe('x'.repeat(60_000)); + }); + + it('preserves already-skolemized quads when normal roots are present', () => { + const externalSkolemized = q( + 'http://example.org/external/.well-known/genid/body', + 'http://schema.org/name', + '"kept"', + ); + + const prepared = preparePublicWriteQuads([ + q('http://example.org/root', 'http://schema.org/name', '"root"'), + externalSkolemized, + ]).quads; + + expect(prepared).toContainEqual(externalSkolemized); + expect(prepared).toHaveLength(2); + }); + + it('uses a blank node inferred root when skolemizing shared blank-node object links', () => { + const rootA = 'http://example.org/root-a'; + const rootB = 'http://example.org/root-b'; + const shared = `${rootA}/.well-known/genid/shared`; + const wrongShared = `${rootB}/.well-known/genid/shared`; + + const prepared = preparePublicWriteQuads([ + q(rootA, 'http://schema.org/hasPart', '_:shared'), + q(rootB, 'http://schema.org/hasPart', '_:shared'), + q('_:shared', 'http://schema.org/name', '"Shared"'), + ]).quads; + + expect(prepared).toContainEqual({ + ...q(rootA, 'http://schema.org/hasPart', '_:shared'), + object: shared, + }); + expect(prepared).toContainEqual({ + ...q(rootB, 'http://schema.org/hasPart', '_:shared'), + object: shared, + }); + expect(prepared).toContainEqual({ + ...q('_:shared', 'http://schema.org/name', '"Shared"'), + subject: shared, + }); + expect(prepared.some((quad) => + quad.subject === wrongShared || quad.object === wrongShared + )).toBe(false); + }); + + it('indexes already-skolemized subjects even when their root subject is absent', () => { + const root = 'http://example.org/external'; + const externalSkolemized = q( + `${root}/.well-known/genid/body`, + 'http://schema.org/hasPart', + '_:nested', + ); + + const partitioned = skolemizeByEntity([ + q('http://example.org/root', 'http://schema.org/name', '"root"'), + externalSkolemized, + q('_:nested', 'http://schema.org/name', '"kept"'), + ]); + + expect(partitioned.get(root)).toContainEqual({ + ...externalSkolemized, + object: `${root}/.well-known/genid/nested`, + }); + expect(partitioned.get(root)).toContainEqual({ + ...q('_:nested', 'http://schema.org/name', '"kept"'), + subject: `${root}/.well-known/genid/nested`, }); }); it('rejects oversized private literals before publish canonicalization', async () => { - const publisher = await makePublisher(); + const { publisher } = await makePublisher(); const oversized = `"${'x'.repeat(60_000)}"`; await expect( diff --git a/packages/publisher/test/pure-functions.test.ts b/packages/publisher/test/pure-functions.test.ts index 474c37e55..848fa1767 100644 --- a/packages/publisher/test/pure-functions.test.ts +++ b/packages/publisher/test/pure-functions.test.ts @@ -100,6 +100,34 @@ describe('autoPartition', () => { const kaQuads = result.get(ENTITY)!; expect(kaQuads.length).toBe(3); }); + + it('uses the inferred blank-node root when multiple roots link to one blank object', () => { + const rootA = 'http://example.org/root-a'; + const rootB = 'http://example.org/root-b'; + const shared = `${rootA}/.well-known/genid/shared`; + const wrongShared = `${rootB}/.well-known/genid/shared`; + const quads: Quad[] = [ + q(rootA, 'http://example.org/hasPart', '_:shared'), + q(rootB, 'http://example.org/mentions', '_:shared'), + q('_:shared', 'http://schema.org/name', '"Shared"'), + ]; + + const result = autoPartition(quads); + const reordered = autoPartition([...quads].reverse()); + const allQuads = [...result.values()].flat(); + const allReorderedQuads = [...reordered.values()].flat(); + + expect(result.get(rootA)).toEqual(expect.arrayContaining([ + q(rootA, 'http://example.org/hasPart', shared), + q(shared, 'http://schema.org/name', '"Shared"'), + ])); + expect(result.get(rootB)).toEqual(expect.arrayContaining([ + q(rootB, 'http://example.org/mentions', shared), + ])); + expect(allQuads.some((quad) => quad.subject === wrongShared || quad.object === wrongShared)).toBe(false); + expect(allReorderedQuads).toEqual(expect.arrayContaining(allQuads)); + expect(allReorderedQuads).toHaveLength(allQuads.length); + }); }); describe('merkle', () => { diff --git a/packages/storage/test/blazegraph.unit.test.ts b/packages/storage/test/blazegraph.unit.test.ts index 066f4e609..16d56a484 100644 --- a/packages/storage/test/blazegraph.unit.test.ts +++ b/packages/storage/test/blazegraph.unit.test.ts @@ -3,6 +3,10 @@ * no live Blazegraph required). */ import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { + DKG_CHUNK_VALUE, + normalizeLargeRdfLiteralsForBlazegraph, +} from '@origintrail-official/dkg-core'; import { BlazegraphStore } from '../src/adapters/blazegraph.js'; describe('BlazegraphStore (mocked HTTP)', () => { @@ -300,6 +304,27 @@ describe('BlazegraphStore (mocked HTTP)', () => { expect(fetchCalls).toHaveLength(0); }); + it('insert accepts producer-normalized chunked schema:text payloads', async () => { + setFetch(async () => new Response(null, { status: 200 })); + const s = new BlazegraphStore(baseUrl); + const original = 'x'.repeat(70_000); + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://s-chunked', + predicate: 'http://schema.org/text', + object: JSON.stringify(original), + graph: 'http://g', + }, + ]); + + await s.insert(normalized.quads); + + expect(fetchCalls).toHaveLength(1); + const body = String(fetchCalls[0][1]?.body); + expect(body).toContain(DKG_CHUNK_VALUE); + expect(body).not.toContain(original); + }); + it('insert allows 25KB ASCII literal (under MUTF-8 limit)', async () => { setFetch(async () => new Response(null, { status: 200 })); const s = new BlazegraphStore(baseUrl); diff --git a/packages/storage/test/sparql-http.test.ts b/packages/storage/test/sparql-http.test.ts index 918737e76..c21b890ec 100644 --- a/packages/storage/test/sparql-http.test.ts +++ b/packages/storage/test/sparql-http.test.ts @@ -1,5 +1,9 @@ import { createServer, type Server } from 'node:http'; import { describe, it, expect, beforeAll, afterAll, vi } from 'vitest'; +import { + DKG_CHUNK_VALUE, + normalizeLargeRdfLiteralsForBlazegraph, +} from '@origintrail-official/dkg-core'; import { SparqlHttpStore, createTripleStore, type Quad, type SparqlHttpSlowQueryEvent } from '../src/index.js'; let server: Server; @@ -115,6 +119,25 @@ describe('SparqlHttpStore (test server)', () => { expect(insertedQuads).toHaveLength(0); }); + it('accepts producer-normalized chunked schema:text payloads', async () => { + insertedQuads.length = 0; + const original = 'x'.repeat(70_000); + const normalized = normalizeLargeRdfLiteralsForBlazegraph([ + { + subject: 'http://ex.org/chunked', + predicate: 'http://schema.org/text', + object: JSON.stringify(original), + graph: 'http://ex.org/g', + }, + ]); + + await store.insert(normalized.quads); + + expect(insertedQuads).toHaveLength(1); + expect(insertedQuads[0]).toContain(DKG_CHUNK_VALUE); + expect(insertedQuads[0]).not.toContain(original); + }); + it('query SELECT sends query to query endpoint and parses bindings', async () => { const result = await store.query( 'SELECT ?name WHERE { GRAPH { ?name } }',