From f42e22a5447a108ce231f93cd6595b72266494a5 Mon Sep 17 00:00:00 2001 From: Branimir Rakic Date: Tue, 30 Jun 2026 21:38:16 +0200 Subject: [PATCH 1/3] =?UTF-8?q?feat(okf):=20Google=20OKF=20=E2=86=92=20DKG?= =?UTF-8?q?=20integration=20(import=20OKF=20bundles=20as=20deterministic,?= =?UTF-8?q?=20provenance-bearing=20KAs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OKF-only carve-out of #1331 (@Zigoljube). The `@origintrail-official/dkg-ip-oracle` package and its CLI command are intentionally EXCLUDED here so OKF can ship on its own and ip-oracle can follow later — this PR is the OKF half only. Imports a Google OKF bundle into a Context Graph as deterministic, provenance-bearing Knowledge Assets: a deterministic offline mapping, a `verify` completeness gate, `--relate` typed edges, `--private` bulk SWM with a resumable manifest, and export. Adds the `@origintrail-official/dkg-okf` package + the `dkg okf` CLI command; removes the ip-oracle dep/registration/coverage tier from the cli wiring. Validated live on Base mainnet and against a v10.0.1 edge node during #1331 review. okf 77 / okf-subcommands 12 green; 21/21 build. Note: private OKF→VM mainnet publishing depends on the V10 leaf-canonicalization fix (#1386); the import tooling ships here, but hold mainnet private OKF→VM publishes until #1386 is released. 
Co-Authored-By: Zigoljube <257808628+Zigoljube@users.noreply.github.com> Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/cli/package.json | 1 + packages/cli/src/cli.ts | 2 + packages/cli/src/commands/okf.ts | 823 ++++++++++++++++++ packages/cli/test/okf-subcommands.test.ts | 514 +++++++++++ packages/okf/CONTEXT.md | 123 +++ packages/okf/DEMO.md | 226 +++++ packages/okf/README.md | 55 ++ packages/okf/integration.okf.json | 31 + packages/okf/package.json | 38 + packages/okf/src/bundle.ts | 138 +++ packages/okf/src/constants.ts | 60 ++ packages/okf/src/document.ts | 82 ++ packages/okf/src/export.ts | 225 +++++ packages/okf/src/index.ts | 49 ++ packages/okf/src/loader.ts | 47 + packages/okf/src/mapping.ts | 303 +++++++ packages/okf/src/nquads.ts | 27 + packages/okf/src/paths.ts | 132 +++ packages/okf/src/types.ts | 144 +++ packages/okf/src/utils.ts | 59 ++ packages/okf/src/validation.ts | 82 ++ packages/okf/test/bundle.test.ts | 181 ++++ packages/okf/test/document.test.ts | 45 + packages/okf/test/edge-cases.test.ts | 85 ++ .../fixtures/crypto_bitcoin.ATTRIBUTION.md | 17 + .../crypto_bitcoin/datasets/crypto_bitcoin.md | 55 ++ .../fixtures/crypto_bitcoin/datasets/index.md | 3 + .../okf/test/fixtures/crypto_bitcoin/index.md | 4 + .../fixtures/crypto_bitcoin/tables/blocks.md | 32 + .../fixtures/crypto_bitcoin/tables/index.md | 6 + .../fixtures/crypto_bitcoin/tables/inputs.md | 65 ++ .../fixtures/crypto_bitcoin/tables/outputs.md | 68 ++ .../crypto_bitcoin/tables/transactions.md | 112 +++ .../okf/test/fixtures/edge_cases/extras.md | 13 + .../okf/test/fixtures/edge_cases/index.md | 8 + packages/okf/test/fixtures/edge_cases/log.md | 9 + .../okf/test/fixtures/edge_cases/type_only.md | 7 + .../okf/test/fixtures/synthetic_links/beta.md | 8 + .../okf/test/fixtures/synthetic_links/hub.md | 18 + .../test/fixtures/synthetic_links/index.md | 6 + .../fixtures/synthetic_links/tables/alpha.md | 9 + .../fixtures/synthetic_links/tables/gamma.md | 8 + 
.../fixtures/synthetic_links/tables/index.md | 4 + packages/okf/test/internals.test.ts | 142 +++ packages/okf/test/loader.test.ts | 34 + packages/okf/test/mapping.test.ts | 140 +++ packages/okf/test/paths.test.ts | 106 +++ packages/okf/test/roundtrip.test.ts | 99 +++ packages/okf/tsconfig.json | 9 + packages/okf/vitest.config.ts | 17 + pnpm-lock.yaml | 28 + vitest.coverage.ts | 7 + 52 files changed, 4506 insertions(+) create mode 100644 packages/cli/src/commands/okf.ts create mode 100644 packages/cli/test/okf-subcommands.test.ts create mode 100644 packages/okf/CONTEXT.md create mode 100644 packages/okf/DEMO.md create mode 100644 packages/okf/README.md create mode 100644 packages/okf/integration.okf.json create mode 100644 packages/okf/package.json create mode 100644 packages/okf/src/bundle.ts create mode 100644 packages/okf/src/constants.ts create mode 100644 packages/okf/src/document.ts create mode 100644 packages/okf/src/export.ts create mode 100644 packages/okf/src/index.ts create mode 100644 packages/okf/src/loader.ts create mode 100644 packages/okf/src/mapping.ts create mode 100644 packages/okf/src/nquads.ts create mode 100644 packages/okf/src/paths.ts create mode 100644 packages/okf/src/types.ts create mode 100644 packages/okf/src/utils.ts create mode 100644 packages/okf/src/validation.ts create mode 100644 packages/okf/test/bundle.test.ts create mode 100644 packages/okf/test/document.test.ts create mode 100644 packages/okf/test/edge-cases.test.ts create mode 100644 packages/okf/test/fixtures/crypto_bitcoin.ATTRIBUTION.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/datasets/crypto_bitcoin.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/datasets/index.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/index.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/tables/blocks.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/tables/index.md create mode 100644 
packages/okf/test/fixtures/crypto_bitcoin/tables/inputs.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/tables/outputs.md create mode 100644 packages/okf/test/fixtures/crypto_bitcoin/tables/transactions.md create mode 100644 packages/okf/test/fixtures/edge_cases/extras.md create mode 100644 packages/okf/test/fixtures/edge_cases/index.md create mode 100644 packages/okf/test/fixtures/edge_cases/log.md create mode 100644 packages/okf/test/fixtures/edge_cases/type_only.md create mode 100644 packages/okf/test/fixtures/synthetic_links/beta.md create mode 100644 packages/okf/test/fixtures/synthetic_links/hub.md create mode 100644 packages/okf/test/fixtures/synthetic_links/index.md create mode 100644 packages/okf/test/fixtures/synthetic_links/tables/alpha.md create mode 100644 packages/okf/test/fixtures/synthetic_links/tables/gamma.md create mode 100644 packages/okf/test/fixtures/synthetic_links/tables/index.md create mode 100644 packages/okf/test/internals.test.ts create mode 100644 packages/okf/test/loader.test.ts create mode 100644 packages/okf/test/mapping.test.ts create mode 100644 packages/okf/test/paths.test.ts create mode 100644 packages/okf/test/roundtrip.test.ts create mode 100644 packages/okf/tsconfig.json create mode 100644 packages/okf/vitest.config.ts diff --git a/packages/cli/package.json b/packages/cli/package.json index 411b2da85..9eefba90e 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -38,6 +38,7 @@ "@origintrail-official/dkg-core": "workspace:*", "@origintrail-official/dkg-mcp": "workspace:*", "@origintrail-official/dkg-epcis": "workspace:*", + "@origintrail-official/dkg-okf": "workspace:*", "@origintrail-official/dkg-node-ui": "workspace:*", "@origintrail-official/dkg-publisher": "workspace:*", "@origintrail-official/dkg-storage": "workspace:*", diff --git a/packages/cli/src/cli.ts b/packages/cli/src/cli.ts index f6d5930c3..1f6b88a1e 100644 --- a/packages/cli/src/cli.ts +++ b/packages/cli/src/cli.ts @@ -26,6 
+26,7 @@ import { registerNodeOpsCommands } from './commands/node-ops.js'; import { registerQueryCatalogCommand } from './commands/query-catalog.js'; import { registerMaintenanceCommands } from './commands/maintenance.js'; import { registerRandomSamplingCommand } from './commands/random-sampling.js'; +import { registerOkfCommand } from './commands/okf.js'; const program = new Command(); program @@ -56,6 +57,7 @@ registerNodeOpsCommands(program); registerQueryCatalogCommand(program); registerMaintenanceCommands(program); registerRandomSamplingCommand(program); +registerOkfCommand(program); // ─── dkg integration ───────────────────────────────────────────────── diff --git a/packages/cli/src/commands/okf.ts b/packages/cli/src/commands/okf.ts new file mode 100644 index 000000000..5b022cb1b --- /dev/null +++ b/packages/cli/src/commands/okf.ts @@ -0,0 +1,823 @@ +import { Command } from 'commander'; +import { writeFile, mkdir, readFile } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; +import { dirname, join, resolve, relative, isAbsolute } from 'node:path'; +import { toErrorMessage } from '@origintrail-official/dkg-core'; +import { + loadBundleDirWithReport, + importBundle, + exportBundle, + validateBundle, + quadsToNQuads, + conceptIdToKaName, + DEFAULT_IRI_BASE, + RDF_TYPE, + SCHEMA_NS, + SECTION_GENID_INFIX, + type BundleImport, + type Quad, + type TypeRelation, +} from '@origintrail-official/dkg-okf'; +import { ApiClient } from '../api-client.js'; +import type { ActionOpts } from '../cli-helpers.js'; + +/** + * `dkg okf` — ingest a Google Open Knowledge Format (OKF) bundle into the DKG as + * verifiable, owned Knowledge Assets (reconstructing the cross-concept link + * graph), and serialise a Context Graph back into a conformant OKF bundle. + * + * The OKF→RDF mapping is the pure, deterministic mapper in + * `@origintrail-official/dkg-okf`; this command is the thin node-facing wrapper + * (mirrors `dkg epcis`). 
Import defaults to **Working Memory** (free, private, + * reversible) and NEVER publishes to Verifiable Memory: `--share` advances the + * assets to Shared Working Memory (free, team-visible); on-chain VM promotion is + * a separate, explicitly-gated capstone (`dkg knowledge publish` / the DEMO + * runbook), not part of import. + */ +export function registerOkfCommand(program: Command): void { + // Bulk-import chunking contract (ADR 0002): ≤5,000 quads per wm/write. + const CHUNK = 5000; + + const OKF_EXIT_CODES = { + SUCCESS: 0, + UNEXPECTED: 1, + CLIENT_ERROR: 2, + PUBLISHER_UNAVAILABLE: 3, + NOT_FOUND: 4, + } as const; + + function exitCodeForOkfHttpStatus(status: number | undefined): number { + if (status === undefined) return OKF_EXIT_CODES.UNEXPECTED; + if (status >= 200 && status < 300) return OKF_EXIT_CODES.SUCCESS; + if (status === 503) return OKF_EXIT_CODES.PUBLISHER_UNAVAILABLE; + if (status === 404) return OKF_EXIT_CODES.NOT_FOUND; + if (status >= 400 && status < 500) return OKF_EXIT_CODES.CLIENT_ERROR; + return OKF_EXIT_CODES.UNEXPECTED; + } + + function reportOkfError(err: unknown): never { + const httpStatus = (err as { httpStatus?: number })?.httpStatus; + const responseBody = (err as { responseBody?: unknown })?.responseBody; + if (responseBody !== undefined) { + try { + console.log(JSON.stringify(responseBody, null, 2)); + } catch { + // not serialisable + } + } + console.error(toErrorMessage(err)); + process.exit(exitCodeForOkfHttpStatus(httpStatus)); + } + + // KA name for a concept: DKG asset names cannot contain '/', so path + // separators in the concept ID are mapped to '__' (the RDF subject IRI keeps + // the original '/'). See `conceptIdToKaName` in @origintrail-official/dkg-okf. + const conceptKaName = conceptIdToKaName; + + // The daemon's /api/query returns SELECT results as `{ bindings: [...] }` — + // WITHOUT the `type: 'bindings'` discriminator the QueryResult union expects. 
+ // Gating on `result.type === 'bindings'` therefore silently yields no rows + // (the bug that made `okf verify` report 0 for everything). Read `bindings` + // structurally instead. + function bindingsOf(result: unknown): Array> { + if (result && typeof result === 'object' && Array.isArray((result as { bindings?: unknown }).bindings)) { + return (result as { bindings: Array> }).bindings; + } + return []; + } + + // A `/api/query` binding cell can arrive as a bare string OR a SPARQL-JSON + // object (`{ value, type, datatype? }`); calling `.startsWith()`/`.exec()` on + // the object form throws at runtime. Normalise every cell to its string value + // before use (mirrors the daemon's `bindingValue`). The static `QueryResult` + // type annotates cells as strings, but the runtime path can return objects. + const cell = (v: unknown): string => { + if (v === null || v === undefined) return ''; + if (typeof v === 'string') return v; + if (typeof v === 'object' && 'value' in (v as Record)) { + const raw = (v as { value?: unknown }).value; + return raw === null || raw === undefined ? 
'' : String(raw); + } + return String(v); + }; + + function summarize(imported: BundleImport): { + concepts: number; + reservedSkipped: number; + triples: number; + linksResolved: number; + linksBroken: number; + citations: number; + } { + let linksResolved = 0; + let linksBroken = 0; + let citations = 0; + for (const c of imported.concepts) { + linksResolved += new Set(c.resolvedLinks.map((l) => l.targetConceptId)).size; + linksBroken += c.brokenLinks.length; + citations += c.citations.length; + } + return { + concepts: imported.concepts.length, + reservedSkipped: imported.reservedSkipped.length, + triples: imported.quads.length, + linksResolved, + linksBroken, + citations, + }; + } + + const okfCmd = program + .command('okf') + .description('Import / export Google Open Knowledge Format (OKF) bundles as Knowledge Assets'); + + // Collector for repeatable options (commander passes (val, prev)). + const collect = (val: string, prev: string[]): string[] => prev.concat([val]); + + // Parse repeatable `--relate ">="` rules into + // deterministic type-pair edge relations. The predicate is used as-is if it is + // a full IRI, otherwise resolved against schema.org (so `hasPart` → schema:hasPart). + function parseRelateRules(raw: string[]): TypeRelation[] { + return raw.map((rule) => { + const eq = rule.lastIndexOf('='); + const gt = rule.indexOf('>'); + if (eq === -1 || gt === -1 || gt > eq) { + throw new Error(`Invalid --relate rule "${rule}". Expected ">=".`); + } + const from = rule.slice(0, gt).trim(); + const to = rule.slice(gt + 1, eq).trim(); + const predToken = rule.slice(eq + 1).trim(); + if (!from || !to || !predToken) { + throw new Error(`Invalid --relate rule "${rule}". Expected ">=".`); + } + const predicate = predToken.includes('://') ? 
predToken : SCHEMA_NS + predToken; + return { from, to, predicate }; + }); + } + + // ─── dkg okf import ───────────────────────────────────── + okfCmd + .command('import ') + .description('Import an OKF bundle into a Context Graph (defaults to Working Memory)') + .option('--context-graph-id ', 'Target Context Graph') + .option('--sub-graph-name ', 'Sub-graph within the Context Graph') + .option('--iri-base ', `IRI namespace for concept subjects (default ${DEFAULT_IRI_BASE})`) + .option('--include-code-span-links', 'Treat links inside inline code spans as edges (default: off, per CommonMark)') + .option( + '--relate ', + 'Type a cross-concept edge by endpoint types: ">=" ' + + '(repeatable; predicate is a full IRI or a schema.org term, e.g. ' + + '"BigQuery Dataset>BigQuery Table=hasPart"). Default: all edges schema:mentions.', + collect, + [] as string[], + ) + .option('--replace', 'Discard any existing Working-Memory draft for each concept before writing (avoids stale triples when re-importing a changed bundle). WM-only: it does NOT clear already-shared SWM or --private loose quads (those are append/dedupe; to drop removed triples there, recreate the Context Graph).') + .option('--create-context-graph', 'Create the Context Graph if it does not exist') + .option('--share', 'Finalize and advance assets to Shared Working Memory (free, team-visible)') + .option( + '--private', + 'Bulk-write all triples into the (private) Context Graph\'s Shared Working Memory ' + + 'as loose quads — no per-concept finalize. Batched in 5,000-quad chunks with a ' + + 'resumable manifest; the whole bundle is mapped in memory first, so this is ' + + 'practical to ~100k concepts per run (not yet streaming). Content stays ' + + 'gossip-restricted to allowlisted peers. Implies a private CG on --create-context-graph.', + ) + .option( + '--allowed-peer ', + 'Allowlist a peer id on the private Context Graph (repeatable). 
On --create-context-graph the ' + + 'peers seed the allowlist; on an existing CG each is invited.', + collect, + [] as string[], + ) + .option( + '--allow-public-context-graph', + 'Override the safety check that refuses --private bulk writes into an existing Context Graph whose accessPolicy is "public" (which would expose the private substance).', + ) + .option('--manifest ', 'Resumability manifest path (default /.okf-import-manifest.json)') + .option('--dry-run', 'Run the deterministic mapping offline and print the summary; never touch the node') + .option('--print-nquads', 'With --dry-run, also print the canonical N-Quads') + .action(async (bundleDir: string, opts: ActionOpts) => { + try { + if (!existsSync(bundleDir)) { + console.error(`Bundle directory not found: ${bundleDir}`); + process.exit(OKF_EXIT_CODES.UNEXPECTED); + } + const iriBase = opts.iriBase ? String(opts.iriBase) : DEFAULT_IRI_BASE; + const includeCodeSpanLinks = Boolean(opts.includeCodeSpanLinks); + const typeRelations = parseRelateRules( + Array.isArray(opts.relate) ? (opts.relate as string[]) : [], + ); + + const { files, skippedSymlinks } = loadBundleDirWithReport(bundleDir); + for (const s of skippedSymlinks) { + console.error(`Warning: skipped symlinked bundle entry (not followed): ${s}`); + } + const conformance = validateBundle(files); + const imported = importBundle(files, { iriBase, includeCodeSpanLinks, typeRelations }); + const summary = summarize(imported); + + // The deterministic, offline portion — no node required. + if (opts.dryRun) { + console.log( + JSON.stringify( + { + mode: 'dry-run', + memoryLayer: opts.private || opts.share ? 'SWM' : 'WM', + importMode: opts.private ? 
'bulk-private-swm' : 'per-concept', + conformant: conformance.conformant, + okfVersion: imported.okfVersion, + ...summary, + iris: imported.iriByConceptId, + warnings: imported.warnings, + conformanceErrors: conformance.errors, + }, + null, + 2, + ), + ); + if (opts.printNquads) { + process.stdout.write('\n' + quadsToNQuads(imported.quads)); + } + return; + } + + if (!conformance.conformant) { + // §9: a non-conformant bundle is unusual but we only hard-stop on the + // two rules that make triples meaningless (parse / missing type). + console.error('Bundle is not OKF-conformant:'); + for (const e of conformance.errors) console.error(` - ${e}`); + process.exit(OKF_EXIT_CODES.CLIENT_ERROR); + } + + const contextGraphId = opts.contextGraphId ? String(opts.contextGraphId) : undefined; + if (!contextGraphId) { + console.error('--context-graph-id is required (or use --dry-run).'); + process.exit(OKF_EXIT_CODES.UNEXPECTED); + } + const subGraphName = opts.subGraphName ? String(opts.subGraphName) : undefined; + const graph = `did:dkg:context-graph:${contextGraphId}`; + + const isPrivate = Boolean(opts.private); + const allowedPeers = Array.isArray(opts.allowedPeer) + ? (opts.allowedPeer as string[]) + : []; + + const client = await ApiClient.connect(); + + // Ensure the Context Graph exists. + const { exists } = await client.contextGraphExists(contextGraphId); + if (!exists) { + if (!opts.createContextGraph) { + console.error( + `Context Graph "${contextGraphId}" does not exist. Re-run with --create-context-graph to create it.`, + ); + process.exit(OKF_EXIT_CODES.NOT_FOUND); + } + // --private ⇒ accessPolicy 1 (invite-only, off-chain). allowedPeers seed + // the allowlist; register:false keeps it off-chain (no spend). + await client.createContextGraph( + contextGraphId, + contextGraphId, + undefined, + isPrivate ? { private: true, accessPolicy: 1 } : undefined, + isPrivate && allowedPeers.length ? allowedPeers : undefined, + ); + console.log( + `Created ${isPrivate ? 
'private (invite-only) ' : ''}Context Graph "${contextGraphId}"` + + (isPrivate && allowedPeers.length ? ` with ${allowedPeers.length} allowlisted peer(s).` : '.'), + ); + } else if (isPrivate) { + // Existing CG + --private: REFUSE to bulk-write private substance into a + // Context Graph that is publicly readable. accessPolicy comes from the + // daemon's CG list ('public' | 'ownerOnly' | 'allowList'). + const list = await client.listContextGraphs().catch(() => null); + const policy = list?.contextGraphs?.find((c: { id: string }) => c.id === contextGraphId) + ?.accessPolicy; + if (policy === 'public' && !opts.allowPublicContextGraph) { + console.error( + `Refusing --private import: Context Graph "${contextGraphId}" already exists with ` + + `accessPolicy "public". Writing private substance there would expose it. Use a ` + + `private (invite-only) Context Graph, or pass --allow-public-context-graph to override.`, + ); + process.exit(OKF_EXIT_CODES.CLIENT_ERROR); + } + if (policy !== 'allowList' && policy !== 'ownerOnly') { + console.error( + ` warning: could not confirm Context Graph "${contextGraphId}" is invite-only ` + + `(accessPolicy "${policy ?? 'unknown'}"); proceeding with the private write.`, + ); + } + // Invite each allowlisted peer (best-effort; already-member is fine). + for (const peerId of allowedPeers) { + try { + await client.inviteToContextGraph(contextGraphId, peerId); + console.log(` invited peer ${peerId}`); + } catch (e) { + console.error(` could not invite ${peerId}: ${toErrorMessage(e)}`); + } + } + } + + // ─── BULK PRIVATE MODE ────────────────────────────────────────── + // Stream every triple into the private CG's Shared Working Memory as + // loose quads (one shared-memory/write per chunk), NOT as individual + // finalized Knowledge Assets. This is what makes a 100k–10M private + // corpus tractable: the cross-concept link graph (citations, families, + // mentions) is preserved, but there is no per-concept finalize/seal. 
+ // Substance lives ONLY here; the public discoverability signal is a + // separate, gated step. No TRAC, no on-chain anything. + if (isPrivate) { + const allQuads = imported.concepts.flatMap((c) => + c.quads.map((q: Quad) => ({ + subject: q.subject, + predicate: q.predicate, + object: q.object, + graph, + })), + ); + + // Resumable manifest keyed by chunks already acknowledged. + const manifestPath = opts.manifest + ? String(opts.manifest) + : join(bundleDir, '.okf-import-manifest.json'); + let chunksDone = 0; + if (existsSync(manifestPath)) { + try { + const prev = JSON.parse(await readFile(manifestPath, 'utf-8')) as { + contextGraphId?: string; + mode?: string; + chunkSize?: number; + chunksDone?: number; + }; + if ( + prev.contextGraphId === contextGraphId && + prev.mode === 'bulk-private-swm' && + prev.chunkSize === CHUNK && + typeof prev.chunksDone === 'number' + ) { + chunksDone = prev.chunksDone; + } + } catch { + // corrupt manifest → start over (re-writing loose quads is idempotent) + } + } + + const totalChunks = Math.ceil(allQuads.length / CHUNK); + const started = Date.now(); + let triplesWritten = 0; + let skolemized = 0; + + // Write one slice, halving the chunk on a 413 (payload too large) + // down to a floor, so a too-big batch degrades instead of failing. + const writeSlice = async (slice: typeof allQuads): Promise => { + try { + const res = await client.sharedMemoryWrite(contextGraphId, slice); + triplesWritten += res.triplesWritten ?? slice.length; + skolemized += res.skolemizedBlankNodes ?? 
0; + } catch (e) { + const status = (e as { httpStatus?: number })?.httpStatus; + if (status === 413 && slice.length > 1) { + const mid = Math.floor(slice.length / 2); + console.error(` 413 on ${slice.length} quads — splitting into ${mid}/${slice.length - mid}`); + await writeSlice(slice.slice(0, mid)); + await writeSlice(slice.slice(mid)); + return; + } + throw e; + } + }; + + let lastTick = started; + for (let ci = chunksDone; ci < totalChunks; ci++) { + const slice = allQuads.slice(ci * CHUNK, (ci + 1) * CHUNK); + const before = triplesWritten; + await writeSlice(slice); + chunksDone = ci + 1; + await writeFile( + manifestPath, + JSON.stringify( + { contextGraphId, mode: 'bulk-private-swm', chunkSize: CHUNK, chunksDone, totalChunks }, + null, + 2, + ), + ); + const now = Date.now(); + // Instantaneous rate for THIS chunk exposes the store-growth slowdown + // (the cost-gate signal); cumulative average alone hides it. + const chunkSecs = (now - lastTick) / 1000; + const instRate = chunkSecs > 0 ? Math.round((triplesWritten - before) / chunkSecs) : triplesWritten - before; + const cumSecs = (now - started) / 1000; + const cumRate = cumSecs > 0 ? Math.round(triplesWritten / cumSecs) : triplesWritten; + lastTick = now; + console.log( + ` SWM(private) chunk ${chunksDone}/${totalChunks} ` + + `(${triplesWritten} triples; this chunk ${instRate} t/s, avg ${cumRate} t/s)`, + ); + } + + const elapsed = (Date.now() - started) / 1000; + console.log( + JSON.stringify( + { + mode: 'import', + importMode: 'bulk-private-swm', + contextGraphId, + datasetPointer: graph, + memoryLayer: 'SWM', + accessPolicy: 'private (invite-only, off-chain)', + allowlistedPeers: allowedPeers.length, + okfVersion: imported.okfVersion, + entities: summary.concepts, + ...summary, + triplesWritten, + skolemizedBlankNodes: skolemized, + chunks: totalChunks, + chunkSize: CHUNK, + elapsedSeconds: Number(elapsed.toFixed(1)), + throughputTriplesPerSec: elapsed > 0 ? 
Math.round(triplesWritten / elapsed) : triplesWritten, + note: + 'Bulk-written to PRIVATE Shared Working Memory (gossip-restricted to allowlisted peers). ' + + 'Substance stays here; no per-concept finalize, no on-chain verification, no TRAC spent. ' + + 'Public discoverability + VM descriptors are separate, gated steps.', + }, + null, + 2, + ), + ); + return; + } + + // Resumability manifest: per-concept STAGE, not just "done". A bare + // done-set would make the documented `import` → `import --share` flow skip + // every concept (already "done" from the WM pass) before finalize/share + // ran, falsely reporting SWM with nothing shared. We record the furthest + // stage each concept reached ('wm' = created+written, 'swm' = finalized+ + // shared) so a later --share advances WM concepts instead of skipping them. + type Stage = 'wm' | 'swm'; + const manifestPath = opts.manifest + ? String(opts.manifest) + : join(bundleDir, '.okf-import-manifest.json'); + const stages = new Map(); + // --replace forces a fresh import: ignore prior stages so every concept is + // re-created (with its WM draft discarded first, below) rather than skipped. 
+ if (!opts.replace && existsSync(manifestPath)) { + try { + const prev = JSON.parse(await readFile(manifestPath, 'utf-8')) as { + contextGraphId?: string; + mode?: string; + stages?: Record; + done?: string[]; // legacy format → treat as reached 'wm' + }; + if (prev.contextGraphId === contextGraphId && prev.mode !== 'bulk-private-swm') { + if (prev.stages && typeof prev.stages === 'object') { + for (const [id, s] of Object.entries(prev.stages)) { + if (s === 'wm' || s === 'swm') stages.set(id, s); + } + } else if (Array.isArray(prev.done)) { + for (const id of prev.done) stages.set(id, 'wm'); + } + } + } catch { + // ignore a corrupt manifest; re-import is idempotent per KA name + } + } + + const persistManifest = async () => + writeFile( + manifestPath, + JSON.stringify( + { contextGraphId, mode: 'per-concept', stages: Object.fromEntries(stages) }, + null, + 2, + ), + ); + + const layer = opts.share ? 'SWM' : 'WM'; + const targetStage: Stage = opts.share ? 'swm' : 'wm'; + let written = 0; + let created = 0; + let shared = 0; + for (const concept of imported.concepts) { + const name = conceptKaName(concept.conceptId); + const current = stages.get(concept.conceptId); + // Already at or past the target stage for this run → nothing to do. + if (current === 'swm' || current === targetStage) continue; + + const needCreate = current === undefined; // not yet written to WM + const needShare = opts.share; // current is undefined or 'wm' here + + if (needCreate) { + const quads = concept.quads.map((q: Quad) => ({ + subject: q.subject, + predicate: q.predicate, + object: q.object, + graph, + })); + if (opts.replace) { + // Discard any existing WM draft so a changed re-import doesn't + // accumulate stale triples on top of the old ones (best-effort: + // there may be nothing to discard). 
+ await client + .knowledgeAssetDiscard(contextGraphId, name, { subGraphName }) + .catch(() => undefined); + } + await client.createKnowledgeAsset(contextGraphId, name, { subGraphName }); + for (let i = 0; i < quads.length; i += CHUNK) { + await client.knowledgeAssetWrite(contextGraphId, name, quads.slice(i, i + CHUNK), { + subGraphName, + }); + } + written += quads.length; + created += 1; + stages.set(concept.conceptId, 'wm'); + await persistManifest(); + } + if (needShare) { + // Advance WM → SWM (works whether the KA was just created or was + // already in WM from a prior `import` run). + await client.knowledgeAssetFinalize(contextGraphId, name, { subGraphName }); + await client.knowledgeAssetShare(contextGraphId, name, { + subGraphName, + entities: 'all', + }); + shared += 1; + stages.set(concept.conceptId, 'swm'); + await persistManifest(); + } + console.log(` ${layer} ${concept.conceptId} → ${concept.iri}` + + (needCreate ? ` (${concept.quads.length} quads)` : ' (advanced WM→SWM)')); + } + + console.log( + JSON.stringify( + { + mode: 'import', + contextGraphId, + memoryLayer: layer, + okfVersion: imported.okfVersion, + ...summary, + triplesWritten: written, + assetsCreated: created, + assetsShared: shared, + note: + layer === 'WM' + ? 'Assets are in private Working Memory (free, reversible). No on-chain verification.' + : 'Assets sealed and shared to Shared Working Memory (free, team-visible). 
No on-chain verification — VM promotion is a separate gated step.', + }, + null, + 2, + ), + ); + } catch (err) { + reportOkfError(err); + } + }); + + // ─── dkg okf export ─────────────────────── + okfCmd + .command('export ') + .description('Serialise a Context Graph back into a conformant OKF bundle (clean inverse of import)') + .option('--sub-graph-name ', 'Sub-graph within the Context Graph') + .option('--iri-base ', `IRI namespace concept subjects were minted under (default ${DEFAULT_IRI_BASE})`) + .option('--view ', 'working-memory | shared-working-memory | verifiable-memory', 'shared-working-memory') + .action(async (contextGraphId: string, outDir: string, opts: ActionOpts) => { + try { + const iriBase = opts.iriBase ? String(opts.iriBase) : DEFAULT_IRI_BASE; + // Back-compat: the pre-v10.0.1 token was `verified-memory`; v10.0.1 + // renamed it to `verifiable-memory`. Accept the old spelling and map it + // to the canonical token before querying. + const rawView = String(opts.view ?? 'shared-working-memory'); + const view = (rawView === 'verified-memory' ? 'verifiable-memory' : rawView) as + | 'working-memory' + | 'shared-working-memory' + | 'verifiable-memory'; + + const client = await ApiClient.connect(); + // Fetch all triples whose subject is an OKF concept IRI in this graph. + const sparql = `SELECT ?s ?p ?o WHERE { GRAPH ?g { ?s ?p ?o } FILTER(STRSTARTS(STR(?s), "${iriBase}")) }`; + const { result } = await client.query(sparql, contextGraphId, { + view, + ...(opts.subGraphName ? { subGraphName: String(opts.subGraphName) } : {}), + }); + + const bindings = bindingsOf(result); + + // Daemon bindings are already in N-term form (literals as `"…"`/`"…"^^
`, + // IRIs as raw or `<…>`). Strip IRI brackets so they match the mapper's + // raw-IRI quad-term convention; literals/blank nodes pass through. + const unwrapIri = (term: string): string => + term.startsWith('<') && term.endsWith('>') ? term.slice(1, -1) : term; + + const quadsBySubject = new Map(); + for (const b of bindings) { + const s = cell(b.s); + const p = cell(b.p); + const o = cell(b.o); + if (!s || !p || !o) continue; + const subject = unwrapIri(s); + const quad: Quad = { subject, predicate: unwrapIri(p), object: o }; + if (!quadsBySubject.has(subject)) quadsBySubject.set(subject, []); + quadsBySubject.get(subject)!.push(quad); + } + + // Rebuild a minimal BundleImport for the exporter. Keep only real concept + // roots: subjects under the IRI base that carry an OKF concept rdf:type. + // This excludes the skolemized `dkg:hasSection` nodes + // (`/.well-known/genid/...`), which would otherwise be rebuilt + // as standalone `.well-known/genid/*.md` files that were never concepts. 
+ const concepts = [...quadsBySubject.entries()] + .filter( + ([iri, quads]) => + iri.startsWith(iriBase) && + !iri.includes(SECTION_GENID_INFIX) && + quads.some((q) => q.predicate === RDF_TYPE), + ) + .map(([iri, quads]) => ({ + conceptId: iri.slice(iriBase.length), + iri, + quads, + resolvedLinks: [], + brokenLinks: [], + codeSpanLinks: [], + citations: [], + })); + const imported: BundleImport = { + okfVersion: null, + iriByConceptId: Object.fromEntries(concepts.map((c) => [c.conceptId, c.iri])), + concepts, + reservedSkipped: [], + quads: concepts.flatMap((c) => c.quads), + warnings: [], + }; + + if (concepts.length === 0) { + console.error( + `No OKF concepts (subjects under "${iriBase}") found in Context Graph "${contextGraphId}" (${view}).`, + ); + process.exit(OKF_EXIT_CODES.NOT_FOUND); + } + + const outFiles = exportBundle(imported, { iriBase }); + // Path-traversal guard: concept IDs come from graph subjects (untrusted), + // so a subject like `urn:okf:../../escape` would otherwise write outside + // outDir. Refuse any file that doesn't resolve under the output directory. 
+ const outRoot = resolve(outDir); + for (const f of outFiles) { + const full = resolve(outDir, f.path); + const rel = relative(outRoot, full); + if (rel === '' || rel.startsWith('..') || isAbsolute(rel)) { + console.error(`Refusing to write outside the output directory: "${f.path}"`); + process.exit(OKF_EXIT_CODES.CLIENT_ERROR); + } + await mkdir(dirname(full), { recursive: true }); + await writeFile(full, f.content, 'utf-8'); + } + console.log( + JSON.stringify( + { mode: 'export', contextGraphId, view, outDir, concepts: concepts.length, files: outFiles.length }, + null, + 2, + ), + ); + } catch (err) { + reportOkfError(err); + } + }); + + // ─── dkg okf verify ───────────────────────────────────── + // Completeness gate for a (private) bulk-imported corpus: map the bundle + // offline, then re-query the Context Graph and compare actual triple counts + // per integrity predicate against what the deterministic mapping expects. + // Turns a silent undercount into an exact, actionable report — and exits + // non-zero on any shortfall so it can gate a pipeline. The fix for a + // shortfall is to re-run the same idempotent `import --private` (a second + // loose-write pass; the store dedupes, so only the dropped triples land). + const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; + // Predicates we treat as integrity signals (order = report order). 
+ const INTEGRITY_PREDICATES = [ + RDF_TYPE, + 'http://schema.org/source', + 'http://schema.org/license', + 'http://schema.org/citation', + 'http://schema.org/mentions', + ]; + + okfCmd + .command('verify ') + .description('Compare a bulk-imported Context Graph against the bundle it was built from (completeness gate)') + .requiredOption('--context-graph-id ', 'Context Graph to verify') + .option('--iri-base ', `IRI namespace concept subjects were minted under (default ${DEFAULT_IRI_BASE})`) + .option('--include-code-span-links', 'Match the import: treat code-span links as edges') + .option( + '--relate ', + 'Match the import: type cross-concept edges by endpoint types ' + + '">=" (repeatable). MUST mirror the --relate rules used at ' + + 'import time — otherwise the offline expectation uses the default schema:mentions mapping ' + + 'and reports a false shortfall against a graph that actually holds the typed predicate.', + collect, + [] as string[], + ) + .option('--list-missing ', 'List up to N concept IRIs missing their rdf:type (best-effort)', (v: string) => parseInt(v, 10)) + .action(async (bundleDir: string, opts: ActionOpts) => { + try { + if (!existsSync(bundleDir)) { + console.error(`Bundle directory not found: ${bundleDir}`); + process.exit(OKF_EXIT_CODES.UNEXPECTED); + } + const iriBase = opts.iriBase ? String(opts.iriBase) : DEFAULT_IRI_BASE; + const contextGraphId = String(opts.contextGraphId); + // The offline expectation MUST be built with the same edge typing the import used, + // or a --relate'd predicate (e.g. schema:hasPart) is reconstructed as schema:mentions + // and verify reports a false shortfall. Mirror the import's --relate exactly. + const typeRelations = parseRelateRules( + Array.isArray(opts.relate) ? (opts.relate as string[]) : [], + ); + + // Offline: what the deterministic mapping SHOULD have produced. 
+ const { files } = loadBundleDirWithReport(bundleDir); + const imported = importBundle(files, { + iriBase, + includeCodeSpanLinks: Boolean(opts.includeCodeSpanLinks), + typeRelations, + }); + const expectedByPred = new Map(); + for (const q of imported.quads) { + expectedByPred.set(q.predicate, (expectedByPred.get(q.predicate) ?? 0) + 1); + } + const expectedConcepts = imported.concepts.length; + + const client = await ApiClient.connect(); + const countFor = async (predicate: string): Promise => { + // Scope the count to subjects under this bundle's IRI base. A graph-wide + // count would let unrelated pre-existing triples mask a real shortfall + // (or inflate it) and report "complete" when concepts are actually missing. + const sparql = + `SELECT (COUNT(*) AS ?c) WHERE { GRAPH ?g { ?s <${predicate}> ?o ` + + `FILTER(STRSTARTS(STR(?s), "${iriBase}")) } }`; + const { result } = await client.query(sparql, contextGraphId, { + includeSharedMemory: true, + }); + const bindings = bindingsOf(result); + const raw = cell(bindings[0]?.c) || '0'; + const m = /^"?(\d+)"?/.exec(raw); + return m ? parseInt(m[1], 10) : 0; + }; + + // Verify EVERY predicate the bundle actually produced, not a fixed + // allowlist — otherwise a shortfall in e.g. schema:hasPart (--relate), + // schema:dateModified or producer-extra keys would be invisible. The + // INTEGRITY_PREDICATES list only fixes the leading report order. + const orderedPreds = [ + ...INTEGRITY_PREDICATES.filter((p) => expectedByPred.has(p)), + ...[...expectedByPred.keys()].filter((p) => !INTEGRITY_PREDICATES.includes(p)).sort(), + ]; + const rows: Array<{ predicate: string; expected: number; actual: number; missing: number }> = []; + for (const predicate of orderedPreds) { + const expected = expectedByPred.get(predicate) ?? 
0; + if (expected === 0) continue; // predicate not used by this bundle + const actual = await countFor(predicate); + rows.push({ predicate, expected, actual, missing: Math.max(0, expected - actual) }); + } + + // Best-effort concept-level diff (subjects present with rdf:type). + let missingConcepts: string[] | undefined; + if (opts.listMissing) { + const sparql = `SELECT DISTINCT ?s WHERE { GRAPH ?g { ?s <${RDF_TYPE}> ?o } }`; + const { result } = await client.query(sparql, contextGraphId, { includeSharedMemory: true }); + const bindings = bindingsOf(result); + const unwrap = (t: string): string => (t.startsWith('<') && t.endsWith('>') ? t.slice(1, -1) : t); + const present = new Set(bindings.map((b) => unwrap(cell(b.s)))); + missingConcepts = imported.concepts + .map((c) => c.iri) + .filter((iri) => !present.has(iri)) + .slice(0, Number(opts.listMissing)); + } + + const totalMissing = rows.reduce((a, r) => a + r.missing, 0); + const complete = totalMissing === 0; + console.log( + JSON.stringify( + { + mode: 'verify', + contextGraphId, + expectedConcepts, + complete, + predicates: rows, + totalMissingTriples: totalMissing, + ...(missingConcepts ? { missingConcepts } : {}), + note: complete + ? 'Context Graph matches the bundle on all integrity predicates.' + : 'SHORTFALL: the node\'s SWM holds fewer triples than the bundle defines. 
' + + 'Re-run the same `dkg okf import --private` (idempotent second pass; the store ' + + 'dedupes, so only the dropped triples are re-written), then verify again.', + }, + null, + 2, + ), + ); + if (!complete) process.exit(OKF_EXIT_CODES.CLIENT_ERROR); + } catch (err) { + reportOkfError(err); + } + }); +} diff --git a/packages/cli/test/okf-subcommands.test.ts b/packages/cli/test/okf-subcommands.test.ts new file mode 100644 index 000000000..5a6bf3182 --- /dev/null +++ b/packages/cli/test/okf-subcommands.test.ts @@ -0,0 +1,514 @@ +import { describe, expect, it, beforeAll, afterAll } from 'vitest'; +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; +import { mkdtemp, writeFile, mkdir, readFile, readdir, rm, symlink } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { + createServer, + type IncomingMessage, + type Server, + type ServerResponse, +} from 'node:http'; +import type { AddressInfo } from 'node:net'; + +// CLI subcommand tests for `dkg okf {import,export}` against a tiny in-process +// stub that mimics the daemon's knowledge-asset / shared-memory / query routes. +// The CLI talks to the stub via the standard DKG_API_PORT + auth-token channel +// ApiClient.connect() reads, so these run the compiled CLI binary end-to-end +// without booting the daemon. They lock down the behaviours the pure mapper +// tests cannot: dry-run must NOT connect, WM import vs --share advance, the +// --private bulk SWM path + manifest, and the export skolem-node filter. 
+ +const execFileAsync = promisify(execFile); +const __dirname = dirname(fileURLToPath(import.meta.url)); +const CLI_ENTRY = join(__dirname, '..', 'dist', 'cli.js'); + +interface StubCall { + method: string; + url: string; + authorization?: string; + body: string; +} + +interface StubResult { + status: number; + body: unknown; +} +type StubHandler = (req: IncomingMessage, body: string) => StubResult; + +function startStub(): Promise<{ + port: number; + setHandler: (h: StubHandler) => void; + calls: StubCall[]; + close: () => Promise; +}> { + return new Promise((resolve) => { + let handler: StubHandler = () => ({ status: 500, body: { error: 'No handler installed' } }); + const calls: StubCall[] = []; + const server: Server = createServer((req: IncomingMessage, res: ServerResponse) => { + const chunks: Buffer[] = []; + req.on('data', (c) => chunks.push(c as Buffer)); + req.on('end', () => { + const raw = Buffer.concat(chunks).toString('utf-8'); + calls.push({ + method: req.method ?? '', + url: req.url ?? '', + authorization: req.headers.authorization, + body: raw, + }); + const result = handler(req, raw); + res.writeHead(result.status, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify(result.body)); + }); + }); + server.listen(0, '127.0.0.1', () => { + resolve({ + port: (server.address() as AddressInfo).port, + setHandler: (h) => { + handler = h; + }, + calls, + close: () => new Promise((r, j) => server.close((e) => (e ? j(e) : r()))), + }); + }); + }); +} + +/** + * A stub daemon that records calls and serves the knowledge-asset lifecycle + * with minimal success bodies. Tracks created Context Graphs so + * `context-graph/exists` answers truthfully across CLI invocations. + */ +function okfDaemonHandler(createdCGs: Set): StubHandler { + return (req, body) => { + const url = new URL(`http://x${req.url ?? ''}`); + const path = url.pathname; + const m = req.method ?? 
''; + if (m === 'GET' && path === '/api/context-graph/exists') { + return { status: 200, body: { id: url.searchParams.get('id'), exists: createdCGs.has(url.searchParams.get('id') ?? '') } }; + } + if (m === 'POST' && path === '/api/context-graph/create') { + const id = JSON.parse(body || '{}').id as string; + createdCGs.add(id); + return { status: 200, body: { created: id, uri: `did:dkg:context-graph:${id}` } }; + } + if (m === 'POST' && path === '/api/context-graph/invite') { + return { status: 200, body: { invited: 'ok', contextGraphId: JSON.parse(body || '{}').contextGraphId } }; + } + if (m === 'POST' && path === '/api/knowledge-assets') { + return { status: 200, body: { created: true } }; + } + if (m === 'POST' && /\/api\/knowledge-assets\/.+\/wm\/write$/.test(path)) { + const quads = JSON.parse(body || '{}').quads ?? []; + return { status: 200, body: { written: quads.length } }; + } + if (m === 'POST' && /\/api\/knowledge-assets\/.+\/wm\/discard$/.test(path)) { + return { status: 200, body: { discarded: true } }; + } + if (m === 'POST' && /\/api\/knowledge-assets\/.+\/wm\/finalize$/.test(path)) { + return { status: 200, body: { merkleRoot: '0xroot', eip712Digest: '0xdig' } }; + } + if (m === 'POST' && /\/api\/knowledge-assets\/.+\/swm\/share$/.test(path)) { + return { status: 200, body: { swmShared: true, promotedCount: 1 } }; + } + if (m === 'POST' && path === '/api/shared-memory/write') { + const quads = JSON.parse(body || '{}').quads ?? 
[]; + return { status: 200, body: { shareOperationId: 'op-1', contextGraphId: 'x', graph: 'g', triplesWritten: quads.length } }; + } + return { status: 404, body: { error: 'NotFound', path } }; + }; +} + +async function runCli( + args: string[], + env: { DKG_API_PORT: string; DKG_HOME: string }, +): Promise<{ exitCode: number; stdout: string; stderr: string }> { + try { + const { stdout, stderr } = await execFileAsync('node', [CLI_ENTRY, ...args], { + env: { ...process.env, ...env }, + }); + return { exitCode: 0, stdout, stderr }; + } catch (err) { + const c = err as NodeJS.ErrnoException & { code?: number | string; stdout?: string; stderr?: string }; + return { exitCode: typeof c.code === 'number' ? c.code : 1, stdout: c.stdout ?? '', stderr: c.stderr ?? '' }; + } +} + +/** + * Import prints human progress lines ("Created Context Graph …", " WM a → …") + * before the final JSON summary, so parse from the first `{` (none of the + * progress lines contain a brace). + */ +function parseJsonTail(stdout: string): Record { + const i = stdout.indexOf('{'); + return JSON.parse(stdout.slice(i)) as Record; +} + +/** A minimal conformant 2-concept OKF bundle in a fresh temp dir. 
*/ +async function makeBundle(): Promise { + const dir = await mkdtemp(join(tmpdir(), 'okf-bundle-')); + await writeFile(join(dir, 'index.md'), '---\nokf_version: "0.1"\n---\n\n# Root\n'); + await writeFile(join(dir, 'a.md'), '---\ntype: Thing\ntitle: A\n---\n\n# Notes\n\nSee [b](b.md).\n'); + await writeFile(join(dir, 'b.md'), '---\ntype: Thing\ntitle: B\n---\n\nplain body\n'); + return dir; +} + +describe.sequential('dkg okf subcommands', { timeout: 120_000 }, () => { + let stub: Awaited>; + let dkgHome: string; + const createdCGs = new Set(); + + beforeAll(async () => { + if (!existsSync(CLI_ENTRY)) { + await execFileAsync('pnpm', ['build'], { cwd: join(__dirname, '..') }); + } + stub = await startStub(); + stub.setHandler(okfDaemonHandler(createdCGs)); + dkgHome = await mkdtemp(join(tmpdir(), 'dkg-okf-cli-')); + await writeFile( + join(dkgHome, 'config.json'), + JSON.stringify({ name: 'okf-cli-stub', apiPort: stub.port, listenPort: 0, nodeRole: 'edge', paranets: [] }), + ); + await writeFile(join(dkgHome, 'auth.token'), 'stub-token\n', { mode: 0o600 }); + }, 120_000); + + afterAll(async () => { + if (stub) await stub.close(); + if (dkgHome) await rm(dkgHome, { recursive: true, force: true }); + }); + + const env = () => ({ DKG_API_PORT: String(stub.port), DKG_HOME: dkgHome }); + const clear = () => { + stub.calls.length = 0; + }; + + it('dry-run prints the mapping and NEVER contacts the node', async () => { + clear(); + const bundle = await makeBundle(); + const r = await runCli(['okf', 'import', bundle, '--context-graph-id', 'cg', '--dry-run'], env()); + expect(r.exitCode).toBe(0); + const out = parseJsonTail(r.stdout); + expect(out.mode).toBe('dry-run'); + expect(out.concepts).toBe(2); + expect(out.conformant).toBe(true); + // The whole point of --dry-run: zero node calls. 
+ expect(stub.calls).toHaveLength(0); + }); + + it('WM import creates KAs and writes quads, but does NOT finalize/share', async () => { + clear(); + const bundle = await makeBundle(); + const r = await runCli( + ['okf', 'import', bundle, '--context-graph-id', 'cg-wm', '--create-context-graph'], + env(), + ); + expect(r.exitCode).toBe(0); + const out = parseJsonTail(r.stdout); + expect(out.memoryLayer).toBe('WM'); + expect(out.assetsCreated).toBe(2); + expect(out.assetsShared).toBe(0); + + const paths = stub.calls.map((c) => `${c.method} ${c.url.split('?')[0]}`); + expect(paths).toContain('POST /api/context-graph/create'); + expect(paths.filter((p) => p === 'POST /api/knowledge-assets')).toHaveLength(2); + expect(paths.some((p) => p.endsWith('/wm/write'))).toBe(true); + // No sealing/sharing in a plain WM import. + expect(paths.some((p) => p.endsWith('/wm/finalize'))).toBe(false); + expect(paths.some((p) => p.endsWith('/swm/share'))).toBe(false); + + // Manifest records the per-concept stage as 'wm'. + const manifest = JSON.parse(await readFile(join(bundle, '.okf-import-manifest.json'), 'utf-8')); + expect(manifest.mode).toBe('per-concept'); + expect(manifest.stages).toEqual({ a: 'wm', b: 'wm' }); + }); + + it('import then import --share ADVANCES WM→SWM (does not skip finalize/share)', async () => { + const bundle = await makeBundle(); + // First: a plain WM import. + const wm = await runCli( + ['okf', 'import', bundle, '--context-graph-id', 'cg-share', '--create-context-graph'], + env(), + ); + expect(wm.exitCode).toBe(0); + + // Then: re-run with --share. The bug was that the manifest's "done" set made + // this skip every concept; it must instead finalize + share each one. 
+ clear(); + const r = await runCli(['okf', 'import', bundle, '--context-graph-id', 'cg-share', '--share'], env()); + expect(r.exitCode).toBe(0); + const out = parseJsonTail(r.stdout); + expect(out.memoryLayer).toBe('SWM'); + expect(out.assetsShared).toBe(2); + expect(out.assetsCreated).toBe(0); // already in WM — not recreated + + const paths = stub.calls.map((c) => `${c.method} ${c.url.split('?')[0]}`); + expect(paths.filter((p) => p.endsWith('/wm/finalize'))).toHaveLength(2); + expect(paths.filter((p) => p.endsWith('/swm/share'))).toHaveLength(2); + // Must NOT re-create the assets that are already in WM. + expect(paths.some((p) => p === 'POST /api/knowledge-assets')).toBe(false); + + const manifest = JSON.parse(await readFile(join(bundle, '.okf-import-manifest.json'), 'utf-8')); + expect(manifest.stages).toEqual({ a: 'swm', b: 'swm' }); + }); + + it('--private bulk-streams quads via /api/shared-memory/write with a chunked manifest', async () => { + clear(); + const bundle = await makeBundle(); + const r = await runCli( + ['okf', 'import', bundle, '--context-graph-id', 'cg-priv', '--private', '--create-context-graph'], + env(), + ); + expect(r.exitCode).toBe(0); + const out = parseJsonTail(r.stdout); + expect(out.importMode).toBe('bulk-private-swm'); + expect(out.datasetPointer).toBe('did:dkg:context-graph:cg-priv'); + expect(out.triplesWritten).toBeGreaterThan(0); + + const paths = stub.calls.map((c) => `${c.method} ${c.url.split('?')[0]}`); + expect(paths).toContain('POST /api/shared-memory/write'); + // The private bulk path never creates per-concept KAs. + expect(paths.some((p) => p === 'POST /api/knowledge-assets')).toBe(false); + + // PRIVACY CONTRACT: the Context Graph must be created invite-only. A regression + // that dropped `{ private: true, accessPolicy: 1 }` would bulk-write the corpus + // into a public CG — exactly the substance-leak this mode must prevent. 
+ const createCall = stub.calls.find( + (c) => c.method === 'POST' && c.url.split('?')[0] === '/api/context-graph/create', + ); + expect(createCall).toBeDefined(); + const createBody = JSON.parse(createCall!.body || '{}'); + expect(createBody.private).toBe(true); + expect(createBody.accessPolicy).toBe(1); + + const manifest = JSON.parse(await readFile(join(bundle, '.okf-import-manifest.json'), 'utf-8')); + expect(manifest.mode).toBe('bulk-private-swm'); + expect(manifest.chunksDone).toBe(manifest.totalChunks); + }); + + it('export filters skolemized section nodes (no .well-known/genid files)', async () => { + clear(); + const outDir = await mkdtemp(join(tmpdir(), 'okf-export-')); + // Query returns a real concept (has rdf:type) AND a section genid subject + // (schema:name only). Only the concept should become a file. + stub.setHandler(() => ({ + status: 200, + body: { + result: { + bindings: [ + { s: 'urn:okf:a', p: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', o: 'http://schema.org/Thing' }, + { s: 'urn:okf:a', p: 'http://schema.org/name', o: '"A"' }, + { s: 'urn:okf:a/.well-known/genid/okfsec_a_0', p: 'http://schema.org/name', o: '"Notes"' }, + ], + }, + }, + })); + const r = await runCli(['okf', 'export', 'cg-x', outDir], env()); + stub.setHandler(okfDaemonHandler(createdCGs)); // restore for any later tests + expect(r.exitCode).toBe(0); + const out = parseJsonTail(r.stdout); + expect(out.concepts).toBe(1); + + const files = await readdir(outDir, { recursive: true } as { recursive: true }); + const flat = (files as string[]).map(String); + expect(flat).toContain('a.md'); + expect(flat.some((f) => f.includes('genid') || f.includes('.well-known'))).toBe(false); + await rm(outDir, { recursive: true, force: true }); + await mkdir(outDir, { recursive: true }).catch(() => {}); + }); + + it('export refuses to write a subject that escapes the output directory', async () => { + clear(); + const outDir = await mkdtemp(join(tmpdir(), 'okf-export-trav-')); + // A 
hostile graph subject with ../ would otherwise write outside outDir. + stub.setHandler(() => ({ + status: 200, + body: { + result: { + bindings: [ + { s: 'urn:okf:../../escaped', p: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', o: 'http://schema.org/Thing' }, + { s: 'urn:okf:../../escaped', p: 'http://schema.org/name', o: '"x"' }, + ], + }, + }, + })); + const r = await runCli(['okf', 'export', 'cg-x', outDir], env()); + stub.setHandler(okfDaemonHandler(createdCGs)); + expect(r.exitCode).not.toBe(0); + expect(r.stderr).toContain('Refusing to write outside the output directory'); + await rm(outDir, { recursive: true, force: true }); + }); + + it('--relate types dataset→table edges as hasPart (dry-run, deterministic)', async () => { + clear(); + const dir = await mkdtemp(join(tmpdir(), 'okf-relate-')); + await writeFile(join(dir, 'index.md'), '---\nokf_version: "0.1"\n---\n# Root\n'); + await writeFile(join(dir, 'ds.md'), '---\ntype: Dataset\ntitle: DS\n---\n\nSee [t](t.md).\n'); + await writeFile(join(dir, 't.md'), '---\ntype: Table\ntitle: T\n---\n\nplain\n'); + const r = await runCli( + ['okf', 'import', dir, '--context-graph-id', 'cg', '--dry-run', '--print-nquads', + '--relate', 'Dataset>Table=hasPart'], + env(), + ); + expect(r.exitCode).toBe(0); + expect(stub.calls).toHaveLength(0); // dry-run never connects + expect(r.stdout).toContain(' '); + expect(r.stdout).not.toContain(' '); + await rm(dir, { recursive: true, force: true }); + }); + + it('--replace discards the existing WM draft before re-writing', async () => { + const bundle = await makeBundle(); + // First WM import. + await runCli(['okf', 'import', bundle, '--context-graph-id', 'cg-rep', '--create-context-graph'], env()); + // Re-import with --replace: must discard each KA's WM draft, then re-create. 
+ clear(); + const r = await runCli(['okf', 'import', bundle, '--context-graph-id', 'cg-rep', '--replace'], env()); + expect(r.exitCode).toBe(0); + const paths = stub.calls.map((c) => `${c.method} ${c.url.split('?')[0]}`); + expect(paths.filter((p) => p.endsWith('/wm/discard'))).toHaveLength(2); + expect(paths.filter((p) => p === 'POST /api/knowledge-assets')).toHaveLength(2); + await rm(bundle, { recursive: true, force: true }); + }); + + it('warns and skips a symlinked bundle entry (no exfiltration)', async () => { + clear(); + const dir = await mkdtemp(join(tmpdir(), 'okf-sym-')); + await writeFile(join(dir, 'index.md'), '---\nokf_version: "0.1"\n---\n# Root\n'); + await writeFile(join(dir, 'a.md'), '---\ntype: Thing\ntitle: A\n---\n\nbody\n'); + const secret = join(dir, 'secret.txt'); + await writeFile(secret, 'SECRET-XYZ'); + await symlink(secret, join(dir, 'leak.md')); + const r = await runCli(['okf', 'import', dir, '--context-graph-id', 'cg', '--dry-run'], env()); + expect(r.exitCode).toBe(0); + expect(r.stderr).toContain('skipped symlinked bundle entry'); + expect(r.stdout).not.toContain('SECRET-XYZ'); + const out = parseJsonTail(r.stdout); + expect(out.concepts).toBe(1); // only a.md, not the symlink + await rm(dir, { recursive: true, force: true }); + }); + + it('export normalizes SPARQL-JSON object binding cells (not just bare strings)', async () => { + clear(); + const outDir = await mkdtemp(join(tmpdir(), 'okf-export-obj-')); + // The daemon's /api/query can return each cell as `{ value, type, datatype? }` + // rather than a bare string. The command must normalize these — previously + // `unwrapIri(b.s).startsWith` threw on the object form. 
+ stub.setHandler(() => ({ + status: 200, + body: { + result: { + bindings: [ + { + s: { value: 'urn:okf:a', type: 'uri' }, + p: { value: 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', type: 'uri' }, + o: { value: 'http://schema.org/Thing', type: 'uri' }, + }, + { + s: { value: 'urn:okf:a', type: 'uri' }, + p: { value: 'http://schema.org/name', type: 'uri' }, + o: { value: '"A"', type: 'literal' }, + }, + ], + }, + }, + })); + const r = await runCli(['okf', 'export', 'cg-obj', outDir], env()); + stub.setHandler(okfDaemonHandler(createdCGs)); + expect(r.exitCode).toBe(0); + const out = parseJsonTail(r.stdout); + expect(out.concepts).toBe(1); + const files = (await readdir(outDir, { recursive: true } as { recursive: true })) as string[]; + expect(files.map(String)).toContain('a.md'); + await rm(outDir, { recursive: true, force: true }); + }); + + it('verify gates on scoped per-predicate counts and normalizes object-form count cells', async () => { + clear(); + // Clean 2-concept bundle, no headings/links: predicates are rdf:type×2, schema:name×2. + const dir = await mkdtemp(join(tmpdir(), 'okf-verify-')); + await writeFile(join(dir, 'index.md'), '---\nokf_version: "0.1"\n---\n# Root\n'); + await writeFile(join(dir, 'x.md'), '---\ntype: Thing\ntitle: X\n---\n\nplain body\n'); + await writeFile(join(dir, 'y.md'), '---\ntype: Thing\ntitle: Y\n---\n\nplain body\n'); + + const TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; + const NAME = 'http://schema.org/name'; + // COUNT(*) returned as a SPARQL-JSON OBJECT cell (exercises the cell() fix in verify). + const issuedSparql: string[] = []; + const countHandler = + (counts: Record): StubHandler => + (req, body) => { + const sparql = String(JSON.parse(body || '{}').sparql ?? ''); + issuedSparql.push(sparql); + const pred = Object.keys(counts).find((p) => sparql.includes(`<${p}>`)) ?? ''; + return { + status: 200, + body: { result: { bindings: [{ c: { value: String(counts[pred] ?? 
0), type: 'literal' } }] } }, + }; + }; + + // All present → complete, zero missing, exit 0. + stub.setHandler(countHandler({ [TYPE]: 2, [NAME]: 2 })); + const ok = await runCli(['okf', 'verify', dir, '--context-graph-id', 'cg-v'], env()); + expect(ok.exitCode).toBe(0); + const okOut = parseJsonTail(ok.stdout); + expect(okOut.complete).toBe(true); + expect(okOut.totalMissingTriples).toBe(0); + + // SCOPING CONTRACT: every COUNT must be filtered to subjects under the bundle's + // IRI prefix — otherwise unrelated pre-existing graph triples could mask (or + // inflate) a real shortfall and report "complete" while concepts are missing. + // A regression dropping the STRSTARTS filter fails here. + expect(issuedSparql.length).toBeGreaterThan(0); + for (const s of issuedSparql) expect(s).toMatch(/STRSTARTS\(STR\(\?s\)/); + + // One name missing → shortfall → complete:false, non-zero exit (pipeline gate). + stub.setHandler(countHandler({ [TYPE]: 2, [NAME]: 1 })); + const bad = await runCli(['okf', 'verify', dir, '--context-graph-id', 'cg-v'], env()); + stub.setHandler(okfDaemonHandler(createdCGs)); + expect(bad.exitCode).not.toBe(0); + const badOut = parseJsonTail(bad.stdout); + expect(badOut.complete).toBe(false); + expect(badOut.totalMissingTriples).toBe(1); + await rm(dir, { recursive: true, force: true }); + }); + + it('verify --relate mirrors the import mapping (COUNTs the typed predicate, not schema:mentions)', async () => { + clear(); + // A Dataset→Table link: default mapping → schema:mentions; --relate → schema:hasPart. 
+ const dir = await mkdtemp(join(tmpdir(), 'okf-verify-relate-')); + await writeFile(join(dir, 'index.md'), '---\nokf_version: "0.1"\n---\n# Root\n'); + await writeFile(join(dir, 'ds.md'), '---\ntype: Dataset\ntitle: DS\n---\n\nSee [t](t.md).\n'); + await writeFile(join(dir, 't.md'), '---\ntype: Table\ntitle: T\n---\n\nplain\n'); + + const countOne: StubHandler = () => ({ + status: 200, + body: { result: { bindings: [{ c: { value: '1', type: 'literal' } }] } }, + }); + const verifySparql = async (extra: string[]): Promise => { + const issued: string[] = []; + stub.setHandler((req, body) => { + issued.push(String(JSON.parse(body || '{}').sparql ?? '')); + return countOne(); + }); + await runCli(['okf', 'verify', dir, '--context-graph-id', 'cg-vr', ...extra], env()); + return issued.join('\n'); + }; + + // Without --relate: the offline expectation falls back to the default edge predicate. + const def = await verifySparql([]); + expect(def).toContain(''); + + // With --relate: the expectation mirrors the import, so verify COUNTs schema:hasPart + // for the typed edge and NEVER schema:mentions — a real --relate import now verifies + // instead of reporting a false shortfall. + const rel = await verifySparql(['--relate', 'Dataset>Table=hasPart']); + expect(rel).toContain(''); + expect(rel).not.toContain(''); + + stub.setHandler(okfDaemonHandler(createdCGs)); + await rm(dir, { recursive: true, force: true }); + }); +}); diff --git a/packages/okf/CONTEXT.md b/packages/okf/CONTEXT.md new file mode 100644 index 000000000..9b7f442c8 --- /dev/null +++ b/packages/okf/CONTEXT.md @@ -0,0 +1,123 @@ +# OKF + +Deterministic Google Open Knowledge Format (OKF) → DKG mapper. Turns a portable +OKF bundle (Markdown + YAML frontmatter + untyped cross-links) into owned, +verifiable RDF Knowledge Assets, reconstructing the bundle's cross-concept link +graph. Pure, no LLM, no network: the same bundle always yields identical triples +and IRIs. 
The `dkg okf` CLI command is a thin wrapper over this package. + +The framing: OKF standardises *how* knowledge is written and exchanged but ships +**no** verification, provenance or ownership layer (OKF SPEC §1, §10). The DKG +supplies exactly that. This package is the bridge — the trust-and-permanence +backend for OKF. + +## Language + +**Bundle**: +A directory tree of UTF-8 Markdown files; the unit of distribution (OKF §3). Fed +to the mapper as an in-memory `BundleFile[]` (`{ path, content }`, POSIX paths). +`loadBundleDir` is the only filesystem surface; the mapper itself is I/O-free. + +**Concept**: +One non-reserved `.md` file = YAML frontmatter + Markdown body (OKF §4). Each +concept becomes exactly one Knowledge Asset. Reserved `index.md` / `log.md` +files are **not** concepts and are never minted as KAs (OKF §3.1, §6, §7). + +**Concept ID**: +The file's bundle-relative path with `.md` removed (OKF §2) — e.g. +`tables/blocks`. The path *is* the concept's identity. Segment validation agrees +byte-for-byte with the reference agent's `paths.py` (`[A-Za-z0-9_][A-Za-z0-9_.\-]*`). + +**IRI**: +The Knowledge Asset subject IRI, derived deterministically from the concept ID: +`urn:okf:<concept-id>` (configurable base). Same bundle ⇒ same IRIs. This is the +RDF subject; the on-chain UAL is assigned by the node at publish time (it is not +the same thing — see Flagged ambiguities). + +**Link**: +A standard Markdown link `[text](path)` from one concept to another (OKF §5). +Resolved against the bundle (absolute `/abs`, relative `./`, parent `../`, +bare-sibling, extension-less forms) into an **untyped directed edge** +(`schema:mentions`). The kind of relationship lives in prose, not the link +(OKF §5.3) — the mapper never infers FK/join types. Broken links are warnings, +never errors (OKF §5.3, §9). + +**Citation**: +A link (usually an external URL) under a `# Citations` heading (OKF §8), backing +a claim. Mapped to `schema:citation`, semantically distinct from concept edges. 
+ +**Memory layers** (where imported assets live): +- **WM** (Working Memory): private to one agent, free, reversible. The import + default. +- **SWM** (Shared Working Memory): team-visible, gossip-replicated, free, + TTL-bounded. Reached with `--share` (finalize + advance). +- **VM** (Verifiable Memory): on-chain, permanent, costs TRAC. **Never** written + by this package; promotion is a separate, explicitly-gated operator step. + +## Relationships + +- Bundle → many Concepts (+ reserved files, skipped). Pass 1 indexes the bundle + and builds the `conceptId → IRI` map; Pass 2 maps each concept and resolves its + links against that map (so an edge only forms to a concept that exists). +- Concept → one Knowledge Asset (one subject IRI) → many quads (frontmatter + triples + body sections + untyped edges + citations). +- Frontmatter key → RDF predicate via the locked table (ADR 0005). `type` is the + only required key (OKF §9); everything else degrades gracefully when absent. +- Link → `schema:mentions` edge **iff** its resolved target is a concept in the + bundle; otherwise it is a broken-link warning (target may be not-yet-written + knowledge) or, for external URLs, simply ignored as a non-edge. +- **Opt-in typed edges (`typeRelations` / `--relate`).** By default every edge is + `schema:mentions` (zero interpretation, faithful to OKF §5.3's untyped links). + A caller may supply deterministic `(fromType, toType) → predicate` rules to + type edges by their endpoints' OKF `type` — e.g. `BigQuery Dataset → BigQuery + Table = schema:hasPart` (containment) while `Table → Table` stays `mentions`. + This is byte-stable (types come straight from frontmatter, no prose, no LLM) + and **off by default** so the purity guarantee holds unless explicitly opted in. 
+ Caveat: the rule is endpoint-type-based, so it cannot distinguish a same-dataset + containment link from a cross-dataset reference of the same type pair — use it + where that distinction doesn't apply, or leave the default. +- **Round-trip is graph-faithful, not byte-faithful, by design.** `import → + export → import` reproduces an equivalent *semantic graph*, not the original + bytes: free-form prose isn't recoverable from triples, so export regenerates + bodies structurally, and a typed edge (e.g. `hasPart`) exports as a plain + (untyped) OKF link because OKF can't express the relation type. This is a + deliberate choice, not a defect; a future enhancement may have export *add* + provenance (UAL / seal) when serialising from a published graph. + +## Flagged ambiguities + +- **Reuse vs. fork of the Markdown extractor.** The node's `markdown-extractor.ts` + is regex-based and resolves only `[[wikilinks]]`, not OKF's `[text](path)` + links; importing it from `packages/cli` would also create a `cli → okf → cli` + dependency cycle. So we **converge on its predicate vocabulary** (same + `schema:*` / `dkg:hasSection` IRIs, pinned by a test) but use a **real Markdown + AST** (`mdast-util-from-markdown`) for link/section/citation extraction — which + OKF §2 mandates and which is what lets us honour the in-code-span rule below. +- **Links inside inline code spans.** `outputs.md` writes its only two concept + links inside backticks: `` `[transactions](transactions.md)` ``. CommonMark + treats code-span content as literal text, so **by default these are NOT edges** + (the mechanism-first answer). They are recorded as `codeSpanLinks` and surfaced + as warnings. `--include-code-span-links` flips the policy; both behaviours are + tested. +- **IRI derivation / UAL.** Concept subject IRIs are `urn:okf:<concept-id>`, a pure + function of the concept ID.
The on-chain UAL (`did:dkg:<chain>/<contract>/<id>`) is + assigned by the node at VM publish (RFC-43 pre-knowable UALs are still draft) — + do not conflate the two. WM/SWM data carries no on-chain verification. +- **`type` normalisation.** A bare `type` value is PascalCased into the schema.org + namespace (`BigQuery Dataset` → `http://schema.org/BigQueryDataset`); a full IRI + `type` is used unchanged. Round-trips losslessly because PascalCase of the local + name is idempotent. +- **`timestamp` → `schema:dateModified`.** OKF defines `timestamp` as last-modified + time, so we map it to `schema:dateModified` (typed `xsd:dateTime`) rather than + the extractor's naive `schema:timestamp` slug — a deliberate semantic choice. +- **`resource` → `schema:url`.** Chosen over `dcterms:source`; documented in ADR 0005. +- **Citations, two styles.** Both numbered (`[1] [text](url)`) and bare-bullet + (`- https://…`) forms are parsed leniently; deduplicated by URL. +- **Folder hierarchy.** `schema:isPartOf` from directory structure is **off by + default** — directories are not concepts, and minting them as graph nodes would + muddy the concept graph. Available via `emitFolderHierarchy`. +- **Producer-defined keys** are always preserved (camelCased into schema.org), + never dropped or rejected (OKF §4.1, §9). +- **Conformance is permissive.** Only two rules make a bundle non-conformant + (unparseable frontmatter; missing non-empty `type`). Missing optionals, unknown + types/keys, broken links and missing `index.md` are tolerated (OKF §9).
diff --git a/packages/okf/DEMO.md b/packages/okf/DEMO.md new file mode 100644 index 000000000..6c5950f31 --- /dev/null +++ b/packages/okf/DEMO.md @@ -0,0 +1,226 @@ +# OKF ↔ DKG live demo runbook + +End-to-end demonstration: the same portable Bitcoin Markdown, turned into +**owned, shareable** Knowledge Assets on a DKG **mainnet** node, imported into a +Context Graph, **shared through Shared Working Memory**, verified by a second +peer, and reasoned over by a Hermes agent. + +**Cost model — read this first.** +- Steps 1–7 are the default demo and **spend nothing**. Working Memory (WM) and + Shared Working Memory (SWM) are free. +- Step 8 — **Verifiable Memory (VM) promotion — is deferred.** It spends real + **TRAC + native gas, irreversibly**, on mainnet (no faucet). It is *not* part + of the default run; the operator triggers it deliberately, after confirming + funds. It is documented here, clearly marked, and only its UALs/txHashes are + recorded *if and when* it is actually run. +- This runbook is **operator-run, never CI.** The free offline correctness gate + (`pnpm --filter @origintrail-official/dkg-okf test`) is the CI gate; it never + touches a node and is green before any of this is attempted. + +Throughout: **never imply on-chain verification for WM/SWM data.** State which +memory layer each piece of evidence lives in. + +Conventions: `$CG=okf-crypto-bitcoin`, `$BUNDLE=packages/okf/test/fixtures/crypto_bitcoin`. + +--- + +## 0. Offline correctness gate (free, no node) + +```bash +pnpm --filter @origintrail-official/dkg-okf test # 60+ golden/edge/round-trip tests +dkg okf import $BUNDLE --dry-run --print-nquads # deterministic mapping, no node +``` + +Expect: 5 Knowledge Assets, 3 reserved `index.md` skipped, the reconstructed edge +graph, both citation styles, byte-stable N-Quads. Run it twice — identical output. + +## 1. Launch the node on mainnet + +```bash +dkg init # choose a mainnet blockchain (e.g. 
mainnet-base / mainnet-gnosis / mainnet-neuroweb) +dkg start +dkg status # daemon PID, version, listening port +dkg doctor # health checks +``` + +**Verify the node is actually on mainnet, not testnet/devnet, before going +further** — confirm the active network/chain in the printed config / `dkg doctor` +output. `edge` is the default role. + +## 2. Attach a Hermes agent + +```bash +dkg hermes setup # configure the Hermes-runtime agent bound to this node +dkg hermes # run it +``` + +Confirm the acting agent identity (this is the agent the shared bundle is +*proposed to* in step 6): + +```bash +curl -s localhost:<port>/api/agent/identity # → { agentAddress, agentDid, name, framework, peerId } +``` + +Record the `agentAddress`. + +## 3. Import the bundle into a Context Graph (Working Memory — free) + +```bash +dkg okf import $BUNDLE --context-graph-id $CG --create-context-graph +``` + +Import defaults to **Working Memory** — free, private, reversible. Expect the +summary: `5 concepts, 3 reserved skipped, 101 triples, 11 links resolved, 0 +broken, 10 citations`, plus the deterministic `urn:okf:*` IRIs and the +`memoryLayer: "WM"` note. + +Confirm the 5 assets and the reconstructed edges are present **in WM** via SPARQL +(`/api/query`, `view: working-memory`, `agentAddress` required for WM): + +```bash +curl -s localhost:<port>/api/query -H 'content-type: application/json' -d '{ + "contextGraphId":"okf-crypto-bitcoin", + "view":"working-memory", + "agentAddress":"<agentAddress>", + "sparql":"SELECT ?s ?o WHERE { ?s <http://schema.org/mentions> ?o }" +}' +``` + +Expect the 11 `schema:mentions` edges (dataset→4 tables, transactions→4, +inputs→3). `tables/outputs` has none — its only links sit inside backticks +(CommonMark: literal text). All evidence here is **WM (private, free, no on-chain +verification).** + +## 4.
Finalize and share to Shared Working Memory (free) + +```bash +dkg okf import $BUNDLE --context-graph-id $CG --share +``` + +`--share` seals each asset (`wm/finalize`) and advances it (`swm/share`, +`entities: "all"`). SWM is free, gossip-replicated and team-visible — this is the +moment the Bitcoin bundle becomes a **shared Context Graph** other agents can +reach. The assets are now sealed and *publish-ready*, but **publishing waits** +(step 8). + +Confirm the same assets/edges in the `shared-working-memory` view: + +```bash +# 11 cross-table edges +dkg query $CG -q 'SELECT ?s ?o WHERE { ?s <http://schema.org/mentions> ?o }' --include-shared-memory + +# Exactly 5 concepts. Count subjects that have an rdf:type — only the 5 concepts +# do. A naive `STRSTARTS(STR(?s),"urn:okf:")` count returns ~19 because the +# daemon skolemises each concept's dkg:hasSection blank nodes into +# `urn:okf:.../.well-known/genid/...` subjects, which also match the prefix. +dkg query $CG -q 'SELECT (COUNT(DISTINCT ?s) AS ?n) WHERE { ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?t FILTER(STRSTARTS(STR(?s), "urn:okf:")) }' --include-shared-memory +``` + +Evidence here is **SWM (shared, free, TTL-bounded, no on-chain verification).** + +## 5. Issue a join invitation and have a second peer verify it + +```bash +# Curator side — invite a peer (the V10 invite is the pair \n): +dkg context-graph invite $CG +# For a curated graph, allow the joining agent: +dkg context-graph add-agent $CG +# …or have the joiner request-join and the curator approve-join. +``` + +From a **second node/agent**: + +```bash +dkg subscribe okf-crypto-bitcoin # subscribe + catch up +dkg query okf-crypto-bitcoin -q 'SELECT ?s ?o WHERE { ?s <http://schema.org/mentions> ?o }' --include-shared-memory +``` + +Record the invite and the second peer's query result — the shared Context Graph +is independently checkable by another peer, all in **free SWM**. + +## 6.
Hermes agent reasons over the shared knowledge + +Have the Hermes agent answer a natural-language question through its `dkg_*` +tools over the `shared-working-memory` view, e.g. *"what does the `transactions` +table reference?"*: + +```sparql +SELECT ?o WHERE { <urn:okf:tables/transactions> <http://schema.org/mentions> ?o } +``` + +Expect the four targets: `urn:okf:datasets/crypto_bitcoin`, `urn:okf:tables/blocks`, +`urn:okf:tables/inputs`, `urn:okf:tables/outputs`. Capture the transcript — an +agent consuming OKF-derived, provenance-bearing knowledge from the shared graph. + +## 7. Recreated, visibly + +Regenerate the graph from the shared Context Graph and compare it to Google's own +`viz.html`: + +```bash +dkg okf export okf-crypto-bitcoin ./out --view shared-working-memory +``` + +`export` is the clean inverse of `import` (graph-faithful). Confirm the +regenerated bundle's `schema:mentions` structure matches the dataset→tables and +cross-table edges in the bundle's own +`okf/bundles/crypto_bitcoin/viz.html`. (`packages/graph-viz` can render the graph +view directly.) + +**At this point the deliverable is complete: a shared, peer-verified, +agent-queried Context Graph in SWM. Nothing has been spent.** + +--- + +## 8. VM promotion — staged, but it waits (DEFERRED; real TRAC + gas) + +> **Do not run this as part of the demo.** The assets are sealed and +> publish-ready in SWM, so promotion to Verifiable Memory is one step away — held +> until the operator deliberately chooses to spend. + +When (and only when) the operator chooses to promote: + +1. **Confirm funding first** — on mainnet there is **no faucet**: + ```bash + dkg wallet # or: curl -s localhost:<port>/api/wallets/balances + ``` + Abort if TRAC + native gas are insufficient. +2. **Publish ONE asset first** to observe real cost and validate the on-chain + path (the dataset). The first publish transparently registers the Context + Graph on-chain — expect gas/TRAC: + ```bash + # vm/publish for a single KA (gate behind explicit confirmation in your runbook).
+ # The KA name is the concept ID with '/' mapped to '__' (asset names can't contain '/'). + curl -s localhost:<port>/api/knowledge-assets/datasets__crypto_bitcoin/vm/publish \ + -H 'content-type: application/json' -d '{"contextGraphId":"okf-crypto-bitcoin"}' + ``` + Record the returned UAL (`did:dkg:<chain>/<contract>/<id>`) and + `txHash`. +3. **Then publish the rest** and re-verify via the `verifiable-memory` view. + +| Asset | UAL | txHash | +|---|---|---| +| `datasets/crypto_bitcoin` | _(record if run)_ | _(record if run)_ | +| `tables/blocks` | | | +| `tables/transactions` | | | +| `tables/inputs` | | | +| `tables/outputs` | | | + +Until promoted, the demo's deliverable is the **shared, peer-verified, +agent-queried** Context Graph in SWM. Only VM data carries on-chain verification; +WM/SWM data never does. + +--- + +## Evidence log (fill in during the run) + +- Node network/chain confirmed mainnet: ______ +- Hermes `agentAddress`: ______ +- Import summary (WM): 5 concepts / 101 triples / 11 edges / 0 broken / 10 citations +- WM SPARQL edge count: ______ +- SWM SPARQL edge count: ______ +- Join invitation: ______ +- Second peer query result: ______ +- Hermes agent transcript: ______ +- Regenerated graph vs `viz.html`: ______ +- (Deferred) VM UALs / txHashes: _(only if step 8 was run)_ diff --git a/packages/okf/README.md b/packages/okf/README.md new file mode 100644 index 000000000..3bc0b0b26 --- /dev/null +++ b/packages/okf/README.md @@ -0,0 +1,55 @@ +# @origintrail-official/dkg-okf + +Deterministic **Google Open Knowledge Format (OKF) → DKG** mapper. + +OKF standardises *how* knowledge is written and exchanged — portable Markdown + +YAML frontmatter + untyped cross-links — but ships **no** verification, +provenance or ownership layer (OKF SPEC §1, §10). This package is the bridge: it +turns an OKF bundle into owned, verifiable RDF **Knowledge Assets**, +reconstructing the bundle's cross-concept link graph.
The same portable Markdown, +now cryptographically provenanced, owned and shareable across agents. + +Pure, no LLM, no network — the same bundle always yields **identical triples and +IRIs**. + +## Use it from the CLI + +```bash +# Deterministic, offline — prints the mapping summary, never touches a node +dkg okf import ./bundle --dry-run --print-nquads + +# Import into a Context Graph (defaults to private Working Memory) +dkg okf import ./bundle --context-graph-id my-graph --create-context-graph + +# Finalize + share to Shared Working Memory (free, team-visible) +dkg okf import ./bundle --context-graph-id my-graph --share + +# Serialise a Context Graph back into a conformant OKF bundle (clean inverse) +dkg okf export my-graph ./out +``` + +Import defaults to **Working Memory** and never publishes to Verifiable Memory. +`--share` advances to **Shared Working Memory**. On-chain VM promotion (real +TRAC) is a separate, explicitly-gated operator step — see `DEMO.md`. + +## Use it as a library + +```ts +import { loadBundleDir, importBundle, quadsToNQuads } from '@origintrail-official/dkg-okf'; + +const result = importBundle(loadBundleDir('./bundle')); +console.log(result.concepts.length, 'Knowledge Assets'); +console.log(quadsToNQuads(result.quads)); // canonical, byte-stable N-Quads +``` + +## Docs + +- **`CONTEXT.md`** — Language / Relationships / Flagged ambiguities. +- **`docs/adr/0005-okf-rdf-mapping.md`** — the locked OKF→RDF mapping and the + reuse-vs-fork decision. +- **`docs/integrations/okf.md`** — the full-lifecycle article. +- **`DEMO.md`** — the live mainnet runbook (WM → SWM → join invitation → Hermes + agent → rendered graph; VM promotion held as a deferred capstone). + +License: Apache-2.0. The vendored `test/fixtures/crypto_bitcoin/` bundle is © Google +LLC, Apache-2.0 (see its `ATTRIBUTION`). 
diff --git a/packages/okf/integration.okf.json b/packages/okf/integration.okf.json new file mode 100644 index 000000000..9db6a0eac --- /dev/null +++ b/packages/okf/integration.okf.json @@ -0,0 +1,31 @@ +{ + "slug": "okf", + "name": "Open Knowledge Format (OKF) importer", + "description": "Ingest a Google Open Knowledge Format bundle (portable Markdown + YAML frontmatter + untyped cross-links) into the DKG as deterministic, owned, verifiable Knowledge Assets, reconstructing the bundle's cross-concept link graph. Provides the trust-and-permanence backend OKF deliberately omits.", + "category": ["data-import", "knowledge-management", "interoperability"], + "maintainer": { "github": "@OriginTrail/core-developers", "name": "OriginTrail Core Developers" }, + "repo": "https://github.com/OriginTrail/dkg", + "commit": "778d6b8e01adeba357824d19893f8c28c10e6305", + "license": "Apache-2.0", + "requiresDkgNodeVersion": ">=10.0.0-rc.16", + "memoryLayers": ["WM", "SWM"], + "v10PrimitivesUsed": ["ContextGraph", "KnowledgeAsset", "WorkingMemory", "SharedWorkingMemory", "Assertion"], + "publicInterfacesUsed": ["cli"], + "targetAgents": ["hermes", "elizaos", "openclaw"], + "install": { + "kind": "cli", + "package": "@origintrail-official/dkg", + "version": "10.0.0-rc.16", + "binary": "dkg", + "usageHint": "dkg okf import <bundle-dir> --context-graph-id <id>\ndkg okf import <bundle-dir> --dry-run --print-nquads\ndkg okf import <bundle-dir> --context-graph-id <id> --share\ndkg okf export <context-graph-id> <out-dir>" + }, + "security": { + "networkEgress": [], + "writeAuthority": ["working-memory", "shared-working-memory"], + "credentialsHandled": ["local DKG node auth token"], + "notes": "Reads a local OKF bundle directory and talks only to the local DKG node API. No external network egress. Import defaults to private Working Memory and never publishes to Verifiable Memory; --share advances assets to free, team-visible Shared Working Memory. On-chain VM promotion (real TRAC) is a separate, explicitly-gated step, not performed by this integration."
+ }, + "trustTier": "community", + "designBrief": "https://github.com/OriginTrail/dkg/blob/main/docs/integrations/okf.md", + "fitNotes": "OKF standardises how knowledge is written and exchanged but ships no verification, provenance or ownership layer (OKF SPEC §1, §10). This integration makes the DKG the trust-and-permanence backend for OKF: the same portable Markdown, now cryptographically provenanced, owned and shareable across agents. The OKF→RDF mapping is pure and deterministic (no LLM); the same bundle always yields identical triples and IRIs, so independent importers converge on the same graph." +} diff --git a/packages/okf/package.json b/packages/okf/package.json new file mode 100644 index 000000000..fa1e3a186 --- /dev/null +++ b/packages/okf/package.json @@ -0,0 +1,38 @@ +{ + "name": "@origintrail-official/dkg-okf", + "version": "10.0.0-rc.16", + "type": "module", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "scripts": { + "build": "tsc", + "test": "vitest run", + "test:coverage": "vitest run --coverage", + "clean": "rm -rf dist tsconfig.tsbuildinfo" + }, + "dependencies": { + "js-yaml": "^4.1.1", + "mdast-util-from-markdown": "^2.0.3", + "mdast-util-to-string": "^4.0.0" + }, + "devDependencies": { + "@types/js-yaml": "^4.0.9", + "@types/mdast": "^4.0.4", + "@vitest/coverage-v8": "^4.0.18", + "vitest": "^4.0.18" + }, + "publishConfig": { + "access": "public" + }, + "files": [ + "dist", + "README.md", + "LICENSE" + ], + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "https://github.com/OriginTrail/dkg.git", + "directory": "packages/okf" + } +} diff --git a/packages/okf/src/bundle.ts b/packages/okf/src/bundle.ts new file mode 100644 index 000000000..e8caeb93b --- /dev/null +++ b/packages/okf/src/bundle.ts @@ -0,0 +1,138 @@ +/** + * Bundle-aware OKF import (the two passes from ADR 0005 / SPEC §3–§5). + * + * Pass 1 — index the bundle: classify concept vs. 
reserved files, parse each + * concept's frontmatter, derive a deterministic subject IRI per concept ID, and + * read `okf_version` from the root `index.md` if present. + * + * Pass 2 — extract + link: map each concept (frontmatter + body) to quads and + * resolve its Markdown links against the Pass-1 map into untyped directed edges. + * + * Pure and deterministic: same `BundleFile[]` ⇒ identical quads and IRIs. No + * filesystem, no network — `loadBundleDir` (loader.ts) is the only I/O surface. + */ + +import { parseDocument, OkfDocumentError } from './document.js'; +import { mapConcept } from './mapping.js'; +import { + isConceptFile, + isReservedFile, + pathToConceptId, + conceptIdToIri, +} from './paths.js'; +import { DEFAULT_IRI_BASE, SCHEMA_MENTIONS } from './constants.js'; +import type { + BundleFile, + BundleImport, + ConceptMapping, + OkfDocument, + OkfMappingOptions, + OkfWarning, + Quad, +} from './types.js'; + +function byPath(a: BundleFile, b: BundleFile): number { + return a.path < b.path ? -1 : a.path > b.path ? 1 : 0; +} + +export function importBundle(files: BundleFile[], opts: OkfMappingOptions = {}): BundleImport { + const iriBase = opts.iriBase ?? DEFAULT_IRI_BASE; + const warnings: OkfWarning[] = []; + + // Deterministic ordering, independent of how the caller enumerated the tree. + const ordered = [...files].sort(byPath); + + const reservedSkipped = ordered.filter((f) => isReservedFile(f.path)).map((f) => f.path); + + // --- Pass 1: index --- + const docs: OkfDocument[] = []; + const iriByConceptId: Record<string, string> = {}; + const typeByConceptId: Record<string, string | undefined> = {}; + for (const f of ordered) { + if (!isConceptFile(f.path)) continue; + let doc: OkfDocument; + try { + doc = parseDocument(f.path, f.content); + } catch (err) { + // Unparseable frontmatter is a §9 conformance issue; for import we skip the + // concept with a warning rather than aborting the whole bundle.
+ warnings.push({ + conceptId: pathToConceptId(f.path), + code: 'parse', + message: err instanceof OkfDocumentError ? err.message : String(err), + }); + continue; + } + docs.push(doc); + iriByConceptId[doc.conceptId] = conceptIdToIri(doc.conceptId, iriBase); + typeByConceptId[doc.conceptId] = + doc.frontmatter.type != null ? String(doc.frontmatter.type) : undefined; + const type = doc.frontmatter.type; + if (type === undefined || type === null || String(type).trim() === '') { + warnings.push({ + conceptId: doc.conceptId, + code: 'missing-type', + message: 'concept has no non-empty `type` (a hard §9 conformance requirement; the bundle will not validate)', + }); + } + } + + // `okf_version` may be declared only in the bundle-root `index.md` (SPEC §11). + let okfVersion: string | null = null; + const rootIndex = ordered.find((f) => f.path === 'index.md'); + if (rootIndex) { + try { + const parsed = parseDocument('index.md', rootIndex.content); + const v = parsed.frontmatter.okf_version; + if (v !== undefined && v !== null) okfVersion = String(v); + } catch { + // Reserved files need no frontmatter; ignore. + } + } + + // --- Pass 2: extract + link --- + const exists = (id: string): boolean => + Object.prototype.hasOwnProperty.call(iriByConceptId, id); + // Opt-in: retype cross-concept edges by their endpoints' OKF `type` pair + // (deterministic; default keeps every edge as schema:mentions). + const typeRelations = opts.typeRelations ?? 
[]; + const relationPredicate = (fromType?: string, toType?: string): string | undefined => { + if (typeRelations.length === 0 || fromType == null || toType == null) return undefined; + return typeRelations.find((r) => r.from === fromType && r.to === toType)?.predicate; + }; + + const concepts: ConceptMapping[] = []; + const quads: Quad[] = []; + for (const doc of docs) { + const mapping = mapConcept(doc, iriByConceptId[doc.conceptId], exists, opts); + if (typeRelations.length > 0) { + const fromType = typeByConceptId[doc.conceptId]; + for (const q of mapping.quads) { + if (q.predicate !== SCHEMA_MENTIONS || !q.object.startsWith(iriBase)) continue; + const targetId = q.object.slice(iriBase.length); + const pred = relationPredicate(fromType, typeByConceptId[targetId]); + if (pred) q.predicate = pred; + } + } + concepts.push(mapping); + quads.push(...mapping.quads); + for (const bl of mapping.brokenLinks) { + warnings.push({ + conceptId: doc.conceptId, + code: 'broken-link', + message: `unresolved cross-link "${bl.raw}" (not in bundle — not an error, SPEC §5.3/§9)`, + }); + } + if (!opts.includeCodeSpanLinks) { + for (const cs of mapping.codeSpanLinks) { + warnings.push({ + conceptId: doc.conceptId, + code: 'code-span-link', + message: `link "${cs.raw}" sits inside an inline code span — treated as literal text per CommonMark, not an edge`, + }); + } + } + } + + return { okfVersion, iriByConceptId, concepts, reservedSkipped, quads, warnings }; +} diff --git a/packages/okf/src/constants.ts b/packages/okf/src/constants.ts new file mode 100644 index 000000000..4375acd7d --- /dev/null +++ b/packages/okf/src/constants.ts @@ -0,0 +1,60 @@ +/** + * RDF predicate / namespace constants for the OKF → DKG mapping. 
+ * + * These deliberately mirror the vocabulary already minted by the node's + * deterministic Markdown extractor (`packages/cli/src/extraction/ + * markdown-extractor.ts`) so that an OKF import and a natively-ingested + * Markdown corpus converge on the same graph shape and join naturally. + * The extractor's predicate strings are private module constants there; + * we re-declare the same literal IRIs here (rather than import across the + * `cli → okf` boundary, which would be a dependency cycle) and pin the + * convergence with a test. + * + * The OKF-specific deltas — and the rationale for each — are recorded in + * `docs/adr/0005-okf-rdf-mapping.md` and `CONTEXT.md`. + */ + +// --- RDF / XSD --- +export const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; +export const XSD_DATE_TIME = 'http://www.w3.org/2001/XMLSchema#dateTime'; +export const XSD_BOOLEAN = 'http://www.w3.org/2001/XMLSchema#boolean'; +export const XSD_INTEGER = 'http://www.w3.org/2001/XMLSchema#integer'; +export const XSD_DECIMAL = 'http://www.w3.org/2001/XMLSchema#decimal'; + +// --- schema.org (converges with the extractor) --- +export const SCHEMA_NS = 'http://schema.org/'; +export const SCHEMA_NAME = 'http://schema.org/name'; +export const SCHEMA_DESCRIPTION = 'http://schema.org/description'; +export const SCHEMA_KEYWORDS = 'http://schema.org/keywords'; +/** Untyped directed concept→concept edge. Converges with the extractor's `[[wikilink]]` predicate. */ +export const SCHEMA_MENTIONS = 'http://schema.org/mentions'; +/** OKF `timestamp` = last-modified time (SPEC §4.1). */ +export const SCHEMA_DATE_MODIFIED = 'http://schema.org/dateModified'; +/** OKF `resource` = canonical URI of the underlying asset (SPEC §4.1). */ +export const SCHEMA_URL = 'http://schema.org/url'; +/** `# Citations` links (SPEC §8) — semantically distinct from concept edges. */ +export const SCHEMA_CITATION = 'http://schema.org/citation'; +/** Containment edge — opt-in target for type-pair relations (e.g. 
dataset→table). */ +export const SCHEMA_HAS_PART = 'http://schema.org/hasPart'; +/** Optional folder hierarchy (off by default — see ADR 0005). */ +export const SCHEMA_IS_PART_OF = 'http://schema.org/isPartOf'; + +// --- DKG ontology (converges with the extractor) --- +export const DKG_HAS_SECTION = 'http://dkg.io/ontology/hasSection'; + +/** + * Infix for deterministic skolem IRIs minted for `dkg:hasSection` nodes. + * The daemon rejects blank-node RDF *objects* ("RDF object must be a quoted + * literal term or absolute IRI"), so section nodes are skolemized into + * concept-scoped IRIs `/.well-known/genid/okfsec__` — the + * same `.well-known/genid/` scheme the node uses internally, so stored data is + * identical whether or not the node would have skolemized them itself. Export + * uses this infix to exclude section nodes from being rebuilt as concepts. + */ +export const SECTION_GENID_INFIX = '/.well-known/genid/'; + +/** Default IRI namespace for concept Knowledge Asset subjects. */ +export const DEFAULT_IRI_BASE = 'urn:okf:'; + +/** The OKF version this consumer targets (SPEC is v0.1 draft). */ +export const OKF_TARGET_VERSION = '0.1'; diff --git a/packages/okf/src/document.ts b/packages/okf/src/document.ts new file mode 100644 index 000000000..bbc048006 --- /dev/null +++ b/packages/okf/src/document.ts @@ -0,0 +1,82 @@ +/** + * OKF concept document parsing — the frontmatter/body split. 
+ * + * This mirrors the reference agent's `OKFDocument.parse` + * (`okf/src/reference_agent/bundle/document.py`) byte-for-byte in behaviour so + * our IRIs, triples and round-trips agree with the canonical producer: + * + * - frontmatter is delimited by `---` on its own line at the very start and a + * closing `---` on its own line; matching is on the *stripped* line, so a + * `---` with trailing whitespace still closes the block; + * - a file with no leading `---` is treated as all-body (this is how + * `index.md` / `log.md` parse cleanly — they carry no frontmatter, §6/§7); + * - exactly one leading newline is stripped from the body. + * + * NOTE on requiredness: the reference *producer* enforces four keys + * (`REQUIRED_FRONTMATTER_KEYS = ("type","title","description","timestamp")`), + * but SPEC §9 binds *consumers* to require only a non-empty `type`. We are a + * consumer: parsing never enforces the producer's four keys. `validation.ts` + * applies the §9 consumer rule. + */ + +import { load as loadYaml } from 'js-yaml'; +import type { OkfDocument } from './types.js'; +import { pathToConceptId } from './paths.js'; + +const DELIM = '---'; + +export class OkfDocumentError extends Error { + constructor(message: string) { + super(message); + this.name = 'OkfDocumentError'; + } +} + +/** Python `str.splitlines()` semantics: split on \r\n, \r, or \n. */ +function splitLines(text: string): string[] { + return text.split(/\r\n|\r|\n/); +} + +/** + * Parse a concept file's raw text into `{ conceptId, segments, frontmatter, body }`. + * `bundlePath` is the bundle-relative POSIX path (e.g. `tables/blocks.md`). + */ +export function parseDocument(bundlePath: string, text: string): OkfDocument { + const conceptId = pathToConceptId(bundlePath); + const segments = conceptId.split('/'); + + const lines = splitLines(text); + if (lines.length === 0 || lines[0].trim() !== DELIM) { + // No frontmatter block — entire file is body. 
+ return { conceptId, segments, frontmatter: {}, body: text }; + } + + let endIdx = -1; + for (let i = 1; i < lines.length; i++) { + if (lines[i].trim() === DELIM) { + endIdx = i; + break; + } + } + if (endIdx === -1) { + throw new OkfDocumentError(`Unterminated YAML frontmatter block in ${bundlePath}`); + } + + const fmText = lines.slice(1, endIdx).join('\n'); + let parsed: unknown; + try { + parsed = loadYaml(fmText) ?? {}; + } catch (err) { + throw new OkfDocumentError( + `Invalid YAML frontmatter in ${bundlePath}: ${err instanceof Error ? err.message : String(err)}`, + ); + } + if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) { + throw new OkfDocumentError(`Frontmatter must be a YAML mapping in ${bundlePath}`); + } + + let body = lines.slice(endIdx + 1).join('\n'); + if (body.startsWith('\n')) body = body.slice(1); + + return { conceptId, segments, frontmatter: parsed as Record<string, unknown>, body }; +} diff --git a/packages/okf/src/export.ts b/packages/okf/src/export.ts new file mode 100644 index 000000000..332979322 --- /dev/null +++ b/packages/okf/src/export.ts @@ -0,0 +1,225 @@ +/** + * The clean inverse of the importer (ADR 0005): reconstruct a conformant OKF + * bundle from imported Knowledge Assets. Designed so that + * + * import(bundle) → exportBundle(...) → import(...) + * + * reproduces an *equivalent graph* (same concepts, frontmatter triples, untyped + * edges and citations). It is graph-faithful, not byte-faithful: free-form prose + * is not recoverable from triples, so bodies are regenerated structurally (a + * `# Related` section of links + a `# Citations` section). Section headings from + * the original body are presentational and are not reconstructed — round-trip + * equivalence is asserted over the semantic (non-`hasSection`) quad set. + * + * This module is pure: it consumes a `BundleImport` and returns `BundleFile[]`.
/**
 * A whole literal term: quoted lexical form plus an optional `^^<datatype>` or
 * `@lang` suffix. Language subtags may contain digits (e.g. `@de-1996`).
 */
const LITERAL_RE = /^("(?:\\.|[^"\\])*")(?:\^\^<[^>]*>|@[A-Za-z]+(?:-[A-Za-z0-9]+)*)?$/;
/** A typed literal term; group 1 is the quoted lexical form, group 2 the datatype IRI. */
const TYPED_LITERAL_RE = /^("(?:\\.|[^"\\])*")\^\^<([^>]*)>$/;

/** Lexical form accepted for `xsd:integer` recovery: optional sign + digits only. */
const INTEGER_LEXICAL_RE = /^[+-]?\d+$/;
/**
 * Lexical form accepted for `xsd:decimal` recovery. An exponent is tolerated
 * because the importer serialises non-integer numbers with `String(n)`, which
 * can produce forms such as `1e-7`.
 */
const DECIMAL_LEXICAL_RE = /^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/;

/** Single-character escapes whose meaning differs from the escaped character itself. */
const SIMPLE_ESCAPES: Readonly<Record<string, string>> = {
  t: '\t',
  b: '\b',
  n: '\n',
  r: '\r',
  f: '\f',
};

/**
 * Decode a double-quoted lexical form. The importer writes literals with
 * `JSON.stringify`, so `JSON.parse` is tried first; terms that came from
 * elsewhere may use escapes JSON rejects (`\UXXXXXXXX`, `\'`) or contain raw
 * control characters, so those fall back to a tolerant unescape instead of
 * throwing.
 */
function unquoteLiteral(quoted: string): string {
  try {
    const parsed: unknown = JSON.parse(quoted);
    if (typeof parsed === 'string') return parsed;
  } catch {
    // Not a JSON string literal — decode it by hand below.
  }
  return quoted
    .slice(1, -1)
    .replace(/\\(u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}|[\s\S])/g, (whole: string, esc: string) => {
      if (esc.length > 1) {
        const codePoint = parseInt(esc.slice(1), 16);
        // Leave out-of-range escapes untouched rather than throwing a RangeError.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      }
      return SIMPLE_ESCAPES[esc] ?? esc;
    });
}

/** Lexical value of a literal term, or `null` if the term is an IRI / blank node. */
function literalValue(term: string): string | null {
  if (!term.startsWith('"')) return null;
  const m = LITERAL_RE.exec(term);
  return m ? unquoteLiteral(m[1]) : null;
}

/**
 * Recover the native JS value of a known typed scalar literal so producer-defined
 * numeric/boolean keys survive the round-trip. Import maps `count: 3` →
 * `"3"^^xsd:integer`; without this, export would emit YAML string `"3"`, which
 * re-imports as a plain string literal (datatype lost).
 *
 * Returns `null` — so callers fall back to lexical handling — for plain
 * literals, unknown datatypes, and lexical forms that cannot be represented
 * faithfully: empty/whitespace or non-decimal text (which `Number()` would
 * coerce to `0` or parse as hex) and integers outside the safe-integer range
 * (which would be silently rounded).
 */
function typedScalarValue(term: string): number | boolean | null {
  const m = TYPED_LITERAL_RE.exec(term);
  if (!m) return null;
  const lexical = unquoteLiteral(m[1]);
  switch (m[2]) {
    case XSD_INTEGER: {
      if (!INTEGER_LEXICAL_RE.test(lexical)) return null;
      const n = Number(lexical);
      return Number.isSafeInteger(n) ? n : null;
    }
    case XSD_DECIMAL: {
      if (!DECIMAL_LEXICAL_RE.test(lexical)) return null;
      const n = Number(lexical);
      return Number.isFinite(n) ? n : null;
    }
    case XSD_BOOLEAN:
      return lexical === 'true' ? true : lexical === 'false' ? false : null;
    default:
      return null;
  }
}
/** Map an rdf:type object IRI back to a writable `type` value. */
function typeValue(object: string): string {
  return object.startsWith(SCHEMA_NS) ? object.slice(SCHEMA_NS.length) : object;
}

/** Strip `iriBase` from a concept IRI; `null` when the IRI is not under that base. */
function conceptIdFromIri(iri: string, iriBase: string): string | null {
  return iri.startsWith(iriBase) ? iri.slice(iriBase.length) : null;
}

/**
 * Rebuild one concept file (YAML frontmatter + regenerated body) from the quads
 * whose subject is the concept IRI.
 *
 * Frontmatter state is held in `Map`s and materialised with
 * `Object.fromEntries`, so producer-defined keys that collide with
 * `Object.prototype` members (`constructor`, `toString`, …) are treated like
 * any other key instead of being corrupted or dropped.
 *
 * @param c Concept mapping produced by the importer (or rebuilt from a graph).
 * @param iriBase IRI namespace used to turn edge objects back into concept IDs.
 * @returns The `<conceptId>.md` bundle file.
 */
function reconstructConcept(c: ConceptMapping, iriBase: string): BundleFile {
  const core = new Map<string, unknown>();
  const extras = new Map<string, unknown>();
  const tags: string[] = [];
  const edges: string[] = [];
  const citations: string[] = [];

  for (const q of c.quads) {
    if (q.subject !== c.iri) continue; // skip section nodes (skolemized genid subjects)
    switch (q.predicate) {
      case RDF_TYPE:
        core.set('type', typeValue(q.object));
        break;
      case SCHEMA_NAME:
        core.set('title', literalValue(q.object) ?? q.object);
        break;
      case SCHEMA_DESCRIPTION:
        core.set('description', literalValue(q.object) ?? q.object);
        break;
      case SCHEMA_URL:
        // Import stores a non-IRI `resource` as a quoted literal; unwrap it so the
        // quotes do not leak into the exported YAML value.
        core.set('resource', literalValue(q.object) ?? q.object);
        break;
      case SCHEMA_DATE_MODIFIED:
        core.set('timestamp', literalValue(q.object) ?? q.object);
        break;
      case SCHEMA_KEYWORDS: {
        const v = literalValue(q.object);
        if (v !== null) tags.push(v);
        break;
      }
      case SCHEMA_MENTIONS:
      case SCHEMA_HAS_PART: {
        // Both are cross-concept edges → reconstruct as body links. OKF links are
        // untyped (SPEC §5.3), so a typed relation (e.g. hasPart) deliberately
        // exports as a plain link; the DKG-side typing is not OKF-expressible.
        const id = conceptIdFromIri(q.object, iriBase);
        if (id) edges.push(id);
        break;
      }
      case SCHEMA_CITATION:
        citations.push(literalValue(q.object) ?? q.object);
        break;
      default: {
        // Every predicate with a dedicated frontmatter key is handled by a case
        // above, so anything else in the schema namespace is a producer extra.
        if (!q.predicate.startsWith(SCHEMA_NS)) break;
        const key = q.predicate.slice(SCHEMA_NS.length);
        // Preserve known typed scalars (integer/decimal/boolean) as native
        // values so the datatype survives re-import; else fall back to lexical.
        const scalar = typedScalarValue(q.object);
        const v = scalar !== null ? scalar : (literalValue(q.object) ?? q.object);
        if (extras.has(key)) {
          const cur = extras.get(key);
          extras.set(key, Array.isArray(cur) ? [...cur, v] : [cur, v]);
        } else {
          extras.set(key, v);
        }
      }
    }
  }
  if (tags.length > 0) core.set('tags', tags);

  // Extras are merged last, so a colliding extra overrides the core value while
  // keeping the core key's position.
  const merged = new Map<string, unknown>([...core, ...extras]);

  // Stable key order: required/recommended first, then producer extras.
  const order = ['type', 'resource', 'title', 'description', 'tags', 'timestamp'];
  const pairs: Array<[string, unknown]> = [];
  for (const k of order) if (merged.has(k)) pairs.push([k, merged.get(k)]);
  for (const [k, v] of merged) if (!order.includes(k)) pairs.push([k, v]);
  const ordered: Record<string, unknown> = Object.fromEntries(pairs);

  const yaml = dumpYaml(ordered, { sortKeys: false, lineWidth: -1 }).trimEnd();

  const bodyParts: string[] = [];
  const uniqueEdges = [...new Set<string>(edges)].sort();
  if (uniqueEdges.length > 0) {
    bodyParts.push('# Related', '');
    for (const id of uniqueEdges) bodyParts.push(`* [${id}](/${id}.md)`);
    bodyParts.push('');
  }
  if (citations.length > 0) {
    bodyParts.push('# Citations', '');
    citations.forEach((url, i) => bodyParts.push(`[${i + 1}] [${url}](${url})`));
    bodyParts.push('');
  }

  return {
    path: `${c.conceptId}.md`,
    content: `---\n${yaml}\n---\n\n${bodyParts.join('\n')}`,
  };
}
/** Code-unit string comparator, so sort order never depends on implicit stringification. */
function compareStrings(a: string, b: string): number {
  return a < b ? -1 : a > b ? 1 : 0;
}

/**
 * Regenerate a directory `index.md` listing immediate child concepts/subdirs (SPEC §6).
 *
 * @param conceptIds Concept IDs (`/`-separated, no extension).
 * @returns One `index.md` per directory that contains at least one concept
 *   (directly or transitively), ordered by directory path. An empty input
 *   yields no files.
 */
function regenerateIndexes(conceptIds: string[]): BundleFile[] {
  // Entries are tagged `concept:<file>` / `dir:<name>` so a concept and a
  // directory sharing a name stay distinct and sort into separate groups.
  const childrenByDir = new Map<string, Set<string>>();
  const add = (dir: string, entry: string): void => {
    let bucket = childrenByDir.get(dir);
    if (bucket === undefined) {
      bucket = new Set<string>();
      childrenByDir.set(dir, bucket);
    }
    bucket.add(entry);
  };
  for (const id of conceptIds) {
    const parts = id.split('/');
    for (let i = 0; i < parts.length; i++) {
      const dir = parts.slice(0, i).join('/');
      const isLeaf = i === parts.length - 1;
      add(dir, isLeaf ? `concept:${parts[i]}.md` : `dir:${parts[i]}`);
    }
  }

  const files: BundleFile[] = [];
  for (const dir of [...childrenByDir.keys()].sort(compareStrings)) {
    const lines = ['# Index', ''];
    for (const e of [...(childrenByDir.get(dir) ?? [])].sort(compareStrings)) {
      if (e.startsWith('concept:')) {
        const name = e.slice('concept:'.length);
        lines.push(`* [${name.replace(/\.md$/, '')}](${name})`);
      } else {
        const name = e.slice('dir:'.length);
        lines.push(`* [${name}](${name}/index.md)`);
      }
    }
    lines.push('');
    files.push({ path: dir ? `${dir}/index.md` : 'index.md', content: lines.join('\n') });
  }
  return files;
}

/**
 * Serialise an imported bundle back into a conformant OKF `BundleFile[]`.
 *
 * @param imported Result of a bundle import; only `concepts` is read here.
 * @param opts Mapping options; `iriBase` must match the base used at import
 *   time for edges to resolve back to concept IDs.
 * @returns Concept files plus regenerated `index.md` files, sorted by path.
 */
export function exportBundle(imported: BundleImport, opts: OkfMappingOptions = {}): BundleFile[] {
  const iriBase = opts.iriBase ?? DEFAULT_IRI_BASE;
  const concepts = imported.concepts.map((c) => reconstructConcept(c, iriBase));
  const indexes = regenerateIndexes(imported.concepts.map((c) => c.conceptId));
  return [...concepts, ...indexes].sort((a, b) => compareStrings(a.path, b.path));
}
+ * + * Turns a portable OKF bundle (Markdown + YAML frontmatter + untyped cross-links) + * into owned, verifiable RDF Knowledge Assets, reconstructing the bundle's + * cross-concept link graph. Pure, no LLM, no network: same bundle ⇒ identical + * triples and IRIs. The `dkg okf` CLI command is a thin wrapper over this. + * + * See `CONTEXT.md` for the language/relationships/flagged-ambiguities and + * `docs/adr/0005-okf-rdf-mapping.md` for the locked OKF→RDF mapping. + */ + +export type { + Quad, + OkfDocument, + OkfLink, + OkfCitation, + OkfMappingOptions, + TypeRelation, + ConceptMapping, + OkfWarning, + BundleImport, + BundleFile, + ConformanceReport, +} from './types.js'; + +export * from './constants.js'; + +export { + RESERVED_FILENAMES, + isValidSegment, + basename, + isReservedFile, + isConceptFile, + pathToConceptId, + conceptIdToIri, + conceptIdToKaName, + resolveLinkTarget, +} from './paths.js'; + +export { parseDocument, OkfDocumentError } from './document.js'; +export { frontmatterQuads, parseBody, mapConcept } from './mapping.js'; +export { importBundle } from './bundle.js'; +export { exportBundle } from './export.js'; +export { validateBundle } from './validation.js'; +export { quadsToNQuads } from './nquads.js'; +export { loadBundleDir, loadBundleDirWithReport } from './loader.js'; +export { isSafeIri, literalTerm, typedLiteralTerm } from './utils.js'; diff --git a/packages/okf/src/loader.ts b/packages/okf/src/loader.ts new file mode 100644 index 000000000..ed1dd4fa4 --- /dev/null +++ b/packages/okf/src/loader.ts @@ -0,0 +1,47 @@ +/** + * The package's only filesystem surface: read an OKF bundle directory into the + * in-memory `BundleFile[]` the pure mapper consumes. Kept separate so the mapper + * itself stays I/O-free and unit-testable in isolation. 
/**
 * Read every `.md` file below `dir` and return it under its bundle-relative
 * POSIX path.
 *
 * Symbolic links are never followed (entries are inspected with `lstat`): a
 * bundle could ship `secret.md -> ~/.dkg/auth.token`, and following it would
 * pull a local file into the graph. Use `loadBundleDirWithReport` to also learn
 * which symlinked entries were skipped.
 */
export function loadBundleDir(dir: string): BundleFile[] {
  const { files } = loadBundleDirWithReport(dir);
  return files;
}

/**
 * Same traversal as `loadBundleDir`, additionally reporting the
 * bundle-relative paths of the symlinked entries that were skipped.
 *
 * Entries of each directory are visited in sorted name order and
 * subdirectories are descended into as they are met.
 */
export function loadBundleDirWithReport(dir: string): {
  files: BundleFile[];
  skippedSymlinks: string[];
} {
  const files: BundleFile[] = [];
  const skippedSymlinks: string[] = [];

  // Absolute path → bundle-relative path with `/` separators on every platform.
  const toBundlePath = (absolute: string): string => relative(dir, absolute).split(sep).join('/');

  const visit = (folder: string): void => {
    const names = readdirSync(folder).sort();
    for (const name of names) {
      const target = join(folder, name);
      const info = lstatSync(target);
      if (info.isSymbolicLink()) {
        skippedSymlinks.push(toBundlePath(target));
        continue;
      }
      if (info.isDirectory()) {
        visit(target);
        continue;
      }
      if (!info.isFile() || !name.endsWith('.md')) continue;
      const path = toBundlePath(target);
      files.push({ path, content: readFileSync(target, 'utf8') });
    }
  };

  visit(dir);
  return { files, skippedSymlinks };
}
/** Bare `http(s)` URLs inside plain text (used for unlinked citations). */
const BARE_URL_RE = /https?:\/\/[^\s)<>"]+/g;
/** `[label](target)` written as literal text; group 1 is the target. */
const INLINE_LINK_RE = /\[[^\]]*\]\(([^)\s]+)\)/g;

/**
 * OKF `type` value → an rdf:type object IRI (raw, no angle brackets).
 * A value that already is a safe IRI is used as-is; anything else is
 * PascalCased into the schema namespace. Returns `null` when nothing usable
 * remains.
 */
function typeToIri(value: unknown): string | null {
  const s = String(value).trim();
  if (!s) return null;
  if (isSafeIri(s)) return s; // already a full IRI (e.g. `tag:…`, `https://…`)
  const pascal = pascalCase(s);
  return pascal ? SCHEMA_NS + pascal : null;
}

/** Wrap a scalar in a one-element array; arrays pass through unchanged. */
function toArray(value: unknown): unknown[] {
  return Array.isArray(value) ? value : [value];
}

/** ISO-8601 string for a `Date`, otherwise the value's string form. */
function dateToLexical(value: unknown): string {
  return value instanceof Date ? value.toISOString() : String(value);
}

/**
 * Producer-defined scalar/array values → object terms (typed where possible).
 *
 * - `null` / `undefined` produce no terms; arrays are flattened recursively;
 * - `Date` → `xsd:dateTime`, `boolean` → `xsd:boolean`;
 * - integral numbers → `xsd:integer`, written out in full digits (never in
 *   exponent notation, which is outside the `xsd:integer` lexical space);
 * - other finite numbers → `xsd:decimal`;
 * - `NaN` / `±Infinity` have no `xsd:decimal` representation, so they are kept
 *   as plain string literals rather than emitted as ill-typed literals;
 * - strings become an IRI term when they are a safe IRI, else a plain literal;
 * - anything else (nested mappings) is preserved as its JSON text.
 */
function valueToTerms(value: unknown): string[] {
  if (value === null || value === undefined) return [];
  if (Array.isArray(value)) return value.flatMap((item) => valueToTerms(item));
  if (value instanceof Date) return [typedLiteralTerm(value.toISOString(), XSD_DATE_TIME)];
  if (typeof value === 'boolean') return [typedLiteralTerm(String(value), XSD_BOOLEAN)];
  if (typeof value === 'number') {
    if (!Number.isFinite(value)) return [literalTerm(String(value))];
    if (Number.isInteger(value)) {
      // `String(1e21)` is "1e+21"; BigInt prints the full digit sequence.
      return [typedLiteralTerm(BigInt(value).toString(), XSD_INTEGER)];
    }
    return [typedLiteralTerm(String(value), XSD_DECIMAL)];
  }
  if (typeof value === 'string') {
    return [isSafeIri(value) ? value : literalTerm(value)];
  }
  return [literalTerm(JSON.stringify(value))];
}
/**
 * Map the YAML frontmatter to quads (SPEC §4.1; see the locked table in ADR 0005).
 *
 * @param iri Subject IRI of the concept.
 * @param frontmatter Parsed frontmatter mapping; `null`/`undefined` values are skipped.
 * @returns Quads in frontmatter key order. Keys without a dedicated mapping
 *   are preserved under the schema namespace with a camelCased local name.
 */
export function frontmatterQuads(iri: string, frontmatter: Record<string, unknown>): Quad[] {
  const out: Quad[] = [];
  for (const [key, value] of Object.entries(frontmatter)) {
    if (value === null || value === undefined) continue;
    switch (key) {
      case 'type': {
        const t = typeToIri(value);
        if (t) out.push({ subject: iri, predicate: RDF_TYPE, object: t });
        break;
      }
      case 'title':
        out.push({ subject: iri, predicate: SCHEMA_NAME, object: literalTerm(String(value)) });
        break;
      case 'description':
        out.push({
          subject: iri,
          predicate: SCHEMA_DESCRIPTION,
          object: literalTerm(String(value)),
        });
        break;
      case 'tags':
        // A single scalar tag is accepted as a one-element list.
        for (const tag of toArray(value)) {
          out.push({ subject: iri, predicate: SCHEMA_KEYWORDS, object: literalTerm(String(tag)) });
        }
        break;
      case 'timestamp':
        out.push({
          subject: iri,
          predicate: SCHEMA_DATE_MODIFIED,
          object: typedLiteralTerm(dateToLexical(value), XSD_DATE_TIME),
        });
        break;
      case 'resource': {
        // An IRI-shaped resource becomes an IRI term; anything else stays a literal.
        const r = String(value);
        out.push({
          subject: iri,
          predicate: SCHEMA_URL,
          object: isSafeIri(r) ? r : literalTerm(r),
        });
        break;
      }
      default: {
        // Producer-defined keys — preserved, never dropped (SPEC §4.1/§9).
        const predicate = SCHEMA_NS + camelCase(key);
        for (const term of valueToTerms(value)) {
          out.push({ subject: iri, predicate, object: term });
        }
      }
    }
  }
  return out;
}

interface ParsedBody {
  /** Text of every top-level heading, in document order. */
  headings: string[];
  /** Real Markdown links found outside the Citations section. */
  bodyLinks: string[];
  /** `[label](target)` patterns found inside inline code spans (outside Citations). */
  codeSpanHrefs: string[];
  /** Citations gathered from the `# Citations` section (both styles). */
  citations: OkfCitation[];
}

/** Depth-first walk collecting link nodes, inline-code nodes and text values below `node`. */
function collect(node: Nodes, links: Link[], codes: InlineCode[], texts: string[]): void {
  if (node.type === 'link') links.push(node);
  else if (node.type === 'inlineCode') codes.push(node);
  else if (node.type === 'text') texts.push((node as Text).value);
  if ('children' in node && Array.isArray(node.children)) {
    for (const child of node.children) collect(child as Nodes, links, codes, texts);
  }
}

/** Targets of every `[label](target)` pattern written literally inside `code`. */
function extractInlineLinkHrefs(code: string): string[] {
  const out: string[] = [];
  for (const m of code.matchAll(INLINE_LINK_RE)) out.push(m[1]);
  return out;
}

/**
 * Parse a concept body with a real Markdown AST.
 *
 * Only top-level nodes are walked for section tracking: a top-level heading
 * whose trimmed, lower-cased text is `citations` switches the following nodes
 * into citation mode until the next top-level heading. In citation mode every
 * link and every bare `http(s)` URL in text becomes a citation (text inside a
 * link label is scanned too, so callers that need distinct URLs should
 * de-duplicate); elsewhere links become `bodyLinks` and link-shaped text inside
 * inline code becomes `codeSpanHrefs`.
 */
export function parseBody(body: string): ParsedBody {
  const tree = fromMarkdown(body);
  const headings: string[] = [];
  const bodyLinks: string[] = [];
  const codeSpanHrefs: string[] = [];
  const citations: OkfCitation[] = [];
  let currentSection = '';

  for (const node of tree.children) {
    if (node.type === 'heading') {
      const text = mdToString(node);
      headings.push(text);
      currentSection = text.trim().toLowerCase();
      continue;
    }
    const inCitations = currentSection === 'citations';
    const links: Link[] = [];
    const codes: InlineCode[] = [];
    const texts: string[] = [];
    collect(node as Nodes, links, codes, texts);

    if (inCitations) {
      for (const l of links) {
        const label = mdToString(l).trim();
        citations.push(label ? { url: l.url, label } : { url: l.url });
      }
      for (const t of texts) {
        for (const m of t.matchAll(BARE_URL_RE)) citations.push({ url: m[0] });
      }
    } else {
      for (const l of links) bodyLinks.push(l.url);
      for (const c of codes) codeSpanHrefs.push(...extractInlineLinkHrefs(c.value));
    }
  }
  return { headings, bodyLinks, codeSpanHrefs, citations };
}
/**
 * Build the Knowledge Asset quads for one concept, together with structured
 * link and citation diagnostics.
 *
 * Quad order is: frontmatter, one `hasSection` + section-name pair per body
 * heading, `mentions` edges for resolved body links (then code-span links when
 * enabled), citations, and finally the optional folder-hierarchy triple.
 *
 * @param doc Parsed concept document.
 * @param iri Subject IRI for the concept.
 * @param conceptExists Tells whether a resolved link target is present in the
 *   bundle; a candidate that is not present is recorded as a broken link
 *   (warned by the caller, never fatal).
 * @param opts Mapping options (`iriBase`, `includeCodeSpanLinks`,
 *   `emitFolderHierarchy` are read here).
 */
export function mapConcept(
  doc: OkfDocument,
  iri: string,
  conceptExists: (conceptId: string) => boolean,
  opts: OkfMappingOptions = {},
): ConceptMapping {
  const iriBase = opts.iriBase ?? DEFAULT_IRI_BASE;
  const quads: Quad[] = frontmatterQuads(iri, doc.frontmatter).slice();
  const { headings, bodyLinks, codeSpanHrefs, citations: rawCitations } = parseBody(doc.body);

  // Every body heading is a section (OKF titles live in frontmatter, so H1s
  // count). Section nodes get deterministic, concept-scoped IRIs instead of
  // RDF blank nodes: the daemon rejects blank-node *objects*, so a blank
  // `hasSection` object would fail the first write on a strict node.
  for (let n = 0; n < headings.length; n += 1) {
    const sectionIri = `${iri}${SECTION_GENID_INFIX}okfsec_${sanitizeForBlank(doc.conceptId)}_${n}`;
    quads.push(
      { subject: iri, predicate: DKG_HAS_SECTION, object: sectionIri },
      { subject: sectionIri, predicate: SCHEMA_NAME, object: literalTerm(headings[n]) },
    );
  }

  const resolvedLinks: OkfLink[] = [];
  const brokenLinks: OkfLink[] = [];
  const codeSpanLinks: OkfLink[] = [];

  // One `mentions` edge per distinct target, however often it is linked.
  const linkedTargets = new Set<string>();
  const mention = (targetId: string): void => {
    if (linkedTargets.has(targetId)) return;
    linkedTargets.add(targetId);
    quads.push({
      subject: iri,
      predicate: SCHEMA_MENTIONS,
      object: conceptIdToIri(targetId, iriBase),
    });
  };

  for (const raw of bodyLinks) {
    const targetId = resolveLinkTarget(raw, doc.conceptId);
    // No candidate → external URL / anchor / escapes root: not a concept edge.
    if (!targetId) continue;
    if (conceptExists(targetId)) {
      resolvedLinks.push({ raw, targetConceptId: targetId, inCodeSpan: false });
      mention(targetId);
    } else {
      // Resolved to a bundle path that doesn't exist → broken (SPEC §5.3/§9).
      brokenLinks.push({ raw, targetConceptId: null, inCodeSpan: false });
    }
  }

  for (const raw of codeSpanHrefs) {
    const targetId = resolveLinkTarget(raw, doc.conceptId);
    const exists = targetId ? conceptExists(targetId) : false;
    const link: OkfLink = { raw, targetConceptId: exists ? targetId : null, inCodeSpan: true };
    codeSpanLinks.push(link);
    // Code-span links are always reported, but only become edges on opt-in.
    if (opts.includeCodeSpanLinks && exists && targetId) {
      resolvedLinks.push(link);
      mention(targetId);
    }
  }

  // Keep the first citation seen per URL.
  const citations: OkfCitation[] = [];
  const citedUrls = new Set<string>();
  for (const citation of rawCitations) {
    if (citedUrls.has(citation.url)) continue;
    citedUrls.add(citation.url);
    citations.push(citation);
    const object = isSafeIri(citation.url) ? citation.url : literalTerm(citation.url);
    quads.push({ subject: iri, predicate: SCHEMA_CITATION, object });
  }

  if (opts.emitFolderHierarchy && doc.segments.length > 1) {
    const parentId = doc.segments.slice(0, -1).join('/');
    quads.push({
      subject: iri,
      predicate: SCHEMA_IS_PART_OF,
      object: conceptIdToIri(parentId, iriBase),
    });
  }

  return {
    conceptId: doc.conceptId,
    iri,
    quads,
    resolvedLinks,
    brokenLinks,
    codeSpanLinks,
    citations,
  };
}
/**
 * Render one quad term in N-Quads syntax: blank nodes (`_:…`) and literals
 * (anything starting with `"`) pass through, every other term is an IRI and is
 * wrapped in angle brackets.
 */
function termToNQuads(term: string): string {
  const passThrough = term.startsWith('_:') || term.startsWith('"');
  return passThrough ? term : `<${term}>`;
}

/**
 * Serialise quads as N-Quads: one statement per line, duplicates removed,
 * lines sorted, with a trailing newline. An empty input yields `''`.
 */
export function quadsToNQuads(quads: Quad[]): string {
  const statements = new Set<string>();
  for (const { subject, predicate, object, graph } of quads) {
    const terms = [termToNQuads(subject), `<${predicate}>`, termToNQuads(object)];
    if (graph) terms.push(termToNQuads(graph));
    statements.add(`${terms.join(' ')} .`);
  }
  if (statements.size === 0) return '';
  return `${[...statements].sort().join('\n')}\n`;
}

/** Reserved filenames that are NOT concepts (SPEC §3.1, §6, §7). */
export const RESERVED_FILENAMES = new Set(['index.md', 'log.md']);

/**
 * Whole-string form of the reference agent's segment pattern: first character
 * alphanumeric or underscore, then alphanumerics, underscore, dot or hyphen.
 */
const SEGMENT_RE = /^[A-Za-z0-9_][A-Za-z0-9_.\-]*$/;

/** A scheme-prefixed URL (http:, https:, mailto:, urn:, …) — never a concept link. */
const SCHEME_RE = /^[A-Za-z][A-Za-z0-9+.-]*:/;

/** Whether `segment` is usable as one path segment of a concept ID. */
export function isValidSegment(segment: string): boolean {
  return SEGMENT_RE.test(segment);
}

/** POSIX basename: everything after the last `/` (the whole path if there is none). */
export function basename(path: string): string {
  return path.slice(path.lastIndexOf('/') + 1);
}
/** Whether the path names a reserved `index.md` / `log.md`, at any depth (SPEC §3.1). */
export function isReservedFile(path: string): boolean {
  const fileName = basename(path);
  return RESERVED_FILENAMES.has(fileName);
}

/** Whether the path is a concept document: a `.md` file that is not reserved (SPEC §4). */
export function isConceptFile(path: string): boolean {
  if (!path.endsWith('.md')) return false;
  return !isReservedFile(path);
}

/**
 * Bundle-relative path → concept ID (SPEC §2): a trailing `.md` is removed and
 * the path is re-joined with `/`, dropping empty segments. Backslashes are
 * accepted as separators in case a Windows caller passes a native path.
 * `tables/blocks.md` → `tables/blocks`.
 */
export function pathToConceptId(path: string): string {
  const stem = path.replace(/\.md$/, '');
  const segments: string[] = [];
  for (const piece of stem.split(/[\\/]/)) {
    if (piece !== '') segments.push(piece);
  }
  return segments.join('/');
}

/** Concept ID → subject IRI: the ID appended to `iriBase`, so the same bundle yields the same IRI. */
export function conceptIdToIri(conceptId: string, iriBase: string = DEFAULT_IRI_BASE): string {
  return iriBase + conceptId;
}

/**
 * Node-side Knowledge Asset / assertion name for a concept.
 *
 * Asset names cannot contain `/`, while concept IDs are path-based. The
 * encoding has to be injective — mapping `/` to `__` would merge `a/b` with a
 * literal `a__b` — so the escape character itself is escaped: `_` → `_5f` and
 * `/` → `_2f` (their hex codes). `a/b` → `a_2fb`, `a__b` → `a_5f_5fb`. The RDF
 * subject IRI is unaffected and keeps the original concept ID.
 */
export function conceptIdToKaName(conceptId: string): string {
  return conceptId.replace(/[_/]/g, (ch) => (ch === '_' ? '_5f' : '_2f'));
}
/**
 * Resolve a Markdown link `href` written inside the concept `fromConceptId`
 * into a candidate target concept ID, per SPEC §5. Handles:
 * - absolute (bundle-relative): `/tables/customers.md`
 * - relative: `./other.md`, `../tables/x.md`
 * - bare-sibling: `x.md`
 * - extension-less variants: `x`, `../tables/x`
 * - `#anchor` / `?query` suffixes are stripped first
 *
 * Returns `null` for: external URLs (scheme-prefixed, or protocol-relative
 * `//host/…`), pure anchors, paths that escape the bundle root, directory
 * links, or any candidate whose segments fail the segment validation regex (so
 * it could not be a concept ID anyway).
 *
 * Note: this returns a *candidate* — whether the target actually exists in the
 * bundle is decided by the caller against the Pass-1 concept map. A candidate
 * that does not exist is a broken link, which is NOT an error (SPEC §5.3/§9).
 */
export function resolveLinkTarget(href: string, fromConceptId: string): string | null {
  // Strip anchor / query.
  let target = href.split('#')[0].split('?')[0].trim();
  if (target.length === 0) return null;
  // External URL (http:, mailto:, …) — not a concept link.
  if (SCHEME_RE.test(target)) return null;
  // `//host/path` is a network-path reference (RFC 3986 §4.2): it names another
  // host, so it must not be mistaken for a bundle-absolute path.
  if (target.startsWith('//')) return null;
  // A trailing slash denotes a directory, not a concept document.
  if (target.endsWith('/')) return null;

  let stack: string[];
  if (target.startsWith('/')) {
    // Absolute, bundle-relative.
    stack = [];
    target = target.slice(1);
  } else {
    // Relative to the linking concept's directory.
    stack = fromConceptId.split('/').slice(0, -1);
  }

  for (const part of target.split('/')) {
    if (part === '' || part === '.') continue;
    if (part === '..') {
      if (stack.length === 0) return null; // escapes the bundle root
      stack.pop();
      continue;
    }
    stack.push(part);
  }
  if (stack.length === 0) return null;

  // Drop a trailing `.md` extension (extension-less links are left as-is).
  const last = stack[stack.length - 1];
  if (last.endsWith('.md')) {
    stack[stack.length - 1] = last.slice(0, -3);
  }
  if (stack[stack.length - 1].length === 0) return null; // directory link

  // Every segment must be a valid concept-ID segment, else it can't be a concept.
  for (const seg of stack) {
    if (!isValidSegment(seg)) return null;
  }
  return stack.join('/');
}

/**
 * Public types for the OKF → DKG mapper.
 *
 * The RDF output is a deterministic array of `Quad`s. We reuse the node's
 * canonical `Quad` shape (`{ subject, predicate, object, graph? }`) so the
 * mapper's output drops straight into the importer/`/api/assertion/write`
 * path without translation.
 */

/** Canonical quad shape, structurally identical to `dkg-core`'s extraction `Quad`. */
export interface Quad {
  subject: string;
  predicate: string;
  object: string;
  graph?: string;
}

/** A parsed OKF concept document (frontmatter + body), per SPEC §4. */
export interface OkfDocument {
  /** Bundle-relative concept ID (path with `.md` removed), e.g. `tables/blocks`. */
  conceptId: string;
  /** Path segments of the concept ID, e.g. `['tables', 'blocks']`. */
  segments: string[];
  /** Parsed YAML frontmatter (empty object if none). */
  frontmatter: Record<string, unknown>;
  /** Markdown body (everything after the closing `---`). */
  body: string;
}

/** A resolved or unresolved cross-link discovered in a concept body (SPEC §5). */
export interface OkfLink {
  /** Raw link target as written in the Markdown, e.g. `../tables/blocks.md`. */
  raw: string;
  /** Resolved target concept ID if it exists in the bundle, else `null`. */
  targetConceptId: string | null;
  /** True when the link sat inside an inline code span (SPEC/CommonMark edge case). */
  inCodeSpan: boolean;
}

/** A citation captured from a `# Citations` section (SPEC §8). */
export interface OkfCitation {
  /** The cited URL (external in the wild). */
  url: string;
  /** Optional human label (numbered `[n] [label](url)` style). */
  label?: string;
}

/**
 * A deterministic, opt-in rule that types a cross-concept edge by the OKF
 * `type` of its two endpoints — no LLM, no prose parsing. e.g. a link from a
 * `BigQuery Dataset` to a `BigQuery Table` is containment (`schema:hasPart`),
 * while `BigQuery Table` → `BigQuery Table` stays an untyped reference
 * (`schema:mentions`). Both endpoint types come straight from frontmatter, so
 * this is byte-stable. Default behaviour (no rules) keeps every edge as
 * `schema:mentions`, preserving the "links are untyped per SPEC §5.3" guarantee.
 */
export interface TypeRelation {
  /** Source concept's OKF `type` (exact string, e.g. `BigQuery Dataset`). */
  from: string;
  /** Target concept's OKF `type`. */
  to: string;
  /** Predicate IRI to use for edges matching this (from,to) type pair. */
  predicate: string;
}

/** Mapping options. All deterministic; no network, no LLM. */
export interface OkfMappingOptions {
  /** IRI namespace for concept subjects. Default `urn:okf:`. */
  iriBase?: string;
  /**
   * Opt-in type-pair edge typing. Empty/undefined ⇒ every cross-concept edge is
   * `schema:mentions` (the faithful, zero-interpretation default).
   */
  typeRelations?: TypeRelation[];
  /**
   * Whether a concept link written inside an inline code span counts as an edge.
   * Default `false` — CommonMark treats code-span content as literal text, so it
   * is NOT a link. See ADR 0005 / CONTEXT.md "Flagged ambiguities".
   */
  includeCodeSpanLinks?: boolean;
  /**
   * Whether to emit `schema:isPartOf` triples reflecting the folder hierarchy.
   * Default `false` — directories are not concepts, so minting them as nodes
   * muddies the concept graph. See ADR 0005.
   */
  emitFolderHierarchy?: boolean;
}

/** Per-concept mapping result. */
export interface ConceptMapping {
  conceptId: string;
  /** Deterministic subject IRI for this concept's Knowledge Asset. */
  iri: string;
  /** Content + linkage triples for this concept. */
  quads: Quad[];
  /** Resolved concept→concept edges (subset of links). */
  resolvedLinks: OkfLink[];
  /** Links whose target is not in the bundle (warned, never fatal — SPEC §5.3/§9). */
  brokenLinks: OkfLink[];
  /** Links skipped because they sat in a code span and the option is off. */
  codeSpanLinks: OkfLink[];
  /** Citations captured (distinct from concept edges). */
  citations: OkfCitation[];
}

/** A non-fatal diagnostic surfaced during a bundle import. */
export interface OkfWarning {
  conceptId?: string;
  code: 'broken-link' | 'code-span-link' | 'missing-type' | 'reserved-skip' | 'parse';
  message: string;
}

/** Result of importing a whole bundle. */
export interface BundleImport {
  /** OKF version declared in the root `index.md`, if any (SPEC §11). */
  okfVersion: string | null;
  /** Concept ID → subject IRI map (Pass 1). */
  iriByConceptId: Record<string, string>;
  /** Per-concept mappings (Pass 2). */
  concepts: ConceptMapping[];
  /** Reserved files (`index.md` / `log.md`) skipped — never minted as KAs. */
  reservedSkipped: string[];
  /** All quads across all concepts, in concept order. */
  quads: Quad[];
  /** Non-fatal diagnostics. */
  warnings: OkfWarning[];
}

/** A single file fed to the in-memory mapper (path is bundle-relative, POSIX). */
export interface BundleFile {
  /** Bundle-relative POSIX path including extension, e.g. `tables/blocks.md`. */
  path: string;
  /** UTF-8 file contents. */
  content: string;
}
/** §9 conformance report. */
export interface ConformanceReport {
  /** `true` exactly when `errors` is empty. */
  conformant: boolean;
  /** Hard violations — only §9 rules 1–2 (parseable frontmatter + non-empty `type`) make a bundle non-conformant; reserved-file structure issues are warnings. */
  errors: string[];
  /** Things consumers MUST tolerate (§9) — surfaced as info, never failing. */
  warnings: string[];
}

// -- patch boundary: file header lines preserved verbatim --
// diff --git a/packages/okf/src/utils.ts b/packages/okf/src/utils.ts
// new file mode 100644
// index 000000000..735f55f02
// --- /dev/null
// +++ b/packages/okf/src/utils.ts
// @@ -0,0 +1,59 @@

/**
 * Small, dependency-free helpers shared by the mapper.
 *
 * `isSafeIri` is replicated verbatim from `dkg-core`'s `sparql-safe.ts` so the
 * pure mapper has zero runtime cross-package dependency and stays unit-testable
 * in isolation (the package intentionally does not import the node at runtime).
 * A test pins the behaviour against the same inputs the node validates.
 */

/**
 * A scheme (`letter` followed by letters, digits, `+`, `.` or `-`), a colon,
 * then at least one character that is not whitespace, a control character, or
 * one of `< > " { } | \ ^` or a backtick.
 */
const IRI_SCHEME_RE = /^[a-zA-Z][a-zA-Z0-9+.-]*:[^\s<>"{}|\\^`\x00-\x20]+$/;

/**
 * True when the string is a syntactically safe IRI with a scheme prefix.
 *
 * @param value Candidate IRI.
 * @returns `false` for an empty string or anything `IRI_SCHEME_RE` rejects.
 */
export function isSafeIri(value: string): boolean {
  if (!value) return false;
  return IRI_SCHEME_RE.test(value);
}

/**
 * N-Triples/quad-encoding of a plain string literal: `"escaped"`.
 *
 * Quoting and escaping are delegated to `JSON.stringify`, so the result is the
 * value wrapped in double quotes with `"`, `\` and control characters escaped.
 *
 * @param value Raw lexical value.
 * @returns The quoted, escaped literal term.
 */
export function literalTerm(value: string): string {
  return JSON.stringify(value);
}

/**
 * N-Triples/quad-encoding of a typed literal: `"lexical"^^<datatypeIri>`.
 *
 * The lexical form is escaped like {@link literalTerm}. `datatypeIri` is
 * interpolated between the angle brackets as-is; it is neither validated nor
 * escaped here.
 *
 * @param lexical Raw lexical value.
 * @param datatypeIri Datatype IRI, without angle brackets.
 * @returns The typed literal term.
 */
export function typedLiteralTerm(lexical: string, datatypeIri: string): string {
  return `${JSON.stringify(lexical)}^^<${datatypeIri}>`;
}

/**
 * `BigQuery Dataset` → `BigQueryDataset`. Splits on any non-alphanumeric run and
 * upper-cases the first letter of each word, preserving existing inner case.
 *
 * Only ASCII letters and digits count as word characters; every other
 * character acts as a separator. An input with no ASCII alphanumerics yields
 * the empty string.
 *
 * @param value Free-form label.
 * @returns The PascalCase form.
 */
export function pascalCase(value: string): string {
  return value
    .split(/[^A-Za-z0-9]+/)
    .filter(Boolean)
    .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
    .join('');
}

/**
 * `release date` → `releaseDate`, `custom_field` → `customField`. camelCase used
 * for producer-defined frontmatter keys (converges with the extractor's handling).
 *
 * Word splitting is ASCII-only, as in {@link pascalCase}. Unlike `pascalCase`,
 * an input with no ASCII alphanumerics falls back to the literal `property`.
 *
 * @param value Free-form key.
 * @returns The camelCase form, or `property` when no word survives splitting.
 */
export function camelCase(value: string): string {
  const parts = value.split(/[^A-Za-z0-9]+/).filter(Boolean);
  if (parts.length === 0) return 'property';
  return parts
    .map((p, i) =>
      i === 0
        ? p.charAt(0).toLowerCase() + p.slice(1)
        : p.charAt(0).toUpperCase() + p.slice(1),
    )
    .join('');
}

/**
 * Stable blank-node label fragment from an arbitrary concept ID.
 *
 * Every run of non-ASCII-alphanumeric characters becomes a single `_`. The
 * mapping is not injective: `a/b` and `a_b` both produce `a_b`.
 *
 * @param value Concept ID (or any string).
 * @returns The sanitized fragment.
 */
export function sanitizeForBlank(value: string): string {
  return value.replace(/[^A-Za-z0-9]+/g, '_');
}

// -- patch boundary: file header lines preserved verbatim --
// diff --git a/packages/okf/src/validation.ts b/packages/okf/src/validation.ts
// new file mode 100644
// index 000000000..ad9fccd1b
// --- /dev/null
// +++ b/packages/okf/src/validation.ts
// @@ -0,0 +1,82 @@

/**
 * OKF §9 conformance validation — deliberately permissive.
 *
 * A bundle is conformant iff:
 *   1. every non-reserved `.md` has a parseable YAML frontmatter block;
 *   2. every frontmatter has a non-empty `type`;
 *   3. reserved files (`index.md`/`log.md`) follow §6/§7 when present.
 *
 * Consumers MUST NOT reject a bundle for: missing optional fields, unknown
 * `type` values, unknown extra keys, broken cross-links, or missing `index.md`.
 * Those are surfaced as `warnings`, never `errors`. Only rules 1 and 2 produce
 * hard errors; reserved-file structure issues are reported as warnings to keep
 * the consumer lenient (see ADR 0005 / CONTEXT.md).
 */
import { parseDocument, OkfDocumentError } from './document.js';
import { isConceptFile, isReservedFile, basename, pathToConceptId } from './paths.js';
import type { BundleFile, ConformanceReport } from './types.js';

/**
 * Checks a bundle against the permissive OKF §9 conformance rules.
 *
 * Files are visited in ascending `path` order on a sorted copy, so the input
 * array is not mutated and the order of reported messages does not depend on
 * the order of `files`.
 *
 * - Reserved files only ever produce warnings: a parse failure, any
 *   frontmatter on a non-root reserved file, or root `index.md` keys other
 *   than `okf_version`.
 * - Files that are neither reserved nor concept files are ignored.
 * - Concept files produce a hard error when the document fails to parse
 *   (rule 1) or when `type` is `undefined`, `null`, or blank after
 *   `String(...).trim()` (rule 2).
 *
 * Parse exceptions are caught and turned into messages; they are not rethrown.
 *
 * @param files In-memory bundle files (bundle-relative paths).
 * @returns A report whose `conformant` flag is `true` iff no error was recorded.
 */
export function validateBundle(files: BundleFile[]): ConformanceReport {
  const errors: string[] = [];
  const warnings: string[] = [];

  // Sort a copy by plain string comparison so the report is order-independent.
  const ordered = [...files].sort((a, b) => (a.path < b.path ? -1 : a.path > b.path ? 1 : 0));

  let conceptCount = 0;
  let hasRootIndex = false;

  for (const f of ordered) {
    if (isReservedFile(f.path)) {
      const name = basename(f.path);
      const isRootIndex = f.path === 'index.md';
      if (isRootIndex) hasRootIndex = true;
      // Reserved files carry no frontmatter, except the bundle-root index.md may
      // declare only `okf_version` (§6, §11).
      try {
        const parsed = parseDocument(f.path, f.content);
        const keys = Object.keys(parsed.frontmatter);
        if (keys.length > 0) {
          if (name === 'index.md' && isRootIndex) {
            const extra = keys.filter((k) => k !== 'okf_version');
            if (extra.length > 0) {
              warnings.push(
                `root index.md declares frontmatter keys other than okf_version: ${extra.join(', ')} (§11)`,
              );
            }
          } else {
            warnings.push(`reserved file ${f.path} carries frontmatter (§6/§7 expect none)`);
          }
        }
      } catch (err) {
        // Lenient by design: a malformed reserved file never fails the bundle.
        warnings.push(
          `reserved file ${f.path} did not parse: ${err instanceof OkfDocumentError ? err.message : String(err)}`,
        );
      }
      continue;
    }

    if (!isConceptFile(f.path)) continue; // non-.md assets (viz.html, etc.) are out of scope
    conceptCount += 1;

    const conceptId = pathToConceptId(f.path);
    // Values are `unknown`: only `type` is read, and it is narrowed below.
    let frontmatter: Record<string, unknown>;
    try {
      frontmatter = parseDocument(f.path, f.content).frontmatter;
    } catch (err) {
      errors.push(
        `${conceptId}: ${err instanceof OkfDocumentError ? err.message : String(err)} (§9 rule 1)`,
      );
      continue;
    }
    const type = frontmatter.type;
    if (type === undefined || type === null || String(type).trim() === '') {
      errors.push(`${conceptId}: frontmatter has no non-empty \`type\` (§9 rule 2)`);
    }
  }

  if (conceptCount === 0) warnings.push('bundle contains no concept documents');
  if (!hasRootIndex) warnings.push('bundle has no root index.md (permitted — §9)');

  return { conformant: errors.length === 0, errors, warnings };
}

// -- patch boundary: file header lines preserved verbatim; the next file starts here --
// diff --git a/packages/okf/test/bundle.test.ts b/packages/okf/test/bundle.test.ts
// new file mode 100644
// index 000000000..7b6a27816
// --- /dev/null
// +++ b/packages/okf/test/bundle.test.ts
// @@ -0,0 +1,181 @@
import { describe, it, expect } from 'vitest';
import { fileURLToPath } from 'node:url';
import { importBundle, exportBundle, loadBundleDir, quadsToNQuads } from '../src/index.js';
import type { BundleImport } from '../src/index.js';

const fixtureDir = fileURLToPath(new URL('./fixtures/crypto_bitcoin', import.meta.url));
const files = loadBundleDir(fixtureDir);

/** Outgoing untyped edges (schema:mentions) for a concept, as target concept IDs. */
*/ +function edges(r: BundleImport, conceptId: string): string[] { + const c = r.concepts.find((x) => x.conceptId === conceptId); + if (!c) throw new Error(`no concept ${conceptId}`); + return [...new Set(c.resolvedLinks.map((l) => l.targetConceptId!))].sort(); +} + +describe('crypto_bitcoin golden import (§4.1)', () => { + const r = importBundle(files); + + it('mints exactly 5 Knowledge Assets, zero for the 3 reserved index.md', () => { + expect(r.concepts.map((c) => c.conceptId).sort()).toEqual([ + 'datasets/crypto_bitcoin', + 'tables/blocks', + 'tables/inputs', + 'tables/outputs', + 'tables/transactions', + ]); + expect(r.reservedSkipped.sort()).toEqual(['datasets/index.md', 'index.md', 'tables/index.md']); + }); + + it('derives IRIs deterministically from concept IDs', () => { + expect(r.iriByConceptId).toMatchObject({ + 'datasets/crypto_bitcoin': 'urn:okf:datasets/crypto_bitcoin', + 'tables/blocks': 'urn:okf:tables/blocks', + 'tables/transactions': 'urn:okf:tables/transactions', + 'tables/inputs': 'urn:okf:tables/inputs', + 'tables/outputs': 'urn:okf:tables/outputs', + }); + }); + + it('emits the expected frontmatter triples per concept', () => { + const ds = r.concepts.find((c) => c.conceptId === 'datasets/crypto_bitcoin')!; + const has = (predicate: string, object: string) => + ds.quads.some((q) => q.predicate === predicate && q.object === object); + expect(has('http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://schema.org/BigQueryDataset')).toBe(true); + expect(has('http://schema.org/name', '"Cryptocurrency Bitcoin"')).toBe(true); + expect( + has( + 'http://schema.org/url', + 'https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin', + ), + ).toBe(true); + expect( + ds.quads.some( + (q) => + q.predicate === 'http://schema.org/dateModified' && + q.object === '"2026-05-28T22:44:47+00:00"^^', + ), + ).toBe(true); + // tables/blocks is BigQuery Table + const blocks = r.concepts.find((c) => c.conceptId === 
'tables/blocks')!; + expect( + blocks.quads.some( + (q) => + q.predicate === 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' && + q.object === 'http://schema.org/BigQueryTable', + ), + ).toBe(true); + }); + + it('reconstructs the untyped relationship graph from prose links', () => { + expect(edges(r, 'datasets/crypto_bitcoin')).toEqual([ + 'tables/blocks', + 'tables/inputs', + 'tables/outputs', + 'tables/transactions', + ]); + expect(edges(r, 'tables/transactions')).toEqual([ + 'datasets/crypto_bitcoin', + 'tables/blocks', + 'tables/inputs', + 'tables/outputs', + ]); + expect(edges(r, 'tables/inputs')).toEqual([ + 'datasets/crypto_bitcoin', + 'tables/outputs', + 'tables/transactions', + ]); + // blocks: no concept edges (external citations only) + expect(edges(r, 'tables/blocks')).toEqual([]); + }); + + it('documents the in-code-span edge case: outputs has NO concept edges by default', () => { + expect(edges(r, 'tables/outputs')).toEqual([]); + const outputs = r.concepts.find((c) => c.conceptId === 'tables/outputs')!; + expect(outputs.codeSpanLinks.map((l) => l.raw)).toEqual(['transactions.md', 'inputs.md']); + }); + + it('captures both citation styles as citation triples, not concept edges', () => { + const blocks = r.concepts.find((c) => c.conceptId === 'tables/blocks')!; + // bare-bullet style + expect(blocks.citations.map((c) => c.url)).toEqual([ + 'https://github.com/blockchain-etl/bitcoin-etl', + ]); + const ds = r.concepts.find((c) => c.conceptId === 'datasets/crypto_bitcoin')!; + // numbered style + expect(ds.citations.map((c) => c.url)).toContain( + 'https://cloud.google.com/blog/products/gcp/bitcoin-in-bigquery-blockchain-analytics-on-public-data', + ); + expect( + ds.quads.some( + (q) => + q.predicate === 'http://schema.org/citation' && + q.object.startsWith('https://bigquery.googleapis.com'), + ), + ).toBe(true); + }); + + it('produces byte-identical N-Quads across two runs (determinism)', () => { + const a = quadsToNQuads(importBundle(files).quads); 
+ const b = quadsToNQuads(importBundle(files).quads); + expect(a).toBe(b); + expect(a.length).toBeGreaterThan(0); + }); + + it('opting into code-span links adds the two outputs edges', () => { + const withCode = importBundle(files, { includeCodeSpanLinks: true }); + expect(edges(withCode, 'tables/outputs')).toEqual(['tables/inputs', 'tables/transactions']); + }); +}); + +describe('opt-in type-pair edge relations (deterministic, no LLM)', () => { + const HAS_PART = 'http://schema.org/hasPart'; + const MENTIONS = 'http://schema.org/mentions'; + // Dataset→Table is containment; Table→Table stays an untyped reference. + const rules = [{ from: 'BigQuery Dataset', to: 'BigQuery Table', predicate: HAS_PART }]; + + const objectsByPredicate = (r: BundleImport, conceptId: string, predicate: string) => + r.concepts + .find((c) => c.conceptId === conceptId)! + .quads.filter((q) => q.predicate === predicate) + .map((q) => q.object) + .sort(); + + it('default (no rules) keeps every edge as schema:mentions', () => { + const r = importBundle(files); + expect(objectsByPredicate(r, 'datasets/crypto_bitcoin', HAS_PART)).toEqual([]); + expect(objectsByPredicate(r, 'datasets/crypto_bitcoin', MENTIONS).length).toBeGreaterThan(0); + }); + + it('dataset→table becomes hasPart; table→table stays mentions', () => { + const r = importBundle(files, { typeRelations: rules }); + // The dataset's links to its 4 tables are now containment edges... + expect(objectsByPredicate(r, 'datasets/crypto_bitcoin', HAS_PART).length).toBe(4); + expect(objectsByPredicate(r, 'datasets/crypto_bitcoin', MENTIONS)).toEqual([]); + // ...while transactions→(other tables) remain untyped references. 
+ expect(objectsByPredicate(r, 'tables/transactions', HAS_PART)).toEqual([]); + expect(objectsByPredicate(r, 'tables/transactions', MENTIONS).length).toBeGreaterThan(0); + }); + + it('a rule that matches no endpoint pair leaves all edges as mentions', () => { + const r = importBundle(files, { + typeRelations: [{ from: 'Nonexistent', to: 'AlsoMissing', predicate: HAS_PART }], + }); + expect(objectsByPredicate(r, 'datasets/crypto_bitcoin', HAS_PART)).toEqual([]); + expect(objectsByPredicate(r, 'datasets/crypto_bitcoin', MENTIONS).length).toBeGreaterThan(0); + }); + + it('is byte-stable with rules applied', () => { + const a = quadsToNQuads(importBundle(files, { typeRelations: rules }).quads); + const b = quadsToNQuads(importBundle(files, { typeRelations: rules }).quads); + expect(a).toBe(b); + }); + + it('hasPart edges round-trip through export as plain (untyped) OKF links', () => { + // OKF can't express the relation type, so export emits a link and a plain + // re-import (no rules) reads it back as mentions — consistent with SPEC §5.3. 
+ const first = importBundle(files, { typeRelations: rules }); + const second = importBundle(exportBundle(first)); + expect(edges(second, 'datasets/crypto_bitcoin').length).toBe(4); + }); +}); diff --git a/packages/okf/test/document.test.ts b/packages/okf/test/document.test.ts new file mode 100644 index 000000000..b6337c492 --- /dev/null +++ b/packages/okf/test/document.test.ts @@ -0,0 +1,45 @@ +import { describe, it, expect } from 'vitest'; +import { parseDocument, OkfDocumentError } from '../src/index.js'; + +describe('parseDocument (mirrors document.py OKFDocument.parse)', () => { + it('treats a file with no leading --- as all body (reserved files)', () => { + const doc = parseDocument('index.md', '# Subdirectories\n\n* [a](a.md)\n'); + expect(doc.frontmatter).toEqual({}); + expect(doc.body).toBe('# Subdirectories\n\n* [a](a.md)\n'); + }); + + it('splits frontmatter and strips exactly one leading body newline', () => { + const doc = parseDocument( + 'tables/blocks.md', + '---\ntype: BigQuery Table\ntitle: Blocks\n---\n\nBody line one\n', + ); + expect(doc.conceptId).toBe('tables/blocks'); + expect(doc.segments).toEqual(['tables', 'blocks']); + expect(doc.frontmatter).toEqual({ type: 'BigQuery Table', title: 'Blocks' }); + expect(doc.body).toBe('\nBody line one\n'.slice(1)); // one leading \n removed + expect(doc.body.startsWith('Body line one')).toBe(true); + }); + + it('closes the block on a --- with trailing whitespace', () => { + const doc = parseDocument('a.md', '---\ntype: X\n--- \nbody'); + expect(doc.frontmatter).toEqual({ type: 'X' }); + expect(doc.body).toBe('body'); + }); + + it('throws on an unterminated frontmatter block', () => { + expect(() => parseDocument('a.md', '---\ntype: X\nbody without close')).toThrow( + OkfDocumentError, + ); + }); + + it('throws when frontmatter is not a YAML mapping', () => { + expect(() => parseDocument('a.md', '---\n- just\n- a\n- list\n---\nbody')).toThrow( + /must be a YAML mapping/, + ); + }); + + it('reads 
okf_version from a root index.md', () => { + const doc = parseDocument('index.md', '---\nokf_version: "0.1"\n---\n\n# Root\n'); + expect(doc.frontmatter.okf_version).toBe('0.1'); + }); +}); diff --git a/packages/okf/test/edge-cases.test.ts b/packages/okf/test/edge-cases.test.ts new file mode 100644 index 000000000..c8a554550 --- /dev/null +++ b/packages/okf/test/edge-cases.test.ts @@ -0,0 +1,85 @@ +import { describe, it, expect } from 'vitest'; +import { fileURLToPath } from 'node:url'; +import { importBundle, loadBundleDir, validateBundle } from '../src/index.js'; + +const links = loadBundleDir(fileURLToPath(new URL('./fixtures/synthetic_links', import.meta.url))); +const edge = loadBundleDir(fileURLToPath(new URL('./fixtures/edge_cases', import.meta.url))); + +describe('synthetic link-forms bundle (exercises forms crypto_bitcoin lacks)', () => { + const r = importBundle(links); + const edgesOf = (id: string) => + [...new Set(r.concepts.find((c) => c.conceptId === id)!.resolvedLinks.map((l) => l.targetConceptId))].sort(); + + it('resolves absolute, relative, parent-relative, bare-sibling and extension-less links', () => { + expect(edgesOf('hub')).toEqual(['beta', 'tables/alpha', 'tables/gamma']); + expect(edgesOf('tables/alpha')).toEqual(['beta', 'hub']); // ../hub.md and /beta.md + expect(edgesOf('beta')).toEqual(['hub', 'tables/alpha']); + }); + + it('records a broken cross-link as a warning, never an error', () => { + const hub = r.concepts.find((c) => c.conceptId === 'hub')!; + expect(hub.brokenLinks.map((l) => l.raw)).toContain('tables/does_not_exist.md'); + expect(r.warnings.some((w) => w.code === 'broken-link')).toBe(true); + }); +}); + +describe('edge-case bundle (type-only, unknown keys, broken link, log.md)', () => { + const r = importBundle(edge); + + it('reads okf_version from the root index.md', () => { + expect(r.okfVersion).toBe('0.1'); + }); + + it('maps a type-only concept with just rdf:type (graceful degradation)', () => { + const t = 
r.concepts.find((c) => c.conceptId === 'type_only')!; + const types = t.quads.filter( + (q) => q.predicate === 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type', + ); + expect(types).toHaveLength(1); + expect(t.quads.some((q) => q.predicate === 'http://schema.org/name')).toBe(false); + }); + + it('preserves producer-defined keys and never drops them', () => { + const x = r.concepts.find((c) => c.conceptId === 'extras')!; + expect(x.quads.some((q) => q.predicate === 'http://schema.org/owner' && q.object === '"Alice"')).toBe(true); + expect( + x.quads.some( + (q) => + q.predicate === 'http://schema.org/priority' && + q.object === '"3"^^', + ), + ).toBe(true); + }); + + it('does not mint log.md or index.md as Knowledge Assets', () => { + expect(r.reservedSkipped.sort()).toEqual(['index.md', 'log.md']); + expect(r.concepts.map((c) => c.conceptId).sort()).toEqual(['extras', 'type_only']); + }); + + it('warns on the broken link but stays conformant', () => { + expect(r.warnings.some((w) => w.code === 'broken-link')).toBe(true); + expect(validateBundle(edge).conformant).toBe(true); + }); +}); + +describe('§9 conformance validation', () => { + it('accepts the crypto_bitcoin bundle', () => { + const cb = loadBundleDir(fileURLToPath(new URL('./fixtures/crypto_bitcoin', import.meta.url))); + expect(validateBundle(cb).conformant).toBe(true); + }); + + it('flags a concept missing a non-empty type as non-conformant (§9 rule 2)', () => { + const report = validateBundle([ + { path: 'bad.md', content: '---\ntitle: No type here\n---\nbody' }, + ]); + expect(report.conformant).toBe(false); + expect(report.errors.join(' ')).toMatch(/non-empty `type`/); + }); + + it('does NOT reject for broken links, unknown keys, or missing index.md (§9)', () => { + const report = validateBundle([ + { path: 'a.md', content: '---\ntype: T\nweird_key: v\n---\n[broken](missing.md)' }, + ]); + expect(report.conformant).toBe(true); + }); +}); diff --git 
a/packages/okf/test/fixtures/crypto_bitcoin.ATTRIBUTION.md b/packages/okf/test/fixtures/crypto_bitcoin.ATTRIBUTION.md new file mode 100644 index 000000000..47dc0ee21 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin.ATTRIBUTION.md @@ -0,0 +1,17 @@ +# Attribution + +This directory vendors Google's `crypto_bitcoin` OKF bundle verbatim, used as the +golden acceptance fixture for the OKF → DKG importer. + +- **Source:** `GoogleCloudPlatform/knowledge-catalog`, path `okf/bundles/crypto_bitcoin/` +- **Pinned commit:** `d44368c15e38e7c92481c5992e4f9b5b421a801d` +- **Upstream:** https://github.com/GoogleCloudPlatform/knowledge-catalog/tree/d44368c15e38e7c92481c5992e4f9b5b421a801d/okf/bundles/crypto_bitcoin +- **License:** Apache-2.0 (© Google LLC) + +The `.md` files are unmodified copies. `viz.html` (a non-concept rendering asset) +is intentionally not vendored — the importer consumes only the Markdown concepts. +The bundle is the public `bigquery-public-data.crypto_bitcoin` dataset description +produced by the open-source `bitcoin-etl` pipeline. + +Per OKF SPEC §3, the bundle ships **no** verification, provenance, or ownership +layer; that is exactly what the DKG integration adds. diff --git a/packages/okf/test/fixtures/crypto_bitcoin/datasets/crypto_bitcoin.md b/packages/okf/test/fixtures/crypto_bitcoin/datasets/crypto_bitcoin.md new file mode 100644 index 000000000..202919075 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/datasets/crypto_bitcoin.md @@ -0,0 +1,55 @@ +--- +type: BigQuery Dataset +resource: https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin +title: Cryptocurrency Bitcoin +description: This BigQuery public dataset contains a complete history of the Bitcoin + blockchain and updates every 10 minutes. 
+tags: +- cryptocurrency +- bitcoin +- blockchain +- public data +- gcp +- data-analytics +timestamp: '2026-05-28T22:44:47+00:00' +--- + +The `crypto_bitcoin` dataset provides a comprehensive and up-to-date record of the entire Bitcoin blockchain. It includes detailed information about [blocks](../tables/blocks.md), [transactions](../tables/transactions.md), transaction [inputs](../tables/inputs.md), and [outputs](../tables/outputs.md). This dataset is part of the BigQuery Public Datasets program, making it freely accessible for analysis and research into Bitcoin's operations, economics, and historical trends. Researchers, developers, and enthusiasts can use this data to understand transaction patterns, network activity, and the overall state of the Bitcoin network. + +# Schema + +This dataset contains the following tables, providing a complete history of the Bitcoin blockchain: + +* [blocks](../tables/blocks.md) +* [inputs](../tables/inputs.md) +* [outputs](../tables/outputs.md) +* [transactions](../tables/transactions.md) + +# Common query patterns + +```sql +-- Count the total number of blocks in the Bitcoin blockchain +SELECT + COUNT(*) +FROM + `bigquery-public-data.crypto_bitcoin.blocks`; +``` + +```sql +-- Get the total number of transactions over time +SELECT + DATE(block_timestamp) AS transaction_date, + COUNT(transaction_id) AS total_transactions +FROM + `bigquery-public-data.crypto_bitcoin.transactions` +GROUP BY + transaction_date +ORDER BY + transaction_date DESC +LIMIT 100; +``` + +# Citations + +[1] [BigQuery Public Dataset: crypto_bitcoin](https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin) +[2] [Bitcoin in BigQuery: blockchain analytics on public data](https://cloud.google.com/blog/products/gcp/bitcoin-in-bigquery-blockchain-analytics-on-public-data) diff --git a/packages/okf/test/fixtures/crypto_bitcoin/datasets/index.md b/packages/okf/test/fixtures/crypto_bitcoin/datasets/index.md new file mode 100644 index 
000000000..9f118b6d7 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/datasets/index.md @@ -0,0 +1,3 @@ +# BigQuery Dataset + +* [Cryptocurrency Bitcoin](crypto_bitcoin.md) - This BigQuery public dataset contains a complete history of the Bitcoin blockchain and updates every 10 minutes. diff --git a/packages/okf/test/fixtures/crypto_bitcoin/index.md b/packages/okf/test/fixtures/crypto_bitcoin/index.md new file mode 100644 index 000000000..6e4525206 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/index.md @@ -0,0 +1,4 @@ +# Subdirectories + +* [datasets](datasets/index.md) - This BigQuery public dataset contains a complete history of the Bitcoin blockchain and updates every 10 minutes. +* [tables](tables/index.md) - This directory contains BigQuery table details for Bitcoin blocks, transactions, inputs, and outputs. diff --git a/packages/okf/test/fixtures/crypto_bitcoin/tables/blocks.md b/packages/okf/test/fixtures/crypto_bitcoin/tables/blocks.md new file mode 100644 index 000000000..feea27640 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/tables/blocks.md @@ -0,0 +1,32 @@ +--- +type: BigQuery Table +resource: https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/blocks +title: Bitcoin Blocks Table +description: Details about the Bitcoin Blocks BigQuery table, including its schema. 
+tags: +- bitcoin +- bigquery +- blocks +- blockchain +timestamp: '2026-05-28T22:43:59+00:00' +--- + +# Schema + +| Field | Type | +| --- | --- | +| hash | hex_string | +| size | bigint | +| stripped_size | bigint | +| weight | bigint | +| number | bigint | +| version | bigint | +| merkle_root | hex_string | +| timestamp | bigint | +| nonce | hex_string | +| bits | hex_string | +| coinbase_param | hex_string | +| transaction_count | bigint | + +# Citations +- https://github.com/blockchain-etl/bitcoin-etl diff --git a/packages/okf/test/fixtures/crypto_bitcoin/tables/index.md b/packages/okf/test/fixtures/crypto_bitcoin/tables/index.md new file mode 100644 index 000000000..7583606c3 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/tables/index.md @@ -0,0 +1,6 @@ +# BigQuery Table + +* [Bitcoin Blocks Table](blocks.md) - Details about the Bitcoin Blocks BigQuery table, including its schema. +* [Bitcoin Transaction Inputs](inputs.md) - Details about transaction inputs on the Bitcoin blockchain. +* [Bitcoin Transactions](transactions.md) - A comprehensive table detailing all transactions on the Bitcoin blockchain. +* [Outputs](outputs.md) - Outputs from all transactions in the Bitcoin blockchain. diff --git a/packages/okf/test/fixtures/crypto_bitcoin/tables/inputs.md b/packages/okf/test/fixtures/crypto_bitcoin/tables/inputs.md new file mode 100644 index 000000000..f3160d679 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/tables/inputs.md @@ -0,0 +1,65 @@ +--- +type: BigQuery Table +resource: https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/inputs +title: Bitcoin Transaction Inputs +description: Details about transaction inputs on the Bitcoin blockchain. 
+tags: +- bitcoin +- blockchain +- cryptocurrency +- transactions +- inputs +- etl +timestamp: '2026-05-28T22:44:24+00:00' +--- + +This table, part of the public [crypto_bitcoin](../datasets/crypto_bitcoin.md) dataset, contains detailed information about every input used in Bitcoin transactions. Each row represents a single transaction input, which typically references an unspent output from a previous transaction. This table is crucial for tracing the flow of Bitcoin and understanding the history of transactions. It records where the coins originated (`spent_transaction_hash` and `spent_output_index`) and the associated `value` transferred. This table can be joined with the [transactions](transactions.md) table on `transaction_hash` and [outputs](outputs.md) to reconstruct the full transaction graph. + +# Schema + +* `transaction_hash`: STRING +* `block_hash`: STRING +* `block_number`: INTEGER +* `block_timestamp`: TIMESTAMP +* `index`: INTEGER +* `spent_transaction_hash`: STRING +* `spent_output_index`: INTEGER +* `script_asm`: STRING +* `script_hex`: STRING +* `sequence`: INTEGER +* `required_signatures`: INTEGER +* `type`: STRING +* `addresses`: REPEATED STRING +* `value`: NUMERIC + +# Common query patterns + +```sql +SELECT * +FROM `bigquery-public-data.crypto_bitcoin.inputs` +WHERE transaction_hash = 'YOUR_TRANSACTION_HASH_HERE' +LIMIT 10 +``` + +```sql +SELECT + block_number, + SUM(value) AS total_input_value +FROM `bigquery-public-data.crypto_bitcoin.inputs` +WHERE block_number = 600000 -- Example block number +GROUP BY block_number +``` + +```sql +SELECT DISTINCT + address +FROM `bigquery-public-data.crypto_bitcoin.inputs`, + UNNEST(addresses) AS address +WHERE block_timestamp >= '2023-01-01' +LIMIT 10 +``` + +# Citations + +[1] [BigQuery Table: inputs](https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/inputs) +[2] [blockchain-etl/bitcoin-etl](https://github.com/blockchain-etl/bitcoin-etl) diff --git 
a/packages/okf/test/fixtures/crypto_bitcoin/tables/outputs.md b/packages/okf/test/fixtures/crypto_bitcoin/tables/outputs.md new file mode 100644 index 000000000..b2121b717 --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/tables/outputs.md @@ -0,0 +1,68 @@ +--- +type: BigQuery Table +resource: https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/outputs +title: Outputs +description: Outputs from all transactions in the Bitcoin blockchain. +tags: +- bitcoin +- blockchain +- transactions +- outputs +- etl +timestamp: '2026-05-28T22:44:32+00:00' +--- + +The `outputs` table contains records of all transaction outputs within the Bitcoin blockchain. Each row in this table represents a single output from a Bitcoin transaction, detailing the amount transferred, the destination addresses, and other script-related information. This table is crucial for understanding the flow of Bitcoin and analyzing transaction patterns, especially when linked with the `[transactions](transactions.md)` and `[inputs](inputs.md)` tables. + +# Schema + +- `transaction_hash`: The hash of the transaction this output belongs to. +- `block_hash`: The hash of the block containing this transaction. +- `block_number`: The number of the block containing this transaction. +- `block_timestamp`: The timestamp of the block containing this transaction. +- `index`: The zero-based index of this output within its transaction. +- `script_asm`: The script in assembly format. +- `script_hex`: The script in hexadecimal format. +- `required_signatures`: The number of signatures required to spend this output. +- `type`: The type of the output script. +- `addresses`: (REPEATED) Array of destination addresses for this output. +- `value`: The value of the output in satoshis. 
+ +# Common query patterns + +```sql +SELECT + t.* +FROM + `bigquery-public-data.crypto_bitcoin.outputs` AS t +WHERE + t.transaction_hash = 'some_transaction_hash' +``` + +```sql +SELECT + SUM(t.value) AS total_output_value +FROM + `bigquery-public-data.crypto_bitcoin.outputs` AS t +WHERE + t.block_number = 123456 +``` + +```sql +SELECT + t.type, + COUNT(*) AS output_count +FROM + `bigquery-public-data.crypto_bitcoin.outputs` AS t +WHERE + t.block_timestamp BETWEEN TIMESTAMP('2023-01-01') AND TIMESTAMP('2023-01-31') +GROUP BY + t.type +ORDER BY + output_count DESC +``` + +# Citations + +[1] [Outputs Table](https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/outputs) +[2] [Bitcoin ETL on GitHub](https://github.com/blockchain-etl/bitcoin-etl) diff --git a/packages/okf/test/fixtures/crypto_bitcoin/tables/transactions.md b/packages/okf/test/fixtures/crypto_bitcoin/tables/transactions.md new file mode 100644 index 000000000..1ec7471bb --- /dev/null +++ b/packages/okf/test/fixtures/crypto_bitcoin/tables/transactions.md @@ -0,0 +1,112 @@ +--- +type: BigQuery Table +resource: https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/transactions +title: Bitcoin Transactions +description: A comprehensive table detailing all transactions on the Bitcoin blockchain. +tags: +- bitcoin +- blockchain +- transactions +- crypto +- public data +- etl +timestamp: '2026-05-28T22:45:04+00:00' +--- + +The `transactions` table in the [crypto_bitcoin](../datasets/crypto_bitcoin.md) dataset provides a complete record of every transaction ever processed on the Bitcoin blockchain. Each row represents a single transaction, offering granular details such as its hash, size, associated [block](blocks.md) information (hash, number, timestamp), and the total input and output values. 
Importantly, it includes detailed arrays for both [inputs](inputs.md) and [outputs](outputs.md), each specifying spent transaction details, script information, involved addresses, and values. This table is essential for in-depth analysis of transaction flows, tracing funds, and understanding the economic activity within the Bitcoin network. The grain is one row per transaction, with data spanning the entire history of the Bitcoin blockchain, partitioned by `block_timestamp_month`. + +# Schema +- `hash` STRING REQUIRED: The hash of this transaction +- `size` INTEGER: The size of this transaction in bytes +- `virtual_size` INTEGER: The virtual transaction size (differs from size for witness transactions) +- `version` INTEGER: Protocol version specified in block which contained this transaction +- `lock_time` INTEGER: Earliest time that miners can include the transaction in their hashing of the Merkle root to attach it in the latest block of the blockchain +- `block_hash` STRING REQUIRED: Hash of the block which contains this transaction +- `block_number` INTEGER REQUIRED: Number of the block which contains this transaction +- `block_timestamp` TIMESTAMP REQUIRED: Timestamp of the block which contains this transaction +- `block_timestamp_month` DATE REQUIRED: Month of the block which contains this transaction +- `input_count` INTEGER: The number of inputs in the transaction +- `output_count` INTEGER: The number of outputs in the transaction +- `input_value` NUMERIC: Total value of inputs in the transaction +- `output_value` NUMERIC: Total value of outputs in the transaction +- `is_coinbase` BOOLEAN: True if this transaction is a coinbase transaction +- `fee` NUMERIC: The fee paid by this transaction +- `inputs` RECORD REPEATED: Transaction inputs + - `index` INTEGER REQUIRED: 0-indexed number of an input within a transaction + - `spent_transaction_hash` STRING: The hash of the transaction which contains the output that this input spends + - `spent_output_index` 
INTEGER: The index of the output this input spends + - `script_asm` STRING: Symbolic representation of the bitcoin's script language op-codes + - `script_hex` STRING: Hexadecimal representation of the bitcoin's script language op-codes + - `sequence` INTEGER: A number intended to allow unconfirmed time-locked transactions to be updated before being finalized + - `required_signatures` INTEGER: The number of signatures required to authorize the spent output + - `type` STRING: The address type of the spent output + - `addresses` STRING REPEATED: Addresses which own the spent output + - `value` NUMERIC: The value in base currency attached to the spent output +- `outputs` RECORD REPEATED: Transaction outputs + - `index` INTEGER REQUIRED: 0-indexed number of an output within a transaction + - `script_asm` STRING: Symbolic representation of the bitcoin's script language op-codes + - `script_hex` STRING: Hexadecimal representation of the bitcoin's script language op-codes + - `required_signatures` INTEGER: The number of signatures required to authorize spending of this output + - `type` STRING: The address type of the output + - `addresses` STRING REPEATED: Addresses which own this output + - `value` NUMERIC: The value in base currency attached to this output + +# Common query patterns +```sql +-- Get the total number of transactions per day +SELECT + DATE(block_timestamp) AS transaction_date, + COUNT(hash) AS transaction_count +FROM + `bigquery-public-data.crypto_bitcoin.transactions` +WHERE + block_timestamp BETWEEN '2023-01-01' AND '2023-01-31' +GROUP BY + transaction_date +ORDER BY + transaction_date DESC; +``` +```sql +-- Find transactions involving a specific address as an output +SELECT + t.hash AS transaction_hash, + t.block_timestamp +FROM + `bigquery-public-data.crypto_bitcoin.transactions` AS t, + UNNEST(t.outputs) AS output +WHERE + '1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa' IN UNNEST(output.addresses) +LIMIT 10; +``` +```sql +-- Calculate the total fees collected in 
a given month +SELECT + FORMAT_TIMESTAMP('%Y-%m', block_timestamp) AS transaction_month, + SUM(fee) AS total_fee +FROM + `bigquery-public-data.crypto_bitcoin.transactions` +WHERE + block_timestamp BETWEEN '2023-01-01' AND '2023-01-31' +GROUP BY + transaction_month; +``` +```sql +-- Find duplicate transactions (anomaly detection) +SELECT +   * +FROM ( + SELECT +   hash, +   COUNT(hash) AS dup_transaction_count + FROM +   `bigquery-public-data.crypto_bitcoin.transactions` + GROUP BY +   hash) +WHERE + dup_transaction_count > 1 +``` + +# Citations +[1] [Bitcoin Transactions](https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin/tables/transactions) +[2] [Bitcoin ETL](https://github.com/blockchain-etl/bitcoin-etl) +[3] [Bitcoin in BigQuery: blockchain analytics on public data](https://cloud.google.com/blog/products/gcp/bitcoin-in-bigquery-blockchain-analytics-on-public-data) diff --git a/packages/okf/test/fixtures/edge_cases/extras.md b/packages/okf/test/fixtures/edge_cases/extras.md new file mode 100644 index 000000000..450ba9bb6 --- /dev/null +++ b/packages/okf/test/fixtures/edge_cases/extras.md @@ -0,0 +1,13 @@ +--- +type: Thing +title: Extras +description: Carries producer-defined keys and an unknown type-like key plus a broken link. +owner: Alice +review status: pending +priority: 3 +timestamp: '2026-02-02T12:00:00+00:00' +--- + +This concept links to a target that does not exist in the bundle: +[ghost](ghost.md) — a broken cross-link, which is NOT an error (SPEC §5.3/§9). +It also links to the real [type only](type_only.md) concept. 
diff --git a/packages/okf/test/fixtures/edge_cases/index.md b/packages/okf/test/fixtures/edge_cases/index.md new file mode 100644 index 000000000..bee913e73 --- /dev/null +++ b/packages/okf/test/fixtures/edge_cases/index.md @@ -0,0 +1,8 @@ +--- +okf_version: "0.1" +--- + +# Edge cases + +* [Type only](type_only.md) - a concept with only the required `type` +* [Extras](extras.md) - unknown frontmatter keys and a broken link diff --git a/packages/okf/test/fixtures/edge_cases/log.md b/packages/okf/test/fixtures/edge_cases/log.md new file mode 100644 index 000000000..b80f33a1e --- /dev/null +++ b/packages/okf/test/fixtures/edge_cases/log.md @@ -0,0 +1,9 @@ +# Log + +## 2026-02-02 + +**Update** - added the `extras` concept with producer-defined keys. + +## 2026-01-01 + +**Creation** - bundle created with a single `type_only` concept. diff --git a/packages/okf/test/fixtures/edge_cases/type_only.md b/packages/okf/test/fixtures/edge_cases/type_only.md new file mode 100644 index 000000000..d387bedf3 --- /dev/null +++ b/packages/okf/test/fixtures/edge_cases/type_only.md @@ -0,0 +1,7 @@ +--- +type: Thing +--- + +A conformant concept that declares only the required `type` field — every +optional field (title, description, resource, tags, timestamp) is absent. A +consumer must accept it and degrade gracefully (SPEC §4.1/§9). diff --git a/packages/okf/test/fixtures/synthetic_links/beta.md b/packages/okf/test/fixtures/synthetic_links/beta.md new file mode 100644 index 000000000..74e0a95ee --- /dev/null +++ b/packages/okf/test/fixtures/synthetic_links/beta.md @@ -0,0 +1,8 @@ +--- +type: Leaf +title: Beta +description: A sibling concept linked by bare-sibling form, links back to hub with parent-relative path. +timestamp: '2026-01-01T00:00:00+00:00' +--- + +Beta links back to [hub](hub.md) and down to [alpha](tables/alpha.md). 
diff --git a/packages/okf/test/fixtures/synthetic_links/hub.md b/packages/okf/test/fixtures/synthetic_links/hub.md new file mode 100644 index 000000000..757448902 --- /dev/null +++ b/packages/okf/test/fixtures/synthetic_links/hub.md @@ -0,0 +1,18 @@ +--- +type: Hub +title: Link Hub +description: A concept that links to others using every supported link form. +timestamp: '2026-01-01T00:00:00+00:00' +--- + +This hub links with an absolute bundle-relative path to [alpha](/tables/alpha.md), +a parent-relative-free sibling [beta](beta.md), an explicit relative +[alpha again](./tables/alpha.md), a parent-relative back-reference from a child is +covered elsewhere, an extension-less link to [gamma](tables/gamma), and a broken +link to [missing](tables/does_not_exist.md). + +It also mentions a link inside a code span: `[beta](beta.md)`, which CommonMark +treats as literal text. + +# Citations +[1] [OKF spec](https://github.com/GoogleCloudPlatform/knowledge-catalog/blob/main/okf/SPEC.md) diff --git a/packages/okf/test/fixtures/synthetic_links/index.md b/packages/okf/test/fixtures/synthetic_links/index.md new file mode 100644 index 000000000..f5f920a4a --- /dev/null +++ b/packages/okf/test/fixtures/synthetic_links/index.md @@ -0,0 +1,6 @@ +# Synthetic link-forms bundle + +Exercises every OKF §5 link form the real `crypto_bitcoin` bundle does not. + +* [hub](hub.md) - links out using all forms +* [tables](tables/index.md) - sub-directory diff --git a/packages/okf/test/fixtures/synthetic_links/tables/alpha.md b/packages/okf/test/fixtures/synthetic_links/tables/alpha.md new file mode 100644 index 000000000..0977057d8 --- /dev/null +++ b/packages/okf/test/fixtures/synthetic_links/tables/alpha.md @@ -0,0 +1,9 @@ +--- +type: Leaf +title: Alpha +description: A leaf concept in a subdirectory; uses an absolute link and a parent-relative link. 
+timestamp: '2026-01-01T00:00:00+00:00' +--- + +Alpha references the [hub](../hub.md) using a parent-relative path, and the +[beta](/beta.md) concept using an absolute bundle-relative path. diff --git a/packages/okf/test/fixtures/synthetic_links/tables/gamma.md b/packages/okf/test/fixtures/synthetic_links/tables/gamma.md new file mode 100644 index 000000000..6ab9b878d --- /dev/null +++ b/packages/okf/test/fixtures/synthetic_links/tables/gamma.md @@ -0,0 +1,8 @@ +--- +type: Leaf +title: Gamma +description: Target of an extension-less link from the hub. +timestamp: '2026-01-01T00:00:00+00:00' +--- + +Gamma is linked without a file extension from the hub. diff --git a/packages/okf/test/fixtures/synthetic_links/tables/index.md b/packages/okf/test/fixtures/synthetic_links/tables/index.md new file mode 100644 index 000000000..b92b14152 --- /dev/null +++ b/packages/okf/test/fixtures/synthetic_links/tables/index.md @@ -0,0 +1,4 @@ +# Tables + +* [Alpha](alpha.md) - leaf in a subdirectory +* [Gamma](gamma.md) - another leaf diff --git a/packages/okf/test/internals.test.ts b/packages/okf/test/internals.test.ts new file mode 100644 index 000000000..76e6f2eaa --- /dev/null +++ b/packages/okf/test/internals.test.ts @@ -0,0 +1,142 @@ +import { describe, it, expect } from 'vitest'; +import { + isSafeIri, + literalTerm, + typedLiteralTerm, + frontmatterQuads, + quadsToNQuads, + exportBundle, + importBundle, + validateBundle, + resolveLinkTarget, +} from '../src/index.js'; + +describe('term helpers', () => { + it('isSafeIri requires a scheme and rejects unsafe chars', () => { + expect(isSafeIri('https://x/y')).toBe(true); + expect(isSafeIri('urn:okf:a/b')).toBe(true); + expect(isSafeIri('not an iri')).toBe(false); + expect(isSafeIri('')).toBe(false); + expect(isSafeIri('http://x/')).toBe(false); + }); + it('escapes literals and types typed literals', () => { + expect(literalTerm('he said "hi"')).toBe('"he said \\"hi\\""'); + expect(typedLiteralTerm('3', 
'http://www.w3.org/2001/XMLSchema#integer')).toBe( + '"3"^^<http://www.w3.org/2001/XMLSchema#integer>', + ); + }); +}); + +describe('valueToTerms typing for producer-defined keys', () => { + const q = frontmatterQuads('urn:okf:x', { + type: 'T', + flag: true, + count: 7, + ratio: 1.5, + note: 'plain', + homepage: 'https://example.org/p', + nested: { a: 1 }, + list: ['a', 'b'], + }); + const obj = (predicate: string) => q.filter((x) => x.predicate === predicate).map((x) => x.object); + + it('types boolean/integer/decimal and preserves IRIs vs literals', () => { + expect(obj('http://schema.org/flag')).toEqual(['"true"^^<http://www.w3.org/2001/XMLSchema#boolean>']); + expect(obj('http://schema.org/count')).toEqual(['"7"^^<http://www.w3.org/2001/XMLSchema#integer>']); + expect(obj('http://schema.org/ratio')).toEqual(['"1.5"^^<http://www.w3.org/2001/XMLSchema#decimal>']); + expect(obj('http://schema.org/note')).toEqual(['"plain"']); + expect(obj('http://schema.org/homepage')).toEqual(['https://example.org/p']); + expect(obj('http://schema.org/list')).toEqual(['"a"', '"b"']); + expect(obj('http://schema.org/nested')).toEqual(['"{\\"a\\":1}"']); + }); + + it('skips null/undefined values', () => { + const q2 = frontmatterQuads('urn:okf:x', { type: 'T', empty: null, gone: undefined }); + expect(q2.some((x) => x.predicate.includes('empty') || x.predicate.includes('gone'))).toBe(false); + }); +}); + +describe('quadsToNQuads', () => { + it('returns empty string for no quads and dedupes identical lines', () => { + expect(quadsToNQuads([])).toBe(''); + const dup = [ + { subject: 'urn:a', predicate: 'urn:p', object: 'urn:b' }, + { subject: 'urn:a', predicate: 'urn:p', object: 'urn:b' }, + ]; + expect(quadsToNQuads(dup)).toBe('<urn:a> <urn:p> <urn:b> .\n'); + }); + it('renders a named graph term', () => { + expect( + quadsToNQuads([{ subject: 'urn:a', predicate: 'urn:p', object: '"v"', graph: 'urn:g' }]), + ).toBe('<urn:a> <urn:p> "v" <urn:g> .\n'); + }); +}); + +describe('export of producer keys (array + index regeneration)', () => { + it('round-trips a multi-valued producer key and nested subdirectories', () => { + const files = [ + { path: 'index.md', content: '# Root\n' }, + { path: 
'a/one.md', content: '---\ntype: T\ntitle: One\nauthors:\n- X\n- Y\n---\n[two](/a/two.md)\n' }, + { path: 'a/two.md', content: '---\ntype: T\ntitle: Two\n---\nbody\n' }, + ]; + const imported = importBundle(files); + const exported = exportBundle(imported); + // a regenerated index.md exists for both root and the `a/` subdir + expect(exported.some((f) => f.path === 'index.md')).toBe(true); + expect(exported.some((f) => f.path === 'a/index.md')).toBe(true); + const re = importBundle(exported); + const one = re.concepts.find((c) => c.conceptId === 'a/one')!; + expect(one.quads.filter((x) => x.predicate === 'http://schema.org/authors').map((x) => x.object)).toEqual( + ['"X"', '"Y"'], + ); + expect(validateBundle(exported).conformant).toBe(true); + }); +}); + +describe('validation reserved-file structure (§6/§7)', () => { + it('warns when a non-root reserved file carries frontmatter', () => { + const report = validateBundle([ + { path: 'tables/index.md', content: '---\nfoo: bar\n---\n# listing' }, + { path: 'a.md', content: '---\ntype: T\n---\nbody' }, + ]); + expect(report.conformant).toBe(true); + expect(report.warnings.join(' ')).toMatch(/carries frontmatter/); + }); + it('warns when root index.md declares keys other than okf_version', () => { + const report = validateBundle([ + { path: 'index.md', content: '---\nokf_version: "0.1"\ntitle: nope\n---\n# root' }, + { path: 'a.md', content: '---\ntype: T\n---\nbody' }, + ]); + expect(report.warnings.join(' ')).toMatch(/other than okf_version/); + }); + it('reports an unparseable concept frontmatter as a hard error (§9 rule 1)', () => { + const report = validateBundle([{ path: 'a.md', content: '---\ntype: T\nno close' }]); + expect(report.conformant).toBe(false); + expect(report.errors.join(' ')).toMatch(/rule 1/); + }); + + it('warns (never errors) when a reserved file fails to parse', () => { + const report = validateBundle([ + { path: 'index.md', content: '---\nunterminated frontmatter' }, + { path: 'a.md', content: 
'---\ntype: T\n---\nbody' }, + ]); + expect(report.conformant).toBe(true); + expect(report.warnings.join(' ')).toMatch(/did not parse/); + }); +}); + +describe('robustness', () => { + it('skips an unparseable concept with a parse warning, never aborting the bundle', () => { + const r = importBundle([ + { path: 'good.md', content: '---\ntype: T\ntitle: Good\n---\nbody' }, + { path: 'bad.md', content: '---\ntype: T\nno close here' }, + ]); + expect(r.concepts.map((c) => c.conceptId)).toEqual(['good']); + expect(r.warnings.some((w) => w.code === 'parse' && w.conceptId === 'bad')).toBe(true); + }); + + it('rejects a link whose resolved segment is not a valid concept-ID segment', () => { + expect(resolveLinkTarget('/-bad.md', 'a')).toBeNull(); + expect(resolveLinkTarget('/.hidden.md', 'a')).toBeNull(); + }); +}); diff --git a/packages/okf/test/loader.test.ts b/packages/okf/test/loader.test.ts new file mode 100644 index 000000000..ce1e49f47 --- /dev/null +++ b/packages/okf/test/loader.test.ts @@ -0,0 +1,34 @@ +import { describe, it, expect, beforeAll, afterAll } from 'vitest'; +import { mkdtempSync, writeFileSync, symlinkSync, mkdirSync, rmSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { loadBundleDir, loadBundleDirWithReport } from '../src/index.js'; + +describe('loadBundleDir does not follow symlinks (no local-file exfiltration)', () => { + let dir: string; + let secret: string; + + beforeAll(() => { + dir = mkdtempSync(join(tmpdir(), 'okf-loader-')); + secret = join(dir, 'secret-outside.txt'); + writeFileSync(secret, 'SECRET-TOKEN-CONTENTS'); + mkdirSync(join(dir, 'bundle')); + writeFileSync(join(dir, 'bundle', 'ok.md'), '---\ntype: T\ntitle: ok\n---\nbody\n'); + // A bundle could ship a symlink whose target is a sensitive local file. 
+ symlinkSync(secret, join(dir, 'bundle', 'leak.md')); + }); + + afterAll(() => rmSync(dir, { recursive: true, force: true })); + + it('skips symlinked .md entries instead of slurping their target', () => { + const files = loadBundleDir(join(dir, 'bundle')); + expect(files.map((f) => f.path)).toEqual(['ok.md']); + expect(JSON.stringify(files)).not.toContain('SECRET-TOKEN-CONTENTS'); + }); + + it('reports skipped symlinks for callers that want to warn', () => { + const { files, skippedSymlinks } = loadBundleDirWithReport(join(dir, 'bundle')); + expect(files.map((f) => f.path)).toEqual(['ok.md']); + expect(skippedSymlinks).toEqual(['leak.md']); + }); +}); diff --git a/packages/okf/test/mapping.test.ts b/packages/okf/test/mapping.test.ts new file mode 100644 index 000000000..5c1a3fb7d --- /dev/null +++ b/packages/okf/test/mapping.test.ts @@ -0,0 +1,140 @@ +import { describe, it, expect } from 'vitest'; +import { + frontmatterQuads, + parseBody, + mapConcept, + parseDocument, + DKG_HAS_SECTION, + SCHEMA_NAME, + SECTION_GENID_INFIX, +} from '../src/index.js'; + +const Q = (quads: { predicate: string; object: string }[], predicate: string) => + quads.filter((q) => q.predicate === predicate).map((q) => q.object); + +describe('frontmatterQuads (locked OKF → RDF table, ADR 0005)', () => { + const iri = 'urn:okf:datasets/crypto_bitcoin'; + const fm = { + type: 'BigQuery Dataset', + resource: 'https://bigquery.googleapis.com/v2/projects/x/datasets/crypto_bitcoin', + title: 'Cryptocurrency Bitcoin', + description: 'A dataset.', + tags: ['cryptocurrency', 'bitcoin'], + timestamp: '2026-05-28T22:44:47+00:00', + }; + const quads = frontmatterQuads(iri, fm); + + it('maps type → rdf:type with PascalCased schema.org IRI', () => { + expect(Q(quads, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type')).toEqual([ + 'http://schema.org/BigQueryDataset', + ]); + }); + it('maps title → schema:name and description → schema:description', () => { + expect(Q(quads, 
'http://schema.org/name')).toEqual(['"Cryptocurrency Bitcoin"']); + expect(Q(quads, 'http://schema.org/description')).toEqual(['"A dataset."']); + }); + it('maps each tag → one schema:keywords literal', () => { + expect(Q(quads, 'http://schema.org/keywords')).toEqual(['"cryptocurrency"', '"bitcoin"']); + }); + it('maps timestamp → schema:dateModified typed xsd:dateTime', () => { + expect(Q(quads, 'http://schema.org/dateModified')).toEqual([ + '"2026-05-28T22:44:47+00:00"^^<http://www.w3.org/2001/XMLSchema#dateTime>', + ]); + }); + it('maps resource → schema:url as an IRI object', () => { + expect(Q(quads, 'http://schema.org/url')).toEqual([ + 'https://bigquery.googleapis.com/v2/projects/x/datasets/crypto_bitcoin', + ]); + }); + it('preserves producer-defined keys as camelCased schema.org predicates', () => { + const extra = frontmatterQuads(iri, { type: 'X', owner: 'Alice', 'review status': 'pending' }); + expect(Q(extra, 'http://schema.org/owner')).toEqual(['"Alice"']); + expect(Q(extra, 'http://schema.org/reviewStatus')).toEqual(['"pending"']); + }); + it('accepts a full-IRI type unchanged', () => { + const t = frontmatterQuads(iri, { type: 'https://example.org/Custom' }); + expect(Q(t, 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type')).toEqual([ + 'https://example.org/Custom', + ]); + }); +}); + +describe('parseBody (real Markdown AST)', () => { + it('detects headings, body links, code-span links and citations separately', () => { + const body = [ + 'Prose with a [real link](other.md) and an inline `[code link](coder.md)`.', + '', + '# Schema', + '', + '# Citations', + '[1] [Source](https://example.org/s)', + '- https://example.org/bare', + ].join('\n'); + const parsed = parseBody(body); + expect(parsed.headings).toEqual(['Schema', 'Citations']); + expect(parsed.bodyLinks).toEqual(['other.md']); + expect(parsed.codeSpanHrefs).toEqual(['coder.md']); + expect(parsed.citations.map((c) => c.url)).toEqual([ + 'https://example.org/s', + 'https://example.org/bare', + ]); + }); +}); + +describe('mapConcept 
section nodes are skolemized IRIs, never blank nodes', () => { + // The daemon rejects blank-node RDF objects, so `dkg:hasSection` must point at + // an absolute IRI. Sections are skolemized into deterministic concept-scoped + // `.well-known/genid/` IRIs (matching the node's own scheme). + const doc = parseDocument( + 'datasets/crypto_bitcoin.md', + '---\ntype: T\ntitle: O\n---\n\n# Schema\n\nbody\n\n# Citations\n\n[1] [x](https://e.org/x)\n', + ); + const iri = 'urn:okf:datasets/crypto_bitcoin'; + const m = mapConcept(doc, iri, () => false); + + it('emits no blank-node terms anywhere in the quads', () => { + for (const q of m.quads) { + expect(q.subject.startsWith('_:')).toBe(false); + expect(q.object.startsWith('_:')).toBe(false); + } + }); + + it('hasSection objects are deterministic concept-scoped genid IRIs', () => { + const sections = m.quads.filter((q) => q.predicate === DKG_HAS_SECTION); + expect(sections.map((q) => q.object)).toEqual([ + `${iri}${SECTION_GENID_INFIX}okfsec_datasets_crypto_bitcoin_0`, + `${iri}${SECTION_GENID_INFIX}okfsec_datasets_crypto_bitcoin_1`, + ]); + // each section node carries its heading text via schema:name + for (const s of sections) { + const name = m.quads.find((q) => q.subject === s.object && q.predicate === SCHEMA_NAME); + expect(name).toBeDefined(); + } + }); + + it('is stable across runs (same bundle ⇒ same section IRIs)', () => { + const again = mapConcept(doc, iri, () => false); + expect(again.quads).toEqual(m.quads); + }); +}); + +describe('mapConcept code-span policy', () => { + const doc = parseDocument( + 'tables/outputs.md', + '---\ntype: T\ntitle: O\n---\n\nLinked with `[transactions](transactions.md)` and `[inputs](inputs.md)`.\n', + ); + const exists = (id: string) => ['tables/transactions', 'tables/inputs'].includes(id); + + it('does NOT treat code-span links as edges by default (CommonMark)', () => { + const m = mapConcept(doc, 'urn:okf:tables/outputs', exists); + expect(m.resolvedLinks).toEqual([]); + 
expect(m.codeSpanLinks.map((l) => l.raw)).toEqual(['transactions.md', 'inputs.md']); + }); + it('treats them as edges when includeCodeSpanLinks is set', () => { + const m = mapConcept(doc, 'urn:okf:tables/outputs', exists, { includeCodeSpanLinks: true }); + expect(m.resolvedLinks.map((l) => l.targetConceptId).sort()).toEqual([ + 'tables/inputs', + 'tables/transactions', + ]); + }); +}); diff --git a/packages/okf/test/paths.test.ts b/packages/okf/test/paths.test.ts new file mode 100644 index 000000000..a19cb778d --- /dev/null +++ b/packages/okf/test/paths.test.ts @@ -0,0 +1,106 @@ +import { describe, it, expect } from 'vitest'; +import { + resolveLinkTarget, + isValidSegment, + isReservedFile, + isConceptFile, + pathToConceptId, + conceptIdToIri, + conceptIdToKaName, +} from '../src/index.js'; + +describe('isValidSegment (agrees with paths.py _SEGMENT_RE)', () => { + it('accepts alphanumeric / underscore starts with dot/hyphen inside', () => { + expect(isValidSegment('crypto_bitcoin')).toBe(true); + expect(isValidSegment('blocks')).toBe(true); + expect(isValidSegment('a.b-c_d')).toBe(true); + expect(isValidSegment('_hidden')).toBe(true); + }); + it('rejects leading dot/hyphen/slash and empties', () => { + expect(isValidSegment('.hidden')).toBe(false); + expect(isValidSegment('-dash')).toBe(false); + expect(isValidSegment('')).toBe(false); + expect(isValidSegment('a/b')).toBe(false); + }); +}); + +describe('reserved / concept classification', () => { + it('flags index.md and log.md at any depth as reserved', () => { + expect(isReservedFile('index.md')).toBe(true); + expect(isReservedFile('tables/index.md')).toBe(true); + expect(isReservedFile('a/b/log.md')).toBe(true); + expect(isReservedFile('tables/blocks.md')).toBe(false); + }); + it('treats non-reserved .md as concepts', () => { + expect(isConceptFile('tables/blocks.md')).toBe(true); + expect(isConceptFile('index.md')).toBe(false); + expect(isConceptFile('viz.html')).toBe(false); + }); +}); + 
+describe('pathToConceptId / conceptIdToIri', () => { + it('strips .md and joins POSIX segments', () => { + expect(pathToConceptId('tables/blocks.md')).toBe('tables/blocks'); + expect(pathToConceptId('datasets/crypto_bitcoin.md')).toBe('datasets/crypto_bitcoin'); + }); + it('derives a deterministic IRI from the concept ID', () => { + expect(conceptIdToIri('tables/blocks')).toBe('urn:okf:tables/blocks'); + expect(conceptIdToIri('tables/blocks', 'https://x/')).toBe('https://x/tables/blocks'); + }); +}); + +describe('conceptIdToKaName (node asset names cannot contain "/")', () => { + it('encodes path separators and underscores deterministically (hex escapes)', () => { + expect(conceptIdToKaName('tables/transactions')).toBe('tables_2ftransactions'); + expect(conceptIdToKaName('datasets/crypto_bitcoin')).toBe('datasets_2fcrypto_5fbitcoin'); + expect(conceptIdToKaName('flat')).toBe('flat'); + expect(conceptIdToKaName('a/b/c')).toBe('a_2fb_2fc'); + }); + it('never produces a name containing a slash', () => { + for (const id of ['tables/blocks', 'a/b/c', 'datasets/crypto_bitcoin', 'x']) { + expect(conceptIdToKaName(id)).not.toContain('/'); + } + }); + it('is INJECTIVE: "a/b" and the literal concept "a__b" do NOT collide', () => { + // The naive `/`→`__` mapping collapsed these onto one node KA name. + expect(conceptIdToKaName('a/b')).not.toBe(conceptIdToKaName('a__b')); + // Spot-check a few more would-be collisions. 
+ const ids = ['a/b', 'a__b', 'a_b', 'a/b/c', 'a__b__c', 'tables/x', 'tables__x']; + expect(new Set(ids.map(conceptIdToKaName)).size).toBe(ids.length); + }); +}); + +describe('resolveLinkTarget — all OKF §5 link forms', () => { + // from concept `tables/transactions` (dir = tables/) + it('bare-sibling', () => { + expect(resolveLinkTarget('blocks.md', 'tables/transactions')).toBe('tables/blocks'); + }); + it('parent-relative', () => { + expect(resolveLinkTarget('../datasets/crypto_bitcoin.md', 'tables/transactions')).toBe( + 'datasets/crypto_bitcoin', + ); + }); + it('absolute bundle-relative', () => { + expect(resolveLinkTarget('/tables/blocks.md', 'tables/transactions')).toBe('tables/blocks'); + }); + it('explicit ./relative', () => { + expect(resolveLinkTarget('./inputs.md', 'tables/transactions')).toBe('tables/inputs'); + }); + it('extension-less', () => { + expect(resolveLinkTarget('blocks', 'tables/transactions')).toBe('tables/blocks'); + expect(resolveLinkTarget('../datasets/crypto_bitcoin', 'tables/transactions')).toBe( + 'datasets/crypto_bitcoin', + ); + }); + it('strips #anchor and ?query', () => { + expect(resolveLinkTarget('blocks.md#schema', 'tables/transactions')).toBe('tables/blocks'); + expect(resolveLinkTarget('blocks.md?x=1', 'tables/transactions')).toBe('tables/blocks'); + }); + it('returns null for external URLs, pure anchors, and root escapes', () => { + expect(resolveLinkTarget('https://example.org/x', 'tables/transactions')).toBeNull(); + expect(resolveLinkTarget('mailto:a@b.c', 'tables/transactions')).toBeNull(); + expect(resolveLinkTarget('#section', 'tables/transactions')).toBeNull(); + expect(resolveLinkTarget('../../escapes.md', 'tables/transactions')).toBeNull(); + expect(resolveLinkTarget('dir/', 'tables/transactions')).toBeNull(); + }); +}); diff --git a/packages/okf/test/roundtrip.test.ts b/packages/okf/test/roundtrip.test.ts new file mode 100644 index 000000000..218a3581a --- /dev/null +++ b/packages/okf/test/roundtrip.test.ts @@ 
-0,0 +1,99 @@ +import { describe, it, expect } from 'vitest'; +import { fileURLToPath } from 'node:url'; +import { + importBundle, + exportBundle, + loadBundleDir, + quadsToNQuads, + validateBundle, + DKG_HAS_SECTION, + SECTION_GENID_INFIX, +} from '../src/index.js'; +import type { Quad } from '../src/index.js'; + +const files = loadBundleDir(fileURLToPath(new URL('./fixtures/crypto_bitcoin', import.meta.url))); + +/** + * The semantic graph: concept-subject quads minus presentational `dkg:hasSection` + * structure (the `hasSection` edges and the skolemized section nodes). Round-trip + * equivalence is asserted over this projection (export is graph-faithful, not + * byte-faithful — see export.ts). + */ +function semantic(quads: Quad[]): string { + return quadsToNQuads( + quads.filter( + (q) => + !q.subject.includes(SECTION_GENID_INFIX) && + q.predicate !== DKG_HAS_SECTION, + ), + ); +} + +describe('round-trip: import → export → import (§4.2)', () => { + const first = importBundle(files); + const exported = exportBundle(first); + const second = importBundle(exported); + + it('reproduces an equivalent semantic graph', () => { + expect(semantic(second.quads)).toBe(semantic(first.quads)); + }); + + it('reproduces the same 5 concepts and reconstructed edges', () => { + expect(second.concepts.map((c) => c.conceptId).sort()).toEqual( + first.concepts.map((c) => c.conceptId).sort(), + ); + const edgesOf = (r: typeof first, id: string) => + [...new Set(r.concepts.find((c) => c.conceptId === id)!.resolvedLinks.map((l) => l.targetConceptId))].sort(); + for (const id of first.concepts.map((c) => c.conceptId)) { + expect(edgesOf(second, id)).toEqual(edgesOf(first, id)); + } + }); + + it('produces a §9-conformant bundle', () => { + expect(validateBundle(exported).conformant).toBe(true); + }); +}); + +describe('round-trip preserves typed producer-key scalars (export fidelity)', () => { + // Producer-defined numeric/boolean keys must keep their RDF datatype across + // import → 
export → import; otherwise `count: 3` degrades to a string literal. + const bundle = [ + { + path: 'm.md', + content: + '---\n' + + 'type: Metric\n' + + 'title: M\n' + + 'count: 3\n' + + 'ratio: 1.5\n' + + 'active: true\n' + + 'archived: false\n' + + '---\n\nbody\n', + }, + ]; + + const first = importBundle(bundle); + const second = importBundle(exportBundle(first)); + + const objFor = (r: typeof first, predicate: string) => + r.concepts[0].quads.find((q) => q.predicate === predicate)?.object; + + it('keeps xsd:integer / xsd:decimal / xsd:boolean datatypes', () => { + expect(objFor(second, 'http://schema.org/count')).toBe( + '"3"^^<http://www.w3.org/2001/XMLSchema#integer>', + ); + expect(objFor(second, 'http://schema.org/ratio')).toBe( + '"1.5"^^<http://www.w3.org/2001/XMLSchema#decimal>', + ); + expect(objFor(second, 'http://schema.org/active')).toBe( + '"true"^^<http://www.w3.org/2001/XMLSchema#boolean>', + ); + expect(objFor(second, 'http://schema.org/archived')).toBe( + '"false"^^<http://www.w3.org/2001/XMLSchema#boolean>', + ); + // and they match the first import exactly + for (const p of ['count', 'ratio', 'active', 'archived']) { + expect(objFor(second, `http://schema.org/${p}`)).toBe(objFor(first, `http://schema.org/${p}`)); + } + }); +}); diff --git a/packages/okf/tsconfig.json b/packages/okf/tsconfig.json new file mode 100644 index 000000000..d231bbc57 --- /dev/null +++ b/packages/okf/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "dist", + "rootDir": "src", + "composite": true + }, + "include": ["src"] +} diff --git a/packages/okf/vitest.config.ts b/packages/okf/vitest.config.ts new file mode 100644 index 000000000..c74a90d54 --- /dev/null +++ b/packages/okf/vitest.config.ts @@ -0,0 +1,17 @@ +import { defineConfig } from 'vitest/config'; +import { kosavaOkfCoverage } from '../../vitest.coverage'; + +export default defineConfig({ + test: { + include: ['test/**/*.test.ts'], + coverage: { + provider: 'v8', + reporter: ['text', 'html', 'lcov', 'json-summary'], + reportsDirectory: './coverage', + include: ['src/**/*.ts'], + // index.ts is a pure re-export 
barrel; types.ts is interfaces only. + exclude: ['src/index.ts', 'src/types.ts'], + thresholds: kosavaOkfCoverage, + }, + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0ea1e86a2..74d71230d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -362,6 +362,9 @@ importers: '@origintrail-official/dkg-node-ui': specifier: workspace:* version: link:../node-ui + '@origintrail-official/dkg-okf': + specifier: workspace:* + version: link:../okf '@origintrail-official/dkg-publisher': specifier: workspace:* version: link:../publisher @@ -840,6 +843,31 @@ importers: specifier: ^4.0.18 version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + packages/okf: + dependencies: + js-yaml: + specifier: ^4.1.1 + version: 4.1.1 + mdast-util-from-markdown: + specifier: ^2.0.3 + version: 2.0.3 + mdast-util-to-string: + specifier: ^4.0.0 + version: 4.0.0 + devDependencies: + '@types/js-yaml': + specifier: ^4.0.9 + version: 4.0.9 + '@types/mdast': + specifier: ^4.0.4 + version: 4.0.4 + '@vitest/coverage-v8': + specifier: ^4.0.18 + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + vitest: + specifier: ^4.0.18 + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + packages/publisher: dependencies: '@multiformats/multiaddr': diff --git a/vitest.coverage.ts b/vitest.coverage.ts index adc47a416..c903ca647 100644 --- a/vitest.coverage.ts +++ b/vitest.coverage.ts @@ -145,6 +145,13 @@ export const kosavaEpcisCoverage: CoverageThresholds = { statements: 97, }; +export const kosavaOkfCoverage: CoverageThresholds = { + lines: 90, + functions: 90, + branches: 85, + statements: 90, +}; + /** * @deprecated Import a tier-specific export (e.g. 
`kosavaNodeUiCoverage`). * Kept for any external tooling that still references the old name. From b0373a9fdbd7762fb9b0374a1f7b6871e193ce43 Mon Sep 17 00:00:00 2001 From: Branimir Rakic Date: Tue, 30 Jun 2026 21:45:22 +0200 Subject: [PATCH 2/3] test(okf): assert exact bigquery citation URL (fix CodeQL incomplete-url-sanitization) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeQL js/incomplete-url-substring-sanitization (high) flagged `q.object.startsWith('https://bigquery.googleapis.com')` in bundle.test.ts — a host-prefix check an arbitrary host could follow. Assert the full citation URL instead; CodeQL-clean and a stronger assertion. No production code involved. Co-Authored-By: Claude Opus 4.8 (1M context) --- packages/okf/test/bundle.test.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/okf/test/bundle.test.ts b/packages/okf/test/bundle.test.ts index 7b6a27816..c4a12947d 100644 --- a/packages/okf/test/bundle.test.ts +++ b/packages/okf/test/bundle.test.ts @@ -110,7 +110,9 @@ describe('crypto_bitcoin golden import (§4.1)', () => { ds.quads.some( (q) => q.predicate === 'http://schema.org/citation' && - q.object.startsWith('https://bigquery.googleapis.com'), + // exact URL (not a host-prefix startsWith — that trips CodeQL's incomplete + // URL-sanitization rule and is a weaker check anyway). 
+ q.object === 'https://bigquery.googleapis.com/v2/projects/bigquery-public-data/datasets/crypto_bitcoin', ), ).toBe(true); }); From 0b997e425eb34f079a96420315ec8e5d81fa4f71 Mon Sep 17 00:00:00 2001 From: Branimir Rakic Date: Tue, 30 Jun 2026 21:58:55 +0200 Subject: [PATCH 3/3] =?UTF-8?q?fix(okf):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20build-graph=20wiring,=20--sub-graph-name,=20concept?= =?UTF-8?q?-path=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the otReviewAgent findings on #1388: 🔴 dkg-okf was missing from the runtime build graph. The CLI imports it at startup, so a release/auto-update build (build:runtime:packages) would omit packages/okf/dist and the CLI could fail to start for ANY command. Added dkg-okf to build:runtime:packages and to the cli tsconfig references. 🔴 --private bulk import silently dropped --sub-graph-name — data landed in the root CG SWM instead of the requested sub-graph, breaking isolation + scoped export/verify. ApiClient.sharedMemoryWrite now serializes subGraphName; the private path forwards it; the resumability manifest records it (so a resume can't mix root/sub-graph). +test. 🔴 Invalid concept file paths were minted into RDF subjects — `bad name.md` → `urn:okf:bad name` (a space-bearing IRI → invalid N-Quads / write failures). validateBundle now hard-errors invalid concept path segments, and importBundle skips them with an `invalid-path` warning instead of emitting malformed RDF. +test. 🟡 Added a test for the existing-PUBLIC-CG --private refusal (+ --allow-public-context-graph override) — previously only new-private-CG creation was covered. 🔵 Removed the local RDF_TYPE that shadowed the imported constant. okf 78 / okf-subcommands 14 green; build:packages 21/21; build:runtime:packages includes okf. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- package.json | 2 +- packages/cli/src/api-client.ts | 8 +++- packages/cli/src/commands/okf.ts | 12 ++++-- packages/cli/test/okf-subcommands.test.ts | 51 +++++++++++++++++++++++ packages/cli/tsconfig.json | 1 + packages/okf/src/bundle.ts | 12 ++++++ packages/okf/src/types.ts | 2 +- packages/okf/src/validation.ts | 8 +++- packages/okf/test/edge-cases.test.ts | 23 ++++++++++ 9 files changed, 110 insertions(+), 9 deletions(-) diff --git a/package.json b/package.json index 7792d3e97..c75129cb6 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,7 @@ "scripts": { "build": "node scripts/build.mjs", "build:packages": "turbo build", - "build:runtime:packages": "pnpm -r --filter @origintrail-official/dkg-core --filter @origintrail-official/dkg-storage --filter @origintrail-official/dkg-query --filter @origintrail-official/dkg-publisher --filter @origintrail-official/dkg-chain --filter @origintrail-official/dkg-epcis --filter @origintrail-official/dkg-random-sampling --filter @origintrail-official/dkg-agent --filter @origintrail-official/dkg-graph-viz --filter @origintrail-official/dkg-node-ui --filter @origintrail-official/dkg-adapter-openclaw --filter @origintrail-official/dkg-adapter-hermes --filter @origintrail-official/kafka-plugin --filter @origintrail-official/dkg run build", + "build:runtime:packages": "pnpm -r --filter @origintrail-official/dkg-core --filter @origintrail-official/dkg-storage --filter @origintrail-official/dkg-query --filter @origintrail-official/dkg-publisher --filter @origintrail-official/dkg-chain --filter @origintrail-official/dkg-epcis --filter @origintrail-official/dkg-okf --filter @origintrail-official/dkg-random-sampling --filter @origintrail-official/dkg-agent --filter @origintrail-official/dkg-graph-viz --filter @origintrail-official/dkg-node-ui --filter @origintrail-official/dkg-adapter-openclaw --filter @origintrail-official/dkg-adapter-hermes --filter @origintrail-official/kafka-plugin 
--filter @origintrail-official/dkg run build", "build:runtime": "pnpm run build:runtime:packages && pnpm --filter @origintrail-official/dkg-node-ui run build:ui", "test": "turbo test", "test:watch": "vitest --config vitest.config.ts", diff --git a/packages/cli/src/api-client.ts b/packages/cli/src/api-client.ts index 6021ce376..1c79e36c1 100644 --- a/packages/cli/src/api-client.ts +++ b/packages/cli/src/api-client.ts @@ -502,14 +502,18 @@ export class ApiClient { */ async sharedMemoryWrite(contextGraphId: string, quads: Array<{ subject: string; predicate: string; object: string; graph: string; - }>): Promise<{ + }>, subGraphName?: string): Promise<{ shareOperationId: string; contextGraphId: string; graph: string; triplesWritten: number; skolemizedBlankNodes?: number; }> { - return this.post('/api/shared-memory/write', { contextGraphId, quads }); + return this.post('/api/shared-memory/write', { + contextGraphId, + quads, + ...(subGraphName ? { subGraphName } : {}), + }); } /** diff --git a/packages/cli/src/commands/okf.ts b/packages/cli/src/commands/okf.ts index 5b022cb1b..362ba4f9c 100644 --- a/packages/cli/src/commands/okf.ts +++ b/packages/cli/src/commands/okf.ts @@ -345,12 +345,16 @@ export function registerOkfCommand(program: Command): void { try { const prev = JSON.parse(await readFile(manifestPath, 'utf-8')) as { contextGraphId?: string; + subGraphName?: string; mode?: string; chunkSize?: number; chunksDone?: number; }; if ( prev.contextGraphId === contextGraphId && + // resume only within the SAME target sub-graph (undefined == root) — + // never carry a root manifest into a --sub-graph-name run or vice versa. + prev.subGraphName === subGraphName && prev.mode === 'bulk-private-swm' && prev.chunkSize === CHUNK && typeof prev.chunksDone === 'number' @@ -371,7 +375,7 @@ export function registerOkfCommand(program: Command): void { // down to a floor, so a too-big batch degrades instead of failing. 
const writeSlice = async (slice: typeof allQuads): Promise<void> => { try { - const res = await client.sharedMemoryWrite(contextGraphId, slice); + const res = await client.sharedMemoryWrite(contextGraphId, slice, subGraphName); triplesWritten += res.triplesWritten ?? slice.length; skolemized += res.skolemizedBlankNodes ?? 0; } catch (e) { @@ -396,7 +400,7 @@ export function registerOkfCommand(program: Command): void { await writeFile( manifestPath, JSON.stringify( - { contextGraphId, mode: 'bulk-private-swm', chunkSize: CHUNK, chunksDone, totalChunks }, + { contextGraphId, ...(subGraphName ? { subGraphName } : {}), mode: 'bulk-private-swm', chunkSize: CHUNK, chunksDone, totalChunks }, null, 2, ), @@ -692,10 +696,10 @@ export function registerOkfCommand(program: Command): void { // non-zero on any shortfall so it can gate a pipeline. The fix for a // shortfall is to re-run the same idempotent `import --private` (a second // loose-write pass; the store dedupes, so only the dropped triples land). - const RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'; // Predicates we treat as integrity signals (order = report order). 
const INTEGRITY_PREDICATES = [ - RDF_TYPE, + RDF_TYPE, // imported from @origintrail-official/dkg-okf — single source of truth + 'http://schema.org/source', 'http://schema.org/license', 'http://schema.org/citation', diff --git a/packages/cli/test/okf-subcommands.test.ts b/packages/cli/test/okf-subcommands.test.ts index 5a6bf3182..e7c349587 100644 --- a/packages/cli/test/okf-subcommands.test.ts +++ b/packages/cli/test/okf-subcommands.test.ts @@ -288,6 +288,57 @@ describe.sequential('dkg okf subcommands', { timeout: 120_000 }, () => { expect(manifest.chunksDone).toBe(manifest.totalChunks); }); + it('--private forwards --sub-graph-name into the shared-memory write body + manifest', async () => { + clear(); + const bundle = await makeBundle(); + const r = await runCli( + ['okf', 'import', bundle, '--context-graph-id', 'cg-sg', '--sub-graph-name', 'team', '--private', '--create-context-graph'], + env(), + ); + expect(r.exitCode).toBe(0); + // Every bulk write must carry the sub-graph so data lands in cg/team SWM, not root. + const writes = stub.calls.filter((c) => c.method === 'POST' && c.url.split('?')[0] === '/api/shared-memory/write'); + expect(writes.length).toBeGreaterThan(0); + for (const w of writes) expect(JSON.parse(w.body || '{}').subGraphName).toBe('team'); + // The resumability manifest records the sub-graph (so a resume can't mix root/sub-graph). 
+ const manifest = JSON.parse(await readFile(join(bundle, '.okf-import-manifest.json'), 'utf-8')); + expect(manifest.subGraphName).toBe('team'); + }); + + it('refuses --private into an existing PUBLIC Context Graph; --allow-public-context-graph overrides', async () => { + clear(); + const bundle = await makeBundle(); + const publicCgHandler: StubHandler = (req, raw) => { + const url = new URL(`http://127.0.0.1${req.url}`); + const path = url.pathname; + if (req.method === 'GET' && path === '/api/context-graph/exists') { + return { status: 200, body: { id: url.searchParams.get('id'), exists: true } }; + } + if (req.method === 'GET' && path === '/api/context-graph/list') { + return { status: 200, body: { contextGraphs: [{ id: 'cg-pub', accessPolicy: 'public' }] } }; + } + return okfDaemonHandler(createdCGs)(req, raw); // everything else succeeds + }; + stub.setHandler(publicCgHandler); + + // Refusal: non-zero exit, and NO bulk write happened. + const refused = await runCli(['okf', 'import', bundle, '--context-graph-id', 'cg-pub', '--private'], env()); + expect(refused.exitCode).not.toBe(0); + expect(stub.calls.some((c) => c.url.split('?')[0] === '/api/shared-memory/write')).toBe(false); + expect(refused.stderr).toMatch(/Refusing --private/); + + // Override: --allow-public-context-graph proceeds and writes. 
+ const before = stub.calls.length; + const allowed = await runCli( + ['okf', 'import', bundle, '--context-graph-id', 'cg-pub', '--private', '--allow-public-context-graph'], + env(), + ); + expect(allowed.exitCode).toBe(0); + expect(stub.calls.slice(before).some((c) => c.url.split('?')[0] === '/api/shared-memory/write')).toBe(true); + + stub.setHandler(okfDaemonHandler(createdCGs)); + }); + it('export filters skolemized section nodes (no .well-known/genid files)', async () => { clear(); const outDir = await mkdtemp(join(tmpdir(), 'okf-export-')); diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index e1baaa902..0544abb1b 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -10,6 +10,7 @@ { "path": "../core" }, { "path": "../agent" }, { "path": "../epcis" }, + { "path": "../okf" }, { "path": "../node-ui" }, { "path": "../adapter-openclaw" }, { "path": "../mcp-dkg" } diff --git a/packages/okf/src/bundle.ts b/packages/okf/src/bundle.ts index e8caeb93b..7ec6dcb73 100644 --- a/packages/okf/src/bundle.ts +++ b/packages/okf/src/bundle.ts @@ -17,6 +17,7 @@ import { mapConcept } from './mapping.js'; import { isConceptFile, isReservedFile, + isValidSegment, pathToConceptId, conceptIdToIri, } from './paths.js'; @@ -63,6 +64,17 @@ export function importBundle(files: BundleFile[], opts: OkfMappingOptions = {}): }); continue; } + // A concept whose path has a non-OKF segment would mint a malformed subject IRI + // (e.g. `bad name.md` → `urn:okf:bad name`, a space-bearing IRI that breaks + // N-Quads / node writes). Skip it with a warning rather than emit bad RDF. 
+ if (!doc.conceptId.split('/').every(isValidSegment)) { + warnings.push({ + conceptId: doc.conceptId, + code: 'invalid-path', + message: `invalid concept path segment(s) — must match [A-Za-z0-9_][A-Za-z0-9_.-]* (§2); skipped`, + }); + continue; + } docs.push(doc); iriByConceptId[doc.conceptId] = conceptIdToIri(doc.conceptId, iriBase); typeByConceptId[doc.conceptId] = diff --git a/packages/okf/src/types.ts b/packages/okf/src/types.ts index 6eddb1bed..df290e017 100644 --- a/packages/okf/src/types.ts +++ b/packages/okf/src/types.ts @@ -106,7 +106,7 @@ export interface ConceptMapping { /** A non-fatal diagnostic surfaced during a bundle import. */ export interface OkfWarning { conceptId?: string; - code: 'broken-link' | 'code-span-link' | 'missing-type' | 'reserved-skip' | 'parse'; + code: 'broken-link' | 'code-span-link' | 'missing-type' | 'reserved-skip' | 'parse' | 'invalid-path'; message: string; } diff --git a/packages/okf/src/validation.ts b/packages/okf/src/validation.ts index ad9fccd1b..9d93b0c9d 100644 --- a/packages/okf/src/validation.ts +++ b/packages/okf/src/validation.ts @@ -14,7 +14,7 @@ */ import { parseDocument, OkfDocumentError } from './document.js'; -import { isConceptFile, isReservedFile, basename, pathToConceptId } from './paths.js'; +import { isConceptFile, isReservedFile, isValidSegment, basename, pathToConceptId } from './paths.js'; import type { BundleFile, ConformanceReport } from './types.js'; export function validateBundle(files: BundleFile[]): ConformanceReport { @@ -60,6 +60,12 @@ export function validateBundle(files: BundleFile[]): ConformanceReport { conceptCount += 1; const conceptId = pathToConceptId(f.path); + // Every concept path segment must be a valid OKF segment, or the derived subject + // IRI (`${iriBase}${conceptId}`) is malformed (e.g. `urn:okf:bad name`). The + // mapper already enforces this for link targets; enforce it for concept files too. 
+ if (!conceptId.split('/').every(isValidSegment)) { + errors.push(`${conceptId}: invalid concept path segment(s) — must match [A-Za-z0-9_][A-Za-z0-9_.-]* (§2)`); + } let frontmatter: Record; try { frontmatter = parseDocument(f.path, f.content).frontmatter; diff --git a/packages/okf/test/edge-cases.test.ts b/packages/okf/test/edge-cases.test.ts index c8a554550..f98c1dd80 100644 --- a/packages/okf/test/edge-cases.test.ts +++ b/packages/okf/test/edge-cases.test.ts @@ -82,4 +82,27 @@ describe('§9 conformance validation', () => { ]); expect(report.conformant).toBe(true); }); + + it('rejects concept paths with invalid segments (would mint a malformed subject IRI)', () => { + const files = [ + { path: 'index.md', content: '# Root\n' }, + { path: 'bad name.md', content: '---\ntype: T\ntitle: Spacey\n---\nbody' }, // space in segment + { path: '-bad.md', content: '---\ntype: T\ntitle: Dashy\n---\nbody' }, // leading dash + { path: 'ok/good.md', content: '---\ntype: T\ntitle: Good\n---\nbody' }, // valid + ]; + // validateBundle hard-errors the invalid paths… + const report = validateBundle(files); + expect(report.conformant).toBe(false); + expect(report.errors.join('\n')).toMatch(/bad name: invalid concept path/); + expect(report.errors.join('\n')).toMatch(/-bad: invalid concept path/); + + // …and importBundle skips them with an `invalid-path` warning, never minting a + // malformed IRI; the valid concept still imports. + const r = importBundle(files); + expect(r.warnings.filter((w) => w.code === 'invalid-path').map((w) => w.conceptId).sort()).toEqual(['-bad', 'bad name']); + expect(r.concepts.map((c) => c.conceptId)).toContain('ok/good'); + expect(r.concepts.some((c) => c.conceptId === 'bad name' || c.conceptId === '-bad')).toBe(false); + // no emitted subject IRI contains a space (the malformed-IRI symptom) + expect(r.quads.every((q) => !q.subject.includes(' '))).toBe(true); + }); });