Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6cc97d0
feat(telemetry): opt-in OTLP log collection for core nodes (PoC)
Bojan131 Jun 24, 2026
ab56bf7
feat(telemetry): per-node Grafana selection + dashboard
Bojan131 Jun 24, 2026
4cdeef6
feat(telemetry): production ingest path for existing Loki (Alloy bridge)
Bojan131 Jun 24, 2026
242c54f
docs(telemetry): production deploy bundle + manager handoff
Bojan131 Jun 24, 2026
a3c5257
docs(telemetry): exact Cloudflare WAF expression + real-Grafana verif…
Bojan131 Jun 24, 2026
9a76a4a
feat(telemetry): fleet dashboard + operator guide + example alerts
Bojan131 Jun 24, 2026
d9b557b
chore: drop accidentally-committed localhost hardhat deploy artifacts
Bojan131 Jun 24, 2026
10da035
chore: restore tracked localhost_contracts.json (only the untracked l…
Bojan131 Jun 24, 2026
a30318d
feat(telemetry): OTel SDK foundation — traces+metrics providers, log …
Bojan131 Jun 25, 2026
80e536d
feat(telemetry): instrument spans + metrics across agent/publisher/ch…
Bojan131 Jun 25, 2026
a841d00
chore(telemetry): collector traces+metrics pipelines + mock OTLP coll…
Bojan131 Jun 25, 2026
2aa31f7
feat(telemetry): add protocol_router.send span + P2P send-duration me…
Bojan131 Jun 25, 2026
414936b
fix(telemetry): address bug-bot review (1 bug + 3 issues + 2 nits)
Bojan131 Jun 25, 2026
9f3625d
Merge remote-tracking branch 'origin/main' into feat/core-node-log-co…
Bojan131 Jun 25, 2026
f78d06f
fix(telemetry): two CI regressions from instrumentation
Bojan131 Jun 25, 2026
908e298
Merge remote-tracking branch 'origin/main' into feat/core-node-log-co…
Bojan131 Jun 26, 2026
0bfdadd
chore: drop accidentally-committed localhost Hardhat deploy artifacts…
Bojan131 Jun 26, 2026
6a8c971
feat(telemetry): address #1317 review — unify log resource attrs + fu…
Bojan131 Jun 26, 2026
66ef5b7
Merge remote-tracking branch 'origin/main' into feat/core-node-log-co…
Bojan131 Jun 26, 2026
2771d2e
fix(telemetry): tie OTel traces/metrics to the master gate + prove in…
Bojan131 Jun 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 39 additions & 4 deletions packages/cli/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -538,8 +538,38 @@ export interface DkgConfig {
* on first start and stored in `<DKG_HOME>/auth.token`.
*/
auth?: { enabled?: boolean; tokens?: string[] };
/** Opt-in telemetry streaming to central network dashboard. */
telemetry?: { enabled?: boolean };
/**
* Opt-in telemetry streaming to a central network dashboard.
* `enabled` is the master gate: when false, NOTHING is forwarded off the
* node (local logging — SQLite + daemon.log — is always on regardless).
*/
telemetry?: {
enabled?: boolean;
/**
* Remote log forwarding (opt-in). Active only when `enabled` is true.
*/
logs?: {
/**
* Outbound transport for logs. 'none' = local only; 'otlp' = OTLP/HTTP
* to an OpenTelemetry collector; 'syslog' = legacy RFC 5424 → Graylog.
* Defaults to 'syslog' when unset (preserves prior behaviour).
*/
exporter?: 'none' | 'otlp' | 'syslog';
/**
* OTLP/HTTP logs endpoint, e.g. http://localhost:4318/v1/logs. Falls
* back to the per-network default (TELEMETRY_ENDPOINTS[network].otlpLogs).
*/
endpoint?: string;
/** Bearer credential for the operator's collector. Treated as a secret. */
token?: string;
/** Minimum level forwarded remotely. Local sink keeps everything. Default 'info'. */
level?: 'debug' | 'info' | 'warn' | 'error';
/** Extra sensitive key names to redact from messages before they leave the node. */
redact?: string[];
/** Bounded in-memory buffer; drop-oldest on overflow. Default 500. */
bufferMaxEntries?: number;
};
};
/** Shared memory (workspace) data TTL in milliseconds. Default: 30 days (2592000000). Set to 0 to disable cleanup. */
sharedMemoryTtlMs?: number;
/** @deprecated Legacy alias for sharedMemoryTtlMs */
Expand Down Expand Up @@ -736,14 +766,19 @@ export interface DkgConfig {
* Nodes resolve the correct endpoints from the network they're on.
* Operators only see a single toggle — no endpoint configuration.
*/
export const TELEMETRY_ENDPOINTS: Record<string, { syslog: { host: string; port: number }; otlp: string }> = {
export const TELEMETRY_ENDPOINTS: Record<
string,
{ syslog: { host: string; port: number }; otlp: string; otlpLogs?: string }
> = {
testnet: {
syslog: { host: 'loggly.origin-trail.network', port: 12201 },
otlp: 'https://telemetry-testnet.origintrail.io/v1/metrics',
otlpLogs: 'https://telemetry-testnet.origintrail.io/v1/logs', // OriginTrail-hosted opt-in collector (TBD)
},
mainnet: {
syslog: { host: 'loggly.origin-trail.network', port: 0 }, // TODO: assign mainnet syslog port
syslog: { host: 'loggly.origin-trail.network', port: 0 }, // legacy syslog — OTLP is the mainnet path
otlp: 'https://telemetry.origintrail.io/v1/metrics',
otlpLogs: 'https://telemetry.origintrail.io/v1/logs', // OriginTrail-hosted opt-in collector (TBD)
},
};

Expand Down
73 changes: 68 additions & 5 deletions packages/cli/src/daemon/lifecycle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ import {
} from '@origintrail-official/dkg-chain';
import { DKGAgent, loadOpWallets, KaNumberAllocator } from '@origintrail-official/dkg-agent';
import { isExternalBackend } from '@origintrail-official/dkg-storage';
import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, sparqlIri, contextGraphSharedMemoryUri, contextGraphAssertionUri, contextGraphMetaUri, DEFAULT_PROTOCOL_OUTBOX_BACKOFFS_MS, DEFAULT_PROTOCOL_OUTBOX_MAX_AGE_MS, pickNetworkTunables } from '@origintrail-official/dkg-core';
import { computeNetworkId, createOperationContext, createLogRedactor, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, sparqlIri, contextGraphSharedMemoryUri, contextGraphAssertionUri, contextGraphMetaUri, DEFAULT_PROTOCOL_OUTBOX_BACKOFFS_MS, DEFAULT_PROTOCOL_OUTBOX_MAX_AGE_MS, pickNetworkTunables } from '@origintrail-official/dkg-core';
import { findReservedSubjectPrefix, isSkolemizedUri } from '@origintrail-official/dkg-publisher';
import {
DashboardDB,
Expand All @@ -79,6 +79,7 @@ import {
handleNodeUIRequest,
ChatMemoryManager,
LogPushWorker,
OtlpLogWorker,
LlmClient,
SqliteMessageIdempotencyStore,
SqliteProtocolOutboxStore,
Expand Down Expand Up @@ -1966,6 +1967,11 @@ export async function runDaemonInner(
chatDb = dashDb;
log("Dashboard DB initialized at " + join(dkgDir(), "node-ui.db"));

// Redactor for the copy of each record that LEAVES the node. The local
// dashboard DB keeps full-fidelity records (it's the operator's own machine);
// redaction only protects data crossing the trust boundary to a collector.
const redactForRemote = createLogRedactor(config.telemetry?.logs?.redact);

Logger.setSink((entry) => {
try {
dashDb.insertLog({
Expand All @@ -1979,7 +1985,12 @@ export async function runDaemonInner(
} catch {
/* DB write must never break the node */
}
logPusher?.push(entry);
// Fan out a single redacted copy to every active remote shipper.
if (logPusher || otlpExporter) {
const safe = redactForRemote(entry);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Remote log redaction wiring is not verified

What's wrong
The PR tests the redaction helper and the OTLP worker separately, but not the daemon wiring that decides which copy leaves the node. This is the privacy-critical behavior: local logs should remain full-fidelity, while syslog/OTLP receive only the redacted record.

Example
A future edit that accidentally changes the fan-out to otlpExporter?.push(entry) would leak operationalWalletPrivateKey=... remotely. The redactor tests and OtlpLogWorker tests would still pass because neither test proves Logger.setSink sends the redacted copy to the remote shippers while keeping the local DB record unchanged.

Suggested direction
Add a lifecycle-level or extracted-helper test that exercises the actual Logger.setSink fan-out path with sensitive log content and configured telemetry modes.

For Agents
Add a focused daemon-sink wiring test, or extract the sink fan-out into a small helper that can be unit-tested. Mock/stub dashDb.insertLog, LogPushWorker.push, and OtlpLogWorker.push; emit a log containing a private key; assert the DB gets the original message, both remote shippers get [REDACTED], and logs.exporter: 'none' leaves remote pushers inactive.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Remote redaction wiring is not verified at the daemon sink

What's wrong
The existing tests prove the redactor can scrub a standalone record and that OtlpLogWorker can send records, but they do not exercise the trust-boundary code that actually applies redaction before forwarding daemon logs. Because this path handles secrets leaving the node, a regression that accidentally pushes entry instead of safe, or fails to apply configured extra redact keys, would not be caught.

Example
A daemon-level test should log operationalWalletPrivateKey=0x... with remote telemetry enabled and assert that dashDb.insertLog(...) receives the original message while both logPusher.push(...) and otlpExporter.push(...) receive [REDACTED], including an extra key from telemetry.logs.redact.

Suggested direction
Add a focused test around the daemon log sink fan-out, not just the standalone redactor and standalone OTLP worker.

For Agents
Look at the Logger.setSink block in packages/cli/src/daemon/lifecycle.ts. Extract the fan-out/redaction logic or add a focused daemon unit with fake DB, syslog worker, and OTLP worker. Preserve full-fidelity local logs and prove every remote shipper gets only the redacted copy.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Remote redaction is not verified at the daemon fan-out boundary

What's wrong
The PR adds at-source redaction before remote log shipping, but the tests only verify the pure redactor and the OTLP worker separately. They do not prove that daemon log forwarding applies the redactor before either remote shipper receives a record.

Example
A regression that changed line 2012 to const safe = entry, or pushed entry directly to otlpExporter, would still leave packages/core/test/log-redaction.test.ts green because those tests only call the redactor directly; they never exercise the daemon sink that forwards records off-node.

Suggested direction
Cover the actual Logger.setSink fan-out behavior, not only createLogRedactor in isolation, because this is the trust boundary where secrets either stay local or leave the node.

For Agents
Add a focused test around the daemon log fan-out path, or extract a small helper from lifecycle.ts, using fake dashboard/remote shippers. Prove local storage receives the original message while remote shippers receive a redacted message for privateKey=... and configured extra keys.

logPusher?.push(safe);
otlpExporter?.push(safe);
}
});

// Extract the plain value from an RDF typed literal like "6"^^<xsd:integer>
Expand Down Expand Up @@ -2149,6 +2160,7 @@ export async function runDaemonInner(
: "mainnet";
const syslogEndpoint = TELEMETRY_ENDPOINTS[networkKey]?.syslog;
let logPusher: LogPushWorker | null = null;
let otlpExporter: OtlpLogWorker | null = null;

function startLogPusher(): { ok: boolean; error?: string } {
if (logPusher) return { ok: true };
Expand Down Expand Up @@ -2190,8 +2202,59 @@ export async function runDaemonInner(
log("Telemetry: log streaming disabled");
}

function startOtlpExporter(): { ok: boolean; error?: string } {
if (otlpExporter) return { ok: true };
const endpoint =
config.telemetry?.logs?.endpoint || TELEMETRY_ENDPOINTS[networkKey]?.otlpLogs;
if (!endpoint) {
return {
ok: false,
error: `OTLP log export is not configured for ${networkKey} (set config.telemetry.logs.endpoint)`,
};
}
const minLevel = config.telemetry?.logs?.level ?? "info";
otlpExporter = new OtlpLogWorker({
endpoint,
token: config.telemetry?.logs?.token,
network: networkKey,
peerId: agent.peerId,
nodeName: config.name,
version: nodeVersion,
commit: nodeCommit,
role: config.nodeRole ?? "edge",
minLevel,
bufferMaxEntries: config.telemetry?.logs?.bufferMaxEntries,
onError: (m) => log(`Telemetry(OTLP): ${m}`),
});
otlpExporter.start();
log(`Telemetry: OTLP log export enabled → ${endpoint} (level ≥ ${minLevel})`);
return { ok: true };
}

function stopOtlpExporter(): void {
if (!otlpExporter) return;
otlpExporter.stop();
otlpExporter = null;
log("Telemetry: OTLP log export disabled");
}

// Dispatch to the configured log exporter. 'syslog' is the default when
// unset (preserves prior behaviour); 'otlp' is the recommended path; 'none'
// keeps logs local-only even while telemetry is enabled.
function startTelemetry(): { ok: boolean; error?: string } {
const mode = config.telemetry?.logs?.exporter ?? "syslog";
Comment thread
Bojan131 marked this conversation as resolved.
Outdated
if (mode === "none") return { ok: true };
if (mode === "otlp") return startOtlpExporter();
return startLogPusher();
}

function stopTelemetry(): void {
stopLogPusher();
stopOtlpExporter();
}

if (config.telemetry?.enabled) {
const r = startLogPusher();
const r = startTelemetry();
if (!r.ok) {
log(`Telemetry: ${r.error}`);
config.telemetry.enabled = false;
Expand Down Expand Up @@ -2619,10 +2682,10 @@ export async function runDaemonInner(
enabled: boolean,
): Promise<{ ok: boolean; error?: string }> => {
if (enabled) {
const r = startLogPusher();
const r = startTelemetry();
if (!r.ok) return r;
} else {
stopLogPusher();
stopTelemetry();
Comment thread
Bojan131 marked this conversation as resolved.
Outdated
}
config.telemetry = { ...config.telemetry, enabled };
await saveConfig(config);
Expand Down
3 changes: 2 additions & 1 deletion packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ export * from './publisher-extension.js';
export * from './imported-artifact-bytes.js';
export * from './imported-artifact-metadata.js';
export * from './event-bus.js';
export { Logger, createOperationContext, type OperationContext, type OperationName, type LogSink } from './logger.js';
export { Logger, createOperationContext, type OperationContext, type OperationName, type LogSink, type LogRecord } from './logger.js';
export { createLogRedactor, redactLogEntry, redactMessage, DEFAULT_SENSITIVE_KEYS, REDACTED } from './log-redaction.js';
export * from './crypto/index.js';
export * from './proto/index.js';
export {
Expand Down
126 changes: 126 additions & 0 deletions packages/core/src/log-redaction.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/**
* At-source redaction of secrets from log records before they leave the node.
*
* Why this exists: V10 nodes are run by independent operators, and once a
* secret (a wallet private key, a mnemonic, an API token) is shipped to a
* remote collector it is irreversibly leaked. Redaction therefore runs on the
* node, on the copy of every log record that is about to be FORWARDED. The
* local dashboard DB keeps full-fidelity records for the operator's own
* debugging — redaction only protects data that crosses the trust boundary.
*
* Design choices (deliberately conservative to avoid mangling useful logs):
* - Structured "key: value" / key=value / "key":"value" shapes are redacted
* by KEY NAME (high precision). This is how DKG actually logs secrets
* (e.g. operationalWalletPrivateKey, mnemonic).
* - JWTs are redacted by shape (eyJ….….…) — effectively zero false positives.
* - We deliberately do NOT blanket-redact 0x-prefixed 64-hex strings: in DKG
* those are overwhelmingly Merkle roots, KC roots and tx hashes (public,
* non-secret) and nuking them would destroy debuggability. A bare private
* key with no key-name context is a residual gap best closed with a
* collector-side OTTL/regex backstop (see the PoC stack).
*/

import type { LogRecord } from './logger.js';

/**
* Default sensitive key names whose values are scrubbed from log messages
* before forwarding. Matched case-insensitively.
*/
export const DEFAULT_SENSITIVE_KEYS: readonly string[] = [
'privateKey',
'private_key',
'privKey',
'operationalWalletPrivateKey',
'managementWalletPrivateKey',
'mnemonic',
'seedPhrase',
'seed_phrase',
'seed',
'secret',
'secretKey',
'clientSecret',
'password',
'passphrase',
'passwd',
'pwd',
'apiKey',
'api_key',
'apiToken',
'accessToken',
'access_token',
'refreshToken',
'refresh_token',
'token',
'authorization',
'bearer',
'sessionKey',
'encryptionKey',
];

export const REDACTED = '[REDACTED]';

function escapeRegExp(s: string): string {
return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
* Redact the free-form `message` of a log record. The two patterns are:
* 1. JWT-shaped tokens (header.payload.signature, base64url) — by shape.
* 2. `<sensitiveKey><sep><value>` — the value is replaced, the key kept.
* Quoted values (single/double/backtick) are redacted whole (so a quoted
* mnemonic with spaces is fully removed); bare values up to the next
* delimiter otherwise.
*/
export function redactMessage(message: string, keyRegex: RegExp, jwtRegex: RegExp): string {
if (!message) return message;
// Reset lastIndex defensively (these are global regexes reused across calls).
jwtRegex.lastIndex = 0;
keyRegex.lastIndex = 0;
let out = message.replace(jwtRegex, REDACTED);
out = out.replace(keyRegex, (_full, keyAndSep: string) => `${keyAndSep}${REDACTED}`);
return out;
}

function buildKeyRegex(keys: readonly string[]): RegExp {
const alt = keys.map(escapeRegExp).join('|');
// group 1 = key (optionally quoted) + separator (kept verbatim)
// group 2 = value (redacted): a quoted run, or a bare token up to a delimiter
return new RegExp(
'(' +
'["\'`]?\\b(?:' + alt + ')\\b["\'`]?' + // key, optionally quoted
'\\s*[:=]\\s*' + // : or =
')' +
'(' +
'"[^"]*"' + '|' +
"'[^']*'" + '|' +
'`[^`]*`' + '|' +
'[^\\s,;}\\]\\)]+' + // bare token
Comment thread
Bojan131 marked this conversation as resolved.
')',
'gi',
);
}

// JWT: three base64url segments separated by dots, starting with the
// canonical `eyJ` ('{"' base64url-encoded). Conservative min lengths.
const JWT_SOURCE = '\\beyJ[A-Za-z0-9_-]{6,}\\.[A-Za-z0-9_-]{6,}\\.[A-Za-z0-9_-]{6,}\\b';

/**
* Compile a redactor once, then reuse it on the hot path (one per shipper).
* `extraKeys` are operator-configured additional sensitive key names.
*/
export function createLogRedactor(extraKeys: readonly string[] = []): (record: LogRecord) => LogRecord {
const keys = extraKeys.length ? [...DEFAULT_SENSITIVE_KEYS, ...extraKeys] : DEFAULT_SENSITIVE_KEYS;
const keyRegex = buildKeyRegex(keys);
const jwtRegex = new RegExp(JWT_SOURCE, 'g');
return (record: LogRecord): LogRecord => {
if (!record || !record.message) return record;
const redacted = redactMessage(record.message, keyRegex, jwtRegex);
if (redacted === record.message) return record; // no change → no alloc
return { ...record, message: redacted };
};
}

/** One-shot convenience (recompiles each call — do not use on the hot path). */
export function redactLogEntry(record: LogRecord, extraKeys: readonly string[] = []): LogRecord {
return createLogRedactor(extraKeys)(record);
}
12 changes: 10 additions & 2 deletions packages/core/src/logger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,22 @@ export interface OperationContext {
sourceOperationId?: string;
}

export type LogSink = (entry: {
/**
* The canonical structured log record emitted on every Logger call. This is
* the single shape that flows to the local dashboard DB and to any remote
* shipper (syslog, OTLP). Keep it stable — redaction and the OTLP exporter
* both consume it.
*/
export interface LogRecord {
level: string;
operationName: string;
operationId: string;
sourceOperationId?: string;
module: string;
message: string;
}) => void;
}

export type LogSink = (entry: LogRecord) => void;

/**
* Structured logger that prefixes every message with a timestamp,
Expand Down
Loading
Loading