Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6cc97d0
feat(telemetry): opt-in OTLP log collection for core nodes (PoC)
Bojan131 Jun 24, 2026
ab56bf7
feat(telemetry): per-node Grafana selection + dashboard
Bojan131 Jun 24, 2026
4cdeef6
feat(telemetry): production ingest path for existing Loki (Alloy bridge)
Bojan131 Jun 24, 2026
242c54f
docs(telemetry): production deploy bundle + manager handoff
Bojan131 Jun 24, 2026
a3c5257
docs(telemetry): exact Cloudflare WAF expression + real-Grafana verif…
Bojan131 Jun 24, 2026
9a76a4a
feat(telemetry): fleet dashboard + operator guide + example alerts
Bojan131 Jun 24, 2026
d9b557b
chore: drop accidentally-committed localhost hardhat deploy artifacts
Bojan131 Jun 24, 2026
10da035
chore: restore tracked localhost_contracts.json (only the untracked l…
Bojan131 Jun 24, 2026
a30318d
feat(telemetry): OTel SDK foundation — traces+metrics providers, log …
Bojan131 Jun 25, 2026
80e536d
feat(telemetry): instrument spans + metrics across agent/publisher/ch…
Bojan131 Jun 25, 2026
a841d00
chore(telemetry): collector traces+metrics pipelines + mock OTLP coll…
Bojan131 Jun 25, 2026
2aa31f7
feat(telemetry): add protocol_router.send span + P2P send-duration me…
Bojan131 Jun 25, 2026
414936b
fix(telemetry): address bug-bot review (1 bug + 3 issues + 2 nits)
Bojan131 Jun 25, 2026
9f3625d
Merge remote-tracking branch 'origin/main' into feat/core-node-log-co…
Bojan131 Jun 25, 2026
f78d06f
fix(telemetry): two CI regressions from instrumentation
Bojan131 Jun 25, 2026
908e298
Merge remote-tracking branch 'origin/main' into feat/core-node-log-co…
Bojan131 Jun 26, 2026
0bfdadd
chore: drop accidentally-committed localhost Hardhat deploy artifacts…
Bojan131 Jun 26, 2026
6a8c971
feat(telemetry): address #1317 review — unify log resource attrs + fu…
Bojan131 Jun 26, 2026
66ef5b7
Merge remote-tracking branch 'origin/main' into feat/core-node-log-co…
Bojan131 Jun 26, 2026
2771d2e
fix(telemetry): tie OTel traces/metrics to the master gate + prove in…
Bojan131 Jun 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 66 additions & 4 deletions packages/cli/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -538,8 +538,65 @@ export interface DkgConfig {
* on first start and stored in `<DKG_HOME>/auth.token`.
*/
auth?: { enabled?: boolean; tokens?: string[] };
/** Opt-in telemetry streaming to central network dashboard. */
telemetry?: { enabled?: boolean };
/**
* Opt-in telemetry streaming to a central network dashboard.
* `enabled` is the master gate: when false, NOTHING is forwarded off the
* node (local logging — SQLite + daemon.log — is always on regardless).
*/
telemetry?: {
enabled?: boolean;
/**
* Remote log forwarding (opt-in). Active only when `enabled` is true.
*/
logs?: {
/**
* Outbound transport for logs. 'none' = local only; 'otlp' = OTLP/HTTP
* to an OpenTelemetry collector; 'syslog' = legacy RFC 5424 → Graylog.
* Defaults to 'syslog' when unset (preserves prior behaviour).
*/
exporter?: 'none' | 'otlp' | 'syslog';
/**
* OTLP/HTTP logs endpoint, e.g. http://localhost:4318/v1/logs. Falls
* back to the per-network default (TELEMETRY_ENDPOINTS[network].otlpLogs).
*/
endpoint?: string;
/** Bearer credential for the operator's collector. Treated as a secret. */
token?: string;
/** Minimum level forwarded remotely. Local sink keeps everything. Default 'info'. */
level?: 'debug' | 'info' | 'warn' | 'error';
/** Extra sensitive key names to redact from messages before they leave the node. */
redact?: string[];
/** Bounded in-memory buffer; drop-oldest on overflow. Default 500. */
bufferMaxEntries?: number;
};
/**
* OTel trace export (opt-in, independent of logs). Registers the tracer
* ONLY when an endpoint resolves (config or OTEL_EXPORTER_OTLP_* env);
* never falls back to a guessed prod URL.
*/
traces?: {
enabled?: boolean;
/** OTLP traces endpoint, e.g. http://localhost:4318/v1/traces. */
endpoint?: string;
/** Bearer credential. Treated as a secret. */
token?: string;
/** Parent-based ratio sampler 0..1. Default 1.0. */
sampleRatio?: number;
};
/**
* OTel metric export (opt-in, independent of logs). Registers the meter
* ONLY when an endpoint resolves (config or OTEL_EXPORTER_OTLP_* env).
*/
metrics?: {
enabled?: boolean;
/** OTLP metrics endpoint, e.g. http://localhost:4318/v1/metrics. */
endpoint?: string;
/** Bearer credential. Treated as a secret. */
token?: string;
/** PeriodicExportingMetricReader interval. Default 30000ms. */
exportIntervalMs?: number;
};
};
/** Shared memory (workspace) data TTL in milliseconds. Default: 30 days (2592000000). Set to 0 to disable cleanup. */
sharedMemoryTtlMs?: number;
/** @deprecated Legacy alias for sharedMemoryTtlMs */
Expand Down Expand Up @@ -736,14 +793,19 @@ export interface DkgConfig {
* Nodes resolve the correct endpoints from the network they're on.
* Operators only see a single toggle — no endpoint configuration.
*/
export const TELEMETRY_ENDPOINTS: Record<string, { syslog: { host: string; port: number }; otlp: string }> = {
export const TELEMETRY_ENDPOINTS: Record<
string,
{ syslog: { host: string; port: number }; otlp: string; otlpLogs?: string }
> = {
testnet: {
syslog: { host: 'loggly.origin-trail.network', port: 12201 },
otlp: 'https://telemetry-testnet.origintrail.io/v1/metrics',
otlpLogs: 'https://telemetry-testnet.origintrail.io/v1/logs', // OriginTrail-hosted opt-in collector (TBD)
},
mainnet: {
syslog: { host: 'loggly.origin-trail.network', port: 0 }, // TODO: assign mainnet syslog port
syslog: { host: 'loggly.origin-trail.network', port: 0 }, // legacy syslog — OTLP is the mainnet path
otlp: 'https://telemetry.origintrail.io/v1/metrics',
otlpLogs: 'https://telemetry.origintrail.io/v1/logs', // OriginTrail-hosted opt-in collector (TBD)
},
};

Expand Down
134 changes: 129 additions & 5 deletions packages/cli/src/daemon/lifecycle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ import {
} from '@origintrail-official/dkg-chain';
import { DKGAgent, loadOpWallets, KaNumberAllocator } from '@origintrail-official/dkg-agent';
import { isExternalBackend } from '@origintrail-official/dkg-storage';
import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, sparqlIri, contextGraphSharedMemoryUri, contextGraphAssertionUri, contextGraphMetaUri, DEFAULT_PROTOCOL_OUTBOX_BACKOFFS_MS, DEFAULT_PROTOCOL_OUTBOX_MAX_AGE_MS, pickNetworkTunables } from '@origintrail-official/dkg-core';
import { computeNetworkId, createOperationContext, createLogRedactor, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, sparqlIri, contextGraphSharedMemoryUri, contextGraphAssertionUri, contextGraphMetaUri, DEFAULT_PROTOCOL_OUTBOX_BACKOFFS_MS, DEFAULT_PROTOCOL_OUTBOX_MAX_AGE_MS, pickNetworkTunables } from '@origintrail-official/dkg-core';
import { findReservedSubjectPrefix, isSkolemizedUri } from '@origintrail-official/dkg-publisher';
import {
DashboardDB,
Expand All @@ -79,6 +79,9 @@ import {
handleNodeUIRequest,
ChatMemoryManager,
LogPushWorker,
OtlpLogWorker,
initTelemetry,
shutdownTelemetry,
LlmClient,
SqliteMessageIdempotencyStore,
SqliteProtocolOutboxStore,
Expand Down Expand Up @@ -1966,6 +1969,11 @@ export async function runDaemonInner(
chatDb = dashDb;
log("Dashboard DB initialized at " + join(dkgDir(), "node-ui.db"));

// Redactor for the copy of each record that LEAVES the node. The local
// dashboard DB keeps full-fidelity records (it's the operator's own machine);
// redaction only protects data crossing the trust boundary to a collector.
const redactForRemote = createLogRedactor(config.telemetry?.logs?.redact);

Logger.setSink((entry) => {
try {
dashDb.insertLog({
Expand All @@ -1979,7 +1987,12 @@ export async function runDaemonInner(
} catch {
/* DB write must never break the node */
}
logPusher?.push(entry);
// Fan out a single redacted copy to every active remote shipper.
if (logPusher || otlpExporter) {
const safe = redactForRemote(entry);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Remote log redaction wiring is not verified

What's wrong
The PR tests the redaction helper and the OTLP worker separately, but not the daemon wiring that decides which copy leaves the node. This is the privacy-critical behavior: local logs should remain full-fidelity, while syslog/OTLP receive only the redacted record.

Example
A future edit that accidentally changes the fan-out to otlpExporter?.push(entry) would leak operationalWalletPrivateKey=... remotely. The redactor tests and OtlpLogWorker tests would still pass because neither test proves Logger.setSink sends the redacted copy to the remote shippers while keeping the local DB record unchanged.

Suggested direction
Add a lifecycle-level or extracted-helper test that exercises the actual Logger.setSink fan-out path with sensitive log content and configured telemetry modes.

For Agents
Add a focused daemon-sink wiring test, or extract the sink fan-out into a small helper that can be unit-tested. Mock/stub dashDb.insertLog, LogPushWorker.push, and OtlpLogWorker.push; emit a log containing a private key; assert the DB gets the original message, both remote shippers get [REDACTED], and logs.exporter: 'none' leaves remote pushers inactive.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Remote redaction wiring is not verified at the daemon sink

What's wrong
The existing tests prove the redactor can scrub a standalone record and that OtlpLogWorker can send records, but they do not exercise the trust-boundary code that actually applies redaction before forwarding daemon logs. Because this path handles secrets leaving the node, a regression that accidentally pushes entry instead of safe, or fails to apply configured extra redact keys, would not be caught.

Example
A daemon-level test should log operationalWalletPrivateKey=0x... with remote telemetry enabled and assert that dashDb.insertLog(...) receives the original message while both logPusher.push(...) and otlpExporter.push(...) receive [REDACTED], including an extra key from telemetry.logs.redact.

Suggested direction
Add a focused test around the daemon log sink fan-out, not just the standalone redactor and standalone OTLP worker.

For Agents
Look at the Logger.setSink block in packages/cli/src/daemon/lifecycle.ts. Extract the fan-out/redaction logic or add a focused daemon unit with fake DB, syslog worker, and OTLP worker. Preserve full-fidelity local logs and prove every remote shipper gets only the redacted copy.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Issue: Remote redaction is not verified at the daemon fan-out boundary

What's wrong
The PR adds at-source redaction before remote log shipping, but the tests only verify the pure redactor and the OTLP worker separately. They do not prove that daemon log forwarding applies the redactor before either remote shipper receives a record.

Example
A regression that changed line 2012 to const safe = entry, or pushed entry directly to otlpExporter, would still leave packages/core/test/log-redaction.test.ts green because those tests only call the redactor directly; they never exercise the daemon sink that forwards records off-node.

Suggested direction
Cover the actual Logger.setSink fan-out behavior, not only createLogRedactor in isolation, because this is the trust boundary where secrets either stay local or leave the node.

For Agents
Add a focused test around the daemon log fan-out path, or extract a small helper from lifecycle.ts, using fake dashboard/remote shippers. Prove local storage receives the original message while remote shippers receive a redacted message for privateKey=... and configured extra keys.

logPusher?.push(safe);
otlpExporter?.push(safe);
}
});

// Extract the plain value from an RDF typed literal like "6"^^<xsd:integer>
Expand Down Expand Up @@ -2149,6 +2162,7 @@ export async function runDaemonInner(
: "mainnet";
const syslogEndpoint = TELEMETRY_ENDPOINTS[networkKey]?.syslog;
let logPusher: LogPushWorker | null = null;
let otlpExporter: OtlpLogWorker | null = null;

function startLogPusher(): { ok: boolean; error?: string } {
if (logPusher) return { ok: true };
Expand Down Expand Up @@ -2190,8 +2204,115 @@ export async function runDaemonInner(
log("Telemetry: log streaming disabled");
}

function startOtlpExporter(): { ok: boolean; error?: string } {
if (otlpExporter) return { ok: true };
const endpoint =
config.telemetry?.logs?.endpoint || TELEMETRY_ENDPOINTS[networkKey]?.otlpLogs;
if (!endpoint) {
return {
ok: false,
error: `OTLP log export is not configured for ${networkKey} (set config.telemetry.logs.endpoint)`,
};
}
const minLevel = config.telemetry?.logs?.level ?? "info";
otlpExporter = new OtlpLogWorker({
endpoint,
token: config.telemetry?.logs?.token,
network: networkKey,
peerId: agent.peerId,
nodeName: config.name,
version: nodeVersion,
commit: nodeCommit,
role: config.nodeRole ?? "edge",
minLevel,
bufferMaxEntries: config.telemetry?.logs?.bufferMaxEntries,
onError: (m) => log(`Telemetry(OTLP): ${m}`),
});
otlpExporter.start();
log(`Telemetry: OTLP log export enabled → ${endpoint} (level ≥ ${minLevel})`);
return { ok: true };
}

function stopOtlpExporter(): void {
if (!otlpExporter) return;
otlpExporter.stop();
otlpExporter = null;
log("Telemetry: OTLP log export disabled");
}

// Dispatch to the configured log exporter. 'syslog' is the default when
// unset (preserves prior behaviour); 'otlp' is the recommended path; 'none'
// keeps logs local-only even while telemetry is enabled.
function startTelemetry(): { ok: boolean; error?: string } {
const mode = config.telemetry?.logs?.exporter ?? "syslog";
Comment thread
Bojan131 marked this conversation as resolved.
Outdated
if (mode === "none") return { ok: true };
if (mode === "otlp") return startOtlpExporter();
return startLogPusher();
}

function stopTelemetry(): void {
stopLogPusher();
stopOtlpExporter();
}

// OTel traces + metrics SDK (independent of the log-exporter path above).
// Endpoints resolve env-first (standard OTEL_EXPORTER_OTLP_* names) then
// config; a signal registers ONLY when its endpoint resolves — never a
// guessed prod default. enabled=false ⇒ initTelemetry is a total no-op.
if (config.telemetry?.enabled) {
const otelBase = process.env.OTEL_EXPORTER_OTLP_ENDPOINT?.replace(/\/$/, "");
const tracesEndpoint =
process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT ||
(otelBase ? `${otelBase}/v1/traces` : undefined) ||
config.telemetry.traces?.endpoint;
const metricsEndpoint =
process.env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT ||
(otelBase ? `${otelBase}/v1/metrics` : undefined) ||
config.telemetry.metrics?.endpoint;
const tracesOn = !!tracesEndpoint && config.telemetry.traces?.enabled !== false;
const metricsOn = !!metricsEndpoint && config.telemetry.metrics?.enabled !== false;
try {
initTelemetry({
enabled: true,
resource: {
serviceName: "dkg-node",
serviceVersion: nodeVersion,
serviceInstanceId: config.name,
network: networkKey,
peerId: agent.peerId,
nodeName: config.name,
nodeRole: config.nodeRole ?? "edge",
commit: nodeCommit,
chainId: config.chain?.chainId,
},
traces: tracesOn
? {
endpoint: tracesEndpoint,
token: config.telemetry.traces?.token,
sampleRatio: config.telemetry.traces?.sampleRatio,
}
: undefined,
metrics: metricsOn
? {
endpoint: metricsEndpoint,
token: config.telemetry.metrics?.token,
exportIntervalMs: config.telemetry.metrics?.exportIntervalMs,
}
: undefined,
});
if (tracesOn || metricsOn) {
log(
`Telemetry: OTel SDK registered (traces=${tracesOn ? tracesEndpoint : "off"}, metrics=${metricsOn ? metricsEndpoint : "off"})`,
);
}
} catch (err) {
// Telemetry must never block startup.
log(`Telemetry: OTel init failed (non-fatal): ${String(err)}`);
}
}

if (config.telemetry?.enabled) {
const r = startLogPusher();
const r = startTelemetry();
if (!r.ok) {
log(`Telemetry: ${r.error}`);
config.telemetry.enabled = false;
Expand Down Expand Up @@ -2619,10 +2740,10 @@ export async function runDaemonInner(
enabled: boolean,
): Promise<{ ok: boolean; error?: string }> => {
if (enabled) {
const r = startLogPusher();
const r = startTelemetry();
if (!r.ok) return r;
} else {
stopLogPusher();
stopTelemetry();
Comment thread
Bojan131 marked this conversation as resolved.
Outdated
}
config.telemetry = { ...config.telemetry, enabled };
await saveConfig(config);
Expand Down Expand Up @@ -3107,6 +3228,9 @@ export async function runDaemonInner(
clearInterval(pruneTimer);
rateLimiter.destroy();
metricsCollector.stop();
stopTelemetry();
// Flush + shut down the OTel SDK (no-op if never registered).
await shutdownTelemetry().catch(() => {});
natStatusWatcherStop?.();
resetNatStatus();
await publisherRuntime
Expand Down
1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"@multiformats/multiaddr": "^13.0.3",
"@noble/ed25519": "^3.1.0",
"@noble/hashes": "^2.2.0",
"@opentelemetry/api": "^1.9.1",
"js-yaml": "^4.1.1",
"libp2p": "^3.3.1",
"protobufjs": "^8.3.0",
Expand Down
7 changes: 6 additions & 1 deletion packages/core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ export * from './publisher-extension.js';
export * from './imported-artifact-bytes.js';
export * from './imported-artifact-metadata.js';
export * from './event-bus.js';
export { Logger, createOperationContext, type OperationContext, type OperationName, type LogSink } from './logger.js';
export { Logger, createOperationContext, type OperationContext, type OperationName, type LogSink, type LogRecord } from './logger.js';
export { createLogRedactor, redactLogEntry, redactMessage, DEFAULT_SENSITIVE_KEYS, REDACTED } from './log-redaction.js';
export {
getTracer, withSpan, linkedSpan, currentTraceIds, activeSpanContext,
getMetrics, rebuildMetrics, type WithSpanOpts, type DkgMetrics,
} from './telemetry-api.js';
export * from './crypto/index.js';
export * from './proto/index.js';
export {
Expand Down
Loading
Loading