diff --git a/.gitignore b/.gitignore index d1c5936ef..0f38b113a 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ packages/evm-module/typechain/ packages/evm-module/deployments/hardhat_contracts.json packages/evm-module/deployments/localhost_contracts.json +packages/evm-module/deployments/localhost/ snapshots/_cache_phase1_neuroweb_epoch16.json .claude/ .orchestrator/ diff --git a/packages/agent/package.json b/packages/agent/package.json index 95437c380..991c05c8b 100644 --- a/packages/agent/package.json +++ b/packages/agent/package.json @@ -14,6 +14,7 @@ "clean": "rm -rf dist tsconfig.tsbuildinfo" }, "dependencies": { + "@opentelemetry/api": "^1.9.1", "@libp2p/peer-id": "^6.0.9", "@multiformats/multiaddr": "^13.0.3", "@noble/ciphers": "^2.2.0", diff --git a/packages/agent/src/dkg-agent-publish.ts b/packages/agent/src/dkg-agent-publish.ts index 801994616..f67461772 100644 --- a/packages/agent/src/dkg-agent-publish.ts +++ b/packages/agent/src/dkg-agent-publish.ts @@ -97,8 +97,11 @@ import { pickNetworkTunables, sharedMemoryReadBothFilter, partitionCatalogQuads, + withSpan, + getMetrics, assertQuadLiteralsMutf8Safe, } from '@origintrail-official/dkg-core'; +import { SpanStatusCode } from '@opentelemetry/api'; import { GraphManager, PrivateContentStore, createTripleStore, type TripleStore, type TripleStoreConfig, type Quad, type LargeLiteralStorageConfig } from '@origintrail-official/dkg-storage'; import { EVMChainAdapter, NoChainAdapter, enrichEvmError, buildKnowledgeAssetUal, type EVMAdapterConfig, type ChainAdapter, type CreateContextGraphParams, type CreateOnChainContextGraphParams, type CreateOnChainContextGraphResult, type TxResult, type V10PublishingConvictionAccountInfo } from '@origintrail-official/dkg-chain'; import { @@ -1290,6 +1293,15 @@ export class PublishMethods extends DKGAgentBase { privateQuads?: Quad[], opts?: PublishOpts, ): Promise { + return withSpan('agent.publish', async (span) => { + const chainId = typeof this.chain?.chainId === 
'string' && this.chain.chainId !== 'none' ? this.chain.chainId : undefined; + const publishStartedAt = Date.now(); + span.setAttributes({ + 'dkg.context_graph_id': contextGraphId, + 'dkg.triple_count': quads.length, + 'dkg.has_private': !!privateQuads && privateQuads.length > 0, + ...(chainId ? { 'dkg.chain_id': chainId } : {}), + }); const ctx = opts?.operationCtx ?? createOperationContext('publish'); const onPhase = opts?.onPhase; this.log.info(ctx, `Starting publish to context graph "${contextGraphId}" with ${quads.length} triples`); @@ -1425,6 +1437,12 @@ export class PublishMethods extends DKGAgentBase { encryptInlineChunked, }); + span.setAttribute('dkg.publish_status', result.status); + if (result.status === 'failed') { + span.setStatus({ code: SpanStatusCode.ERROR }); + span.addEvent('publish_failed', { error: String(result.contextGraphError ?? '') }); + } + onPhase?.('broadcast', 'start'); this.log.info(ctx, `Local publish complete, broadcasting to peers`); await this.broadcastPublish(contextGraphId, result, ctx); @@ -1436,7 +1454,12 @@ export class PublishMethods extends DKGAgentBase { // it can never affect the publish just completed. await this.emitPublicProjectionAfterPublish(contextGraphId, result, ctx); + const publishMetricAttrs = { outcome: result.status, source: 'direct', ...(chainId ? { chain_id: chainId } : {}) }; + getMetrics().publishTotal.add(1, publishMetricAttrs); + getMetrics().publishDuration.record(Date.now() - publishStartedAt, publishMetricAttrs); + return result; + }); } /** @@ -4117,6 +4140,14 @@ export class PublishMethods extends DKGAgentBase { schemeVersion?: number; }, ): Promise { + return withSpan('agent.publish_from_swm', async (span) => { + const chainId = typeof this.chain?.chainId === 'string' && this.chain.chainId !== 'none' ? this.chain.chainId : undefined; + const publishStartedAt = Date.now(); + span.setAttributes({ + 'dkg.context_graph_id': contextGraphId, + 'dkg.selection': selection === 'all' ? 
'all' : 'roots', + ...(chainId ? { 'dkg.chain_id': chainId } : {}), + }); const ctx = options?.operationCtx ?? createOperationContext('publishFromSWM'); const effectiveSubCG = options?.subContextGraphId ?? options?.contextGraphId; // `ctxGraphIdStr` doubles as `publishContextGraphId` for REMAP-flow @@ -4248,6 +4279,12 @@ export class PublishMethods extends DKGAgentBase { encryptInlineChunked, }); + span.setAttribute('dkg.publish_status', result.status); + if (result.status === 'failed') { + span.setStatus({ code: SpanStatusCode.ERROR }); + span.addEvent('publish_failed', { error: String(result.contextGraphError ?? '') }); + } + if (result.status === 'confirmed' && result.onChainResult) { const rootEntities = result.kaManifest.map(ka => ka.rootEntity); @@ -4338,7 +4375,12 @@ export class PublishMethods extends DKGAgentBase { } } + const publishMetricAttrs = { outcome: result.status, source: 'swm', ...(chainId ? { chain_id: chainId } : {}) }; + getMetrics().publishTotal.add(1, publishMetricAttrs); + getMetrics().publishDuration.record(Date.now() - publishStartedAt, publishMetricAttrs); + return result; + }); } /** @deprecated Use publishFromSharedMemory. Will be removed in V10.1. 
*/ diff --git a/packages/agent/src/p2p/sync-transport.ts b/packages/agent/src/p2p/sync-transport.ts index 745c376f6..fb4ae7ddb 100644 --- a/packages/agent/src/p2p/sync-transport.ts +++ b/packages/agent/src/p2p/sync-transport.ts @@ -1,5 +1,5 @@ import { randomUUID } from 'node:crypto'; -import { withRetry } from '@origintrail-official/dkg-core'; +import { withRetry, withSpan, getMetrics } from '@origintrail-official/dkg-core'; import { markSyncTransportFailure } from '../sync/error-tags.js'; /** @@ -82,7 +82,11 @@ interface SyncSendParams { } export async function sendSyncRequest(params: SyncSendParams): Promise { - return withRetry( + return withSpan( + 'sync.request', + async () => { + try { + const out = await withRetry( async () => { throwIfAborted(params.signal); const requestBytes = await params.requestFactory(); @@ -114,6 +118,15 @@ export async function sendSyncRequest(params: SyncSendParams): Promise params.signal?.aborted !== true, onRetry: params.onRetry, }, + ); + getMetrics().syncRequestTotal.add(1, { outcome: 'ok', protocol_id: params.protocolId }); + return out; + } catch (err) { + getMetrics().syncRequestTotal.add(1, { outcome: 'error', protocol_id: params.protocolId }); + throw err; + } + }, + { attributes: { 'dkg.protocol_id': params.protocolId } }, ); } diff --git a/packages/agent/src/sync/responder/sync-handler.ts b/packages/agent/src/sync/responder/sync-handler.ts index 5c00015fa..c89f8074f 100644 --- a/packages/agent/src/sync/responder/sync-handler.ts +++ b/packages/agent/src/sync/responder/sync-handler.ts @@ -1,6 +1,8 @@ import { createOperationContext, QuietRetryableHandlerError, + withSpan, + getMetrics, type OperationContext, } from '@origintrail-official/dkg-core'; import type { TripleStore } from '@origintrail-official/dkg-storage'; @@ -297,7 +299,8 @@ export function registerSyncHandler(params: RegisterSyncHandlerParams): void { }; }; - register(protocolSync, async (data, peerId, options) => { + register(protocolSync, async (data, 
peerId, options) => withSpan('sync.response', async (span) => { + span.setAttribute('dkg.protocol_id', protocolSync); const signal = options?.signal; const handlerStartedAt = Date.now(); const request = parseSyncRequest(data); @@ -495,12 +498,19 @@ export function registerSyncHandler(params: RegisterSyncHandlerParams): void { logDebug(createOperationContext('sync'), `Sync responder total for "${contextGraphId}" (phase=${phase}, workspace=${isWorkspace}): ${totalDurationMs}ms`); } return new TextEncoder().encode(nquads.join('\n')); + }).then((res) => { + getMetrics().syncResponseTotal.add(1, { outcome: 'ok' }); + return res; }).catch((err) => { if (err instanceof SyncResponderBusyError) { + getMetrics().syncResponseTotal.add(1, { outcome: 'busy' }); + span.setAttribute('dkg.sync_response_outcome', 'busy'); logDebug(createOperationContext('sync'), `Sync responder busy for "${contextGraphId}" from peer ${peerId} (phase=${phase}): ${err.message}`); throw new QuietRetryableHandlerError(err.message); } if (err instanceof SyncRowSnapshotLimitError) { + getMetrics().syncResponseTotal.add(1, { outcome: 'limit' }); + span.setAttribute('dkg.sync_response_outcome', 'limit'); logWarn( createOperationContext('sync'), `Sync responder snapshot limit for "${contextGraphId}" from peer ${peerId} (phase=${phase}, workspace=${isWorkspace}): active=${err.activeEntries}/${err.maxEntries} cached=${err.cachedEntries} inflight=${err.inflightEntries} key=${err.key}`, @@ -509,7 +519,8 @@ export function registerSyncHandler(params: RegisterSyncHandlerParams): void { `sync responder snapshot limit exceeded (active=${err.activeEntries}/${err.maxEntries})`, ); } + getMetrics().syncResponseTotal.add(1, { outcome: 'error' }); throw err; }); - }); + })); } diff --git a/packages/chain/package.json b/packages/chain/package.json index 51078dcbf..d11e1a936 100644 --- a/packages/chain/package.json +++ b/packages/chain/package.json @@ -19,6 +19,8 @@ "@origintrail-official/dkg-evm-module": "workspace:*" }, 
"devDependencies": { + "@opentelemetry/api": "^1.9.1", + "@opentelemetry/sdk-metrics": "^2.8.0", "@vitest/coverage-v8": "^4.0.18", "vitest": "^4.0.18" }, diff --git a/packages/chain/src/evm-adapter-base.ts b/packages/chain/src/evm-adapter-base.ts index e2ba1720d..2bb2c7735 100644 --- a/packages/chain/src/evm-adapter-base.ts +++ b/packages/chain/src/evm-adapter-base.ts @@ -18,7 +18,7 @@ import { DEFAULT_APPROVAL_POLICY } from './chain-adapter.js'; import type { ApprovalPolicy, V10PublishParams, OnChainPublishResult, ConvictionReader } from './chain-adapter.js'; import { HubResolutionCache } from './hub-resolution-cache.js'; import { KeyedSerializer } from './keyed-mutex.js'; -import { floorPublishTokenAmount } from '@origintrail-official/dkg-core'; +import { floorPublishTokenAmount, withSpan, getMetrics } from '@origintrail-official/dkg-core'; import { loadAbi } from './evm-adapter-abi.js'; import { errorCode, errorMessage, errorStatus, isTooLowAllowanceError, enrichEvmError, HUB_STALE_ERROR_MARKERS, isInsufficientFundsError, InsufficientPublisherFundsError, formatNoFundedPublisherWalletMessage, type PublisherWalletBalance } from './evm-adapter-errors.js'; import { resolveRpcUrls, boundedRetryFetchRequest, withTimeout, isKnownTransactionError, isRetryableRpcError, assertSuccessfulReceipt, sleep } from './evm-adapter-rpc.js'; @@ -759,72 +759,169 @@ export class EVMChainAdapterBase { return this.signerPool.find((signer) => signer.address.toLowerCase() === normalized); } + /** + * Classify an RPC error for low-cardinality metric labels: `timeout` for the + * synthetic `withTimeout` TIMEOUT code, else `error`. Used at the chain RPC + * metric record sites so the `outcome` label stays bounded. + */ + private _rpcOutcomeForError(err: unknown): 'error' | 'timeout' { + return errorCode(err) === 'TIMEOUT' ? 
'timeout' : 'error'; + } + protected async broadcastSignedTransactionWithFailover( signedTx: string, txHash: string, label: string, ): Promise { - let lastRetryable: unknown; - for (let i = 0; i < this.providers.length; i += 1) { - const provider = this.providers[i]; - try { - await withTimeout( - provider.broadcastTransaction(signedTx), - RPC_BROADCAST_ATTEMPT_TIMEOUT_MS, - `${label} broadcast via RPC #${i + 1}`, - ); - return; - } catch (err) { - if (isKnownTransactionError(err)) return; - if (!isRetryableRpcError(err)) throw err; - lastRetryable = err; - if (i < this.providers.length - 1) { - noteRpcFailover(`${label} broadcast`, this.rpcUrls[i], err, this.rpcUrls[i + 1]); + return withSpan( + 'chain.tx_submit', + async (span) => { + const metrics = getMetrics(); + const startedAt = Date.now(); + let lastRetryable: unknown; + for (let i = 0; i < this.providers.length; i += 1) { + const provider = this.providers[i]; + span.addEvent('broadcast.attempt', { attempt: i + 1 }); + try { + await withTimeout( + provider.broadcastTransaction(signedTx), + RPC_BROADCAST_ATTEMPT_TIMEOUT_MS, + `${label} broadcast via RPC #${i + 1}`, + ); + span.setAttribute('dkg.tx_hash', txHash); + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_sendRawTransaction', outcome: 'ok', retryable: false, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_sendRawTransaction', chain_id: this.chainId, + }); + return; + } catch (err) { + if (isKnownTransactionError(err)) { + // Already-known / already-mined tx is success for our purposes. 
+ span.setAttribute('dkg.tx_hash', txHash); + span.addEvent('broadcast.already_known', { attempt: i + 1 }); + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_sendRawTransaction', outcome: 'ok', retryable: false, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_sendRawTransaction', chain_id: this.chainId, + }); + return; + } + if (!isRetryableRpcError(err)) { + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_sendRawTransaction', outcome: this._rpcOutcomeForError(err), + retryable: false, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_sendRawTransaction', chain_id: this.chainId, + }); + throw err; + } + lastRetryable = err; + if (i < this.providers.length - 1) { + noteRpcFailover(`${label} broadcast`, this.rpcUrls[i], err, this.rpcUrls[i + 1]); + } + } } - } - } - if (lastRetryable) noteRpcExhaustion(`${label} broadcast`, this.rpcUrls); - // Typed transport error (mirroring the preparation loop + CLI path) so a - // broadcast-time all-endpoints-exhausted failure maps to a retryable 503 at - // the HTTP boundary, not a generic 500 — an exhaustion after a provider - // populated/signed would otherwise surface code-less. - throw new ChainRpcTransportError( - 'RPC_ENDPOINTS_EXHAUSTED', - `${label} broadcast failed on all configured RPC endpoints for tx ${txHash}: ${errorMessage(lastRetryable)}`, - { cause: lastRetryable, rpcUrls: this.rpcUrls }, + // All configured endpoints exhausted. 
+ metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_sendRawTransaction', outcome: this._rpcOutcomeForError(lastRetryable), + retryable: true, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_sendRawTransaction', chain_id: this.chainId, + }); + metrics.chainRpcFailoverTotal.add(1, { + rpc_method: 'eth_sendRawTransaction', chain_id: this.chainId, reason: 'exhausted', + }); + if (lastRetryable) noteRpcExhaustion(`${label} broadcast`, this.rpcUrls); + // Typed transport error so a broadcast-time all-endpoints-exhausted + // failure maps to a retryable 503 at the HTTP boundary, not a 500. + throw new ChainRpcTransportError( + 'RPC_ENDPOINTS_EXHAUSTED', + `${label} broadcast failed on all configured RPC endpoints for tx ${txHash}: ${errorMessage(lastRetryable)}`, + { cause: lastRetryable, rpcUrls: this.rpcUrls }, + ); + }, + { attributes: { 'rpc.method': 'eth_sendRawTransaction', 'dkg.chain_id': this.chainId } }, ); } protected async getTransactionReceiptWithFailover(txHash: string): Promise { - let lastRetryable: unknown; - let sawNonErrorResponse = false; - for (let i = 0; i < this.providers.length; i += 1) { - const provider = this.providers[i]; - try { - const receipt = await withTimeout( - provider.getTransactionReceipt(txHash), - RPC_RECEIPT_ATTEMPT_TIMEOUT_MS, - `receipt lookup via RPC #${i + 1}`, - ); - sawNonErrorResponse = true; - if (receipt) return receipt; - } catch (err) { - if (!isRetryableRpcError(err)) throw err; - lastRetryable = err; - if (i < this.providers.length - 1) { - noteRpcFailover('receipt lookup', this.rpcUrls[i], err, this.rpcUrls[i + 1]); + return withSpan( + 'chain.tx_wait', + async (span) => { + const metrics = getMetrics(); + const startedAt = Date.now(); + let lastRetryable: unknown; + let sawNonErrorResponse = false; + for (let i = 0; i < this.providers.length; i += 1) { + const provider = this.providers[i]; + span.addEvent('receipt.attempt', { attempt: i + 1 }); + try { + 
const receipt = await withTimeout( + provider.getTransactionReceipt(txHash), + RPC_RECEIPT_ATTEMPT_TIMEOUT_MS, + `receipt lookup via RPC #${i + 1}`, + ); + sawNonErrorResponse = true; + if (receipt) { + span.setAttribute('dkg.tx_hash', txHash); + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_getTransactionReceipt', outcome: 'ok', retryable: false, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_getTransactionReceipt', chain_id: this.chainId, + }); + return receipt; + } + } catch (err) { + if (!isRetryableRpcError(err)) { + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_getTransactionReceipt', outcome: this._rpcOutcomeForError(err), + retryable: false, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_getTransactionReceipt', chain_id: this.chainId, + }); + throw err; + } + lastRetryable = err; + if (i < this.providers.length - 1) { + noteRpcFailover('receipt lookup', this.rpcUrls[i], err, this.rpcUrls[i + 1]); + } + } } - } - } - if (lastRetryable && !sawNonErrorResponse) { - noteRpcExhaustion('receipt lookup', this.rpcUrls); - throw new ChainRpcTransportError( - 'RPC_RECEIPT_LOOKUP_FAILED', - `Receipt lookup for tx ${txHash} failed on all configured RPC endpoints: ${errorMessage(lastRetryable)}`, - { cause: lastRetryable, txHash }, - ); - } - return null; + if (lastRetryable && !sawNonErrorResponse) { + // No backend could even answer the lookup → endpoints exhausted. 
+ metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_getTransactionReceipt', outcome: this._rpcOutcomeForError(lastRetryable), + retryable: true, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_getTransactionReceipt', chain_id: this.chainId, + }); + metrics.chainRpcFailoverTotal.add(1, { + rpc_method: 'eth_getTransactionReceipt', chain_id: this.chainId, reason: 'exhausted', + }); + noteRpcExhaustion('receipt lookup', this.rpcUrls); + throw new ChainRpcTransportError( + 'RPC_RECEIPT_LOOKUP_FAILED', + `Receipt lookup for tx ${txHash} failed on all configured RPC endpoints: ${errorMessage(lastRetryable)}`, + { cause: lastRetryable, txHash }, + ); + } + // At least one backend answered but the tx is not yet mined (null + // receipt). This is a benign poll tick, not a terminal outcome, so we + // intentionally do NOT emit an outcome metric here (the surrounding + // poll loop calls this repeatedly until mined/timeout). + span.setAttribute('dkg.receipt_pending', true); + return null; + }, + { attributes: { 'rpc.method': 'eth_getTransactionReceipt', 'dkg.chain_id': this.chainId } }, + ); } /** @@ -943,10 +1040,41 @@ export class EVMChainAdapterBase { isRetryable?: (err: unknown) => boolean; }, ): Promise { - return this.readWithFailover(label, (p) => fn(this.rebindContract(contract, p)), { - ...opts, - isRetryable: opts?.isRetryable ?? isContractViewRetryable, - }); + // Single telemetry choke-point for every CONTRACT VIEW read (eth_call): + // one `chain.eth_call` span + RPC metric spanning the whole failover + // sequence. Replaces the former per-call-site spans (token.allowance, + // Hub.getContractAddress, identityStorage.getIdentityId, …) so all view + // reads are instrumented uniformly now that they route through here. 
+ return withSpan( + 'chain.eth_call', + async (span) => { + const metrics = getMetrics(); + const startedAt = Date.now(); + try { + const out = await this.readWithFailover(label, (p) => fn(this.rebindContract(contract, p)), { + ...opts, + isRetryable: opts?.isRetryable ?? isContractViewRetryable, + }); + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_call', outcome: 'ok', retryable: false, chain_id: this.chainId, + }); + return out; + } catch (err) { + const outcome = this._rpcOutcomeForError(err); + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_call', outcome, retryable: isRetryableRpcError(err), chain_id: this.chainId, + }); + throw err; + } finally { + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_call', chain_id: this.chainId, + }); + } + }, + // `dkg.read` carries the per-call identity (e.g. 'token.allowance') on + // the span only — kept OFF the metric to hold its label set low-cardinality. + { attributes: { 'rpc.method': 'eth_call', 'dkg.chain_id': this.chainId, 'dkg.read': label } }, + ); } protected async waitForReceiptWithFailover( @@ -1213,6 +1341,11 @@ export class EVMChainAdapterBase { ? errorMessage(lastRetryable) : `${label} transaction preparation failed on all configured RPC endpoints ` + `(${this.rpcUrls.map(rpcHost).join(', ')}): ${errorMessage(lastRetryable)}`; + // Populate+sign exhausted every endpoint (mirrors the broadcast/receipt + // exhaustion counters in sendSignedTransactionAndWait). + getMetrics().chainRpcFailoverTotal.add(1, { + rpc_method: 'eth_sendRawTransaction', chain_id: this.chainId, reason: 'exhausted', + }); throw new ChainRpcTransportError('RPC_ENDPOINTS_EXHAUSTED', message, { cause: lastRetryable, rpcUrls: this.rpcUrls, @@ -1238,13 +1371,24 @@ export class EVMChainAdapterBase { // the limit by `gasLimitBufferBps` basis points so the drift can't OOG. 
opts?: { gasLimitBufferBps?: number }, ): Promise { - // Populate+sign with per-endpoint failover (shared with the V10 path), then - // broadcast+confirm the single signed tx. Split so `onBroadcast` (the WAL - // checkpoint) can sit between sign and broadcast for the V10 callers. - const { signedTx, txHash } = await this.populateAndSignAcrossProviders( - contract, method, args, signer, label, opts, + // Parent span for the whole send. Broadcast + receipt-wait open their own + // nested spans/metrics (chain.tx_submit / chain.tx_wait) inside + // sendSignedTransactionAndWait; populate+sign failover is counted inside + // populateAndSignAcrossProviders. + return withSpan( + 'chain.tx_send', + async (span) => { + // Populate+sign with per-endpoint failover (shared with the V10 path), + // then broadcast+confirm the single signed tx. Split so `onBroadcast` + // (the WAL checkpoint) can sit between sign and broadcast for V10 callers. + const { signedTx, txHash } = await this.populateAndSignAcrossProviders( + contract, method, args, signer, label, opts, + ); + span.setAttribute('dkg.tx_hash', txHash); + return this.sendSignedTransactionAndWait(signedTx, txHash, label); + }, + { attributes: { 'rpc.method': 'eth_sendRawTransaction', 'dkg.chain_id': this.chainId } }, ); - return this.sendSignedTransactionAndWait(signedTx, txHash, label); } /** @@ -2184,39 +2328,66 @@ export class EVMChainAdapterBase { label: string, preferred?: JsonRpcProvider, ): Promise<{ logs: ReadonlyArray; provider: JsonRpcProvider }> { - // Eligible backends (tip covers the page), with the sticky preferred one moved - // to the front when it still qualifies; the remainder keep their freshest-first - // order from `scanProviders`. - const eligible = scanProviders.filter(({ backendHead }) => backendHead >= hi); - const ordered = - preferred && eligible.some(({ provider }) => provider === preferred) - ? 
[ - ...eligible.filter(({ provider }) => provider === preferred), - ...eligible.filter(({ provider }) => provider !== preferred), - ] - : eligible; - let pageError: unknown; - for (const { provider } of ordered) { - let contract = connected.get(provider); - if (!contract) { - contract = baseContract.connect(provider) as Contract; - connected.set(provider, contract); - } - try { - const logs = await withTimeout( - contract.queryFilter(filter as any, lo, hi), - KA_HIGH_WATER_PAGE_TIMEOUT_MS, - `${label} getLogs [${lo}, ${hi}]`, + return withSpan( + 'chain.eth_getLogs', + async (span) => { + const metrics = getMetrics(); + const startedAt = Date.now(); + // Eligible backends (tip covers the page), with the sticky preferred one moved + // to the front when it still qualifies; the remainder keep their freshest-first + // order from `scanProviders`. + const eligible = scanProviders.filter(({ backendHead }) => backendHead >= hi); + const ordered = + preferred && eligible.some(({ provider }) => provider === preferred) + ? 
[ + ...eligible.filter(({ provider }) => provider === preferred), + ...eligible.filter(({ provider }) => provider !== preferred), + ] + : eligible; + let pageError: unknown; + for (const { provider } of ordered) { + let contract = connected.get(provider); + if (!contract) { + contract = baseContract.connect(provider) as Contract; + connected.set(provider, contract); + } + try { + const logs = await withTimeout( + contract.queryFilter(filter as any, lo, hi), + KA_HIGH_WATER_PAGE_TIMEOUT_MS, + `${label} getLogs [${lo}, ${hi}]`, + ); + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_getLogs', outcome: 'ok', retryable: false, chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_getLogs', chain_id: this.chainId, + }); + return { logs, provider }; + } catch (err) { + pageError = err; // hung or errored — fail over to the next eligible backend + } + } + // Every eligible backend failed for this page → one error/timeout outcome. + const outcome = pageError ? this._rpcOutcomeForError(pageError) : 'error'; + metrics.chainRpcTotal.add(1, { + rpc_method: 'eth_getLogs', outcome, retryable: isRetryableRpcError(pageError), chain_id: this.chainId, + }); + metrics.chainRpcDuration.record(Date.now() - startedAt, { + rpc_method: 'eth_getLogs', chain_id: this.chainId, + }); + throw new Error( + `${label}: no configured RPC could serve the log range [${lo}, ${hi}]` + + `${pageError ? `: ${errorMessage(pageError)}` : ''}.`, + pageError ? { cause: pageError } : undefined, ); - return { logs, provider }; - } catch (err) { - pageError = err; // hung or errored — fail over to the next eligible backend - } - } - throw new Error( - `${label}: no configured RPC could serve the log range [${lo}, ${hi}]` + - `${pageError ? `: ${errorMessage(pageError)}` : ''}.`, - pageError ? 
{ cause: pageError } : undefined, + }, + { + attributes: { + 'rpc.method': 'eth_getLogs', 'dkg.chain_id': this.chainId, + 'dkg.block_lo': lo, 'dkg.block_hi': hi, + }, + }, ); } diff --git a/packages/chain/test/chain-rpc-telemetry.unit.test.ts b/packages/chain/test/chain-rpc-telemetry.unit.test.ts new file mode 100644 index 000000000..1631783cf --- /dev/null +++ b/packages/chain/test/chain-rpc-telemetry.unit.test.ts @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * R3 (PR #1317 review): the metric instrumentation must be proven through the + * REAL instrumented code path, not by hand-emitting `getMetrics().xxx.add(...)` + * with canned attributes. This drives the actual `contractReadWithFailover` + * choke-point (the single seam every contract VIEW read funnels through after + * the #1335 failover refactor) under an in-memory OTel meter and asserts the + * emitted `dkg.chain.rpc.total` carries ONLY the bounded low-cardinality label + * set. It is the direct regression gate for the reviewer's example: "if + * `chainRpcTotal.add(...)` were changed to include `{ rpc_url: this.rpcUrls[i] }` + * this test would fail" — whereas the prior hand-written-sample test would not. 
+ */ +import { describe, it, expect, afterEach } from 'vitest'; +import { metrics } from '@opentelemetry/api'; +import { + MeterProvider, + PeriodicExportingMetricReader, + InMemoryMetricExporter, + AggregationTemporality, +} from '@opentelemetry/sdk-metrics'; +import { rebuildMetrics } from '@origintrail-official/dkg-core'; +import { EVMChainAdapter, type EVMAdapterConfig } from '../src/evm-adapter.js'; + +const DEPLOYER_PK = '0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80'; +const HUB = '0x0000000000000000000000000000000000000001'; + +function minimalConfig(overrides: Partial = {}): EVMAdapterConfig { + return { + rpcUrl: 'http://127.0.0.1:1', + privateKey: DEPLOYER_PK, + hubAddress: HUB, + chainId: 'evm:31337', + allowNoAdminSigner: true, + ...overrides, + }; +} + +// A contract stub whose `.connect(runner)` returns itself, so `rebindContract` +// is a no-op and the read `fn` runs deterministically without a live provider. +function fakeContract(): any { + const c: any = { connect: () => c }; + return c; +} + +const ALLOWED_RPC_LABELS = new Set(['rpc_method', 'outcome', 'retryable', 'chain_id']); +const FORBIDDEN_LABELS = ['rpc_url', 'peer_id', 'tx_hash', 'operation_id', 'assertion_id', 'kaId']; + +describe('chain RPC telemetry — real contractReadWithFailover emits bounded labels', () => { + let mp: MeterProvider | null = null; + let exporter: InMemoryMetricExporter; + + function installMeter(): void { + exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); + mp = new MeterProvider({ + readers: [new PeriodicExportingMetricReader({ exporter, exportIntervalMillis: 60_000 })], + }); + metrics.setGlobalMeterProvider(mp); + rebuildMetrics(); // rebind the core facade's instrument cache to the real meter + } + + function chainRpcDataPoints(): Array<{ attrs: Record }> { + const out: Array<{ attrs: Record }> = []; + for (const rm of exporter.getMetrics()) + for (const sm of rm.scopeMetrics) + for (const metric of sm.metrics) + if 
(metric.descriptor.name === 'dkg.chain.rpc.total') + for (const dp of metric.dataPoints) out.push({ attrs: dp.attributes as Record }); + return out; + } + + afterEach(async () => { + if (mp) { + await mp.forceFlush().catch(() => {}); + await mp.shutdown().catch(() => {}); + mp = null; + } + metrics.disable(); + rebuildMetrics(); // back to the no-op meter for the next test/file + }); + + it('SUCCESS path: records outcome=ok with exactly {rpc_method,outcome,retryable,chain_id}', async () => { + installMeter(); + const a: any = new EVMChainAdapter(minimalConfig()); + a.providers = [{}]; // one dummy provider — `fn` ignores it + a.rpcUrls = ['http://loopback']; + + const out = await a.contractReadWithFailover('token.allowance', fakeContract(), () => 42n); + expect(out).toBe(42n); + + await mp!.forceFlush(); + const pts = chainRpcDataPoints(); + expect(pts.length).toBeGreaterThanOrEqual(1); + + const keys = new Set(pts.flatMap((p) => Object.keys(p.attrs))); + // Exactly the allow-listed labels — nothing high-cardinality leaked in. + expect([...keys].filter((k) => !ALLOWED_RPC_LABELS.has(k))).toEqual([]); + for (const bad of FORBIDDEN_LABELS) expect(keys.has(bad)).toBe(false); + // Bounded VALUES on the success branch. 
+ expect( + pts.some((p) => p.attrs.rpc_method === 'eth_call' && p.attrs.outcome === 'ok' && p.attrs.chain_id === 'evm:31337'), + ).toBe(true); + + a.destroy?.(); + }); + + it('ERROR path: records a non-ok outcome with the same bounded label set', async () => { + installMeter(); + const a: any = new EVMChainAdapter(minimalConfig()); + a.providers = [{}]; + a.rpcUrls = ['http://loopback']; + + await expect( + a.contractReadWithFailover('token.allowance', fakeContract(), () => { + throw new Error('boom'); + }), + ).rejects.toBeTruthy(); + + await mp!.forceFlush(); + const pts = chainRpcDataPoints(); + expect(pts.length).toBeGreaterThanOrEqual(1); + expect(pts.every((p) => p.attrs.outcome !== 'ok')).toBe(true); + const keys = new Set(pts.flatMap((p) => Object.keys(p.attrs))); + expect([...keys].filter((k) => !ALLOWED_RPC_LABELS.has(k))).toEqual([]); + for (const bad of FORBIDDEN_LABELS) expect(keys.has(bad)).toBe(false); + + a.destroy?.(); + }); +}); diff --git a/packages/cli/src/config.ts b/packages/cli/src/config.ts index ec608eddd..e49e57c15 100644 --- a/packages/cli/src/config.ts +++ b/packages/cli/src/config.ts @@ -538,8 +538,65 @@ export interface DkgConfig { * on first start and stored in `/auth.token`. */ auth?: { enabled?: boolean; tokens?: string[] }; - /** Opt-in telemetry streaming to central network dashboard. */ - telemetry?: { enabled?: boolean }; + /** + * Opt-in telemetry streaming to a central network dashboard. + * `enabled` is the master gate: when false, NOTHING is forwarded off the + * node (local logging — SQLite + daemon.log — is always on regardless). + */ + telemetry?: { + enabled?: boolean; + /** + * Remote log forwarding (opt-in). Active only when `enabled` is true. + */ + logs?: { + /** + * Outbound transport for logs. 'none' = local only; 'otlp' = OTLP/HTTP + * to an OpenTelemetry collector; 'syslog' = legacy RFC 5424 → Graylog. + * Defaults to 'syslog' when unset (preserves prior behaviour). 
+ */ + exporter?: 'none' | 'otlp' | 'syslog'; + /** + * OTLP/HTTP logs endpoint, e.g. http://localhost:4318/v1/logs. Falls + * back to the per-network default (TELEMETRY_ENDPOINTS[network].otlpLogs). + */ + endpoint?: string; + /** Bearer credential for the operator's collector. Treated as a secret. */ + token?: string; + /** Minimum level forwarded remotely. Local sink keeps everything. Default 'info'. */ + level?: 'debug' | 'info' | 'warn' | 'error'; + /** Extra sensitive key names to redact from messages before they leave the node. */ + redact?: string[]; + /** Bounded in-memory buffer; drop-oldest on overflow. Default 500. */ + bufferMaxEntries?: number; + }; + /** + * OTel trace export (opt-in, independent of logs). Registers the tracer + * ONLY when an endpoint resolves (config or OTEL_EXPORTER_OTLP_* env); + * never falls back to a guessed prod URL. + */ + traces?: { + enabled?: boolean; + /** OTLP traces endpoint, e.g. http://localhost:4318/v1/traces. */ + endpoint?: string; + /** Bearer credential. Treated as a secret. */ + token?: string; + /** Parent-based ratio sampler 0..1. Default 1.0. */ + sampleRatio?: number; + }; + /** + * OTel metric export (opt-in, independent of logs). Registers the meter + * ONLY when an endpoint resolves (config or OTEL_EXPORTER_OTLP_* env). + */ + metrics?: { + enabled?: boolean; + /** OTLP metrics endpoint, e.g. http://localhost:4318/v1/metrics. */ + endpoint?: string; + /** Bearer credential. Treated as a secret. */ + token?: string; + /** PeriodicExportingMetricReader interval. Default 30000ms. */ + exportIntervalMs?: number; + }; + }; /** Shared memory (workspace) data TTL in milliseconds. Default: 30 days (2592000000). Set to 0 to disable cleanup. */ sharedMemoryTtlMs?: number; /** @deprecated Legacy alias for sharedMemoryTtlMs */ @@ -736,14 +793,19 @@ export interface DkgConfig { * Nodes resolve the correct endpoints from the network they're on. * Operators only see a single toggle — no endpoint configuration. 
*/ -export const TELEMETRY_ENDPOINTS: Record = { +export const TELEMETRY_ENDPOINTS: Record< + string, + { syslog: { host: string; port: number }; otlp: string; otlpLogs?: string } +> = { testnet: { syslog: { host: 'loggly.origin-trail.network', port: 12201 }, otlp: 'https://telemetry-testnet.origintrail.io/v1/metrics', + otlpLogs: 'https://telemetry-testnet.origintrail.io/v1/logs', // OriginTrail-hosted opt-in collector (TBD) }, mainnet: { - syslog: { host: 'loggly.origin-trail.network', port: 0 }, // TODO: assign mainnet syslog port + syslog: { host: 'loggly.origin-trail.network', port: 0 }, // legacy syslog — OTLP is the mainnet path otlp: 'https://telemetry.origintrail.io/v1/metrics', + otlpLogs: 'https://telemetry.origintrail.io/v1/logs', // OriginTrail-hosted opt-in collector (TBD) }, }; diff --git a/packages/cli/src/daemon/lifecycle.ts b/packages/cli/src/daemon/lifecycle.ts index 8d7404baa..97137cec2 100644 --- a/packages/cli/src/daemon/lifecycle.ts +++ b/packages/cli/src/daemon/lifecycle.ts @@ -69,7 +69,7 @@ import { } from '@origintrail-official/dkg-chain'; import { DKGAgent, loadOpWallets, KaNumberAllocator } from '@origintrail-official/dkg-agent'; import { isExternalBackend } from '@origintrail-official/dkg-storage'; -import { computeNetworkId, createOperationContext, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, sparqlIri, contextGraphSharedMemoryUri, contextGraphAssertionUri, contextGraphMetaUri, DEFAULT_PROTOCOL_OUTBOX_BACKOFFS_MS, DEFAULT_PROTOCOL_OUTBOX_MAX_AGE_MS, pickNetworkTunables } from '@origintrail-official/dkg-core'; +import { computeNetworkId, createOperationContext, createLogRedactor, DKGEvent, Logger, PayloadTooLargeError, GET_VIEWS, TrustLevel, validateSubGraphName, validateAssertionName, validateContextGraphId, isSafeIri, assertSafeIri, sparqlIri, contextGraphSharedMemoryUri, contextGraphAssertionUri, contextGraphMetaUri, 
DEFAULT_PROTOCOL_OUTBOX_BACKOFFS_MS, DEFAULT_PROTOCOL_OUTBOX_MAX_AGE_MS, pickNetworkTunables } from '@origintrail-official/dkg-core'; import { findReservedSubjectPrefix, isSkolemizedUri } from '@origintrail-official/dkg-publisher'; import { DashboardDB, @@ -78,6 +78,9 @@ import { handleNodeUIRequest, ChatMemoryManager, LogPushWorker, + OtlpLogWorker, + initTelemetry, + shutdownTelemetry, LlmClient, SqliteMessageIdempotencyStore, SqliteProtocolOutboxStore, @@ -127,6 +130,7 @@ import { exitOnStoreConfigErrors, validateNetworkConfigReadiness, } from '../config.js'; +import { resolveOtelSignals, resolveLogExporterMode } from '../telemetry-config.js'; import { createPublicSnapshotStore, createPublisherControlFromStore, startPublisherRuntimeIfEnabled, type PublisherRuntime } from '../publisher-runner.js'; import { createCatchupRunner, type CatchupJobResult, type CatchupRunner } from '../catchup-runner.js'; import { loadTokens, httpAuthGuard } from '../auth.js'; @@ -1985,6 +1989,11 @@ export async function runDaemonInner( chatDb = dashDb; log("Dashboard DB initialized at " + join(dkgDir(), "node-ui.db")); + // Redactor for the copy of each record that LEAVES the node. The local + // dashboard DB keeps full-fidelity records (it's the operator's own machine); + // redaction only protects data crossing the trust boundary to a collector. + const redactForRemote = createLogRedactor(config.telemetry?.logs?.redact); + Logger.setSink((entry) => { try { dashDb.insertLog({ @@ -1998,7 +2007,12 @@ export async function runDaemonInner( } catch { /* DB write must never break the node */ } - logPusher?.push(entry); + // Fan out a single redacted copy to every active remote shipper. 
+ if (logPusher || otlpExporter) { + const safe = redactForRemote(entry); + logPusher?.push(safe); + otlpExporter?.push(safe); + } }); // Extract the plain value from an RDF typed literal like "6"^^ @@ -2168,6 +2182,7 @@ export async function runDaemonInner( : "mainnet"; const syslogEndpoint = TELEMETRY_ENDPOINTS[networkKey]?.syslog; let logPusher: LogPushWorker | null = null; + let otlpExporter: OtlpLogWorker | null = null; function startLogPusher(): { ok: boolean; error?: string } { if (logPusher) return { ok: true }; @@ -2209,11 +2224,124 @@ export async function runDaemonInner( log("Telemetry: log streaming disabled"); } + function startOtlpExporter(): { ok: boolean; error?: string } { + if (otlpExporter) return { ok: true }; + const endpoint = + config.telemetry?.logs?.endpoint || TELEMETRY_ENDPOINTS[networkKey]?.otlpLogs; + if (!endpoint) { + return { + ok: false, + error: `OTLP log export is not configured for ${networkKey} (set config.telemetry.logs.endpoint)`, + }; + } + const minLevel = config.telemetry?.logs?.level ?? "info"; + otlpExporter = new OtlpLogWorker({ + endpoint, + token: config.telemetry?.logs?.token, + network: networkKey, + peerId: agent.peerId, + nodeName: config.name, + version: nodeVersion, + commit: nodeCommit, + role: config.nodeRole ?? "edge", + chainId: config.chain?.chainId, + minLevel, + bufferMaxEntries: config.telemetry?.logs?.bufferMaxEntries, + onError: (m) => log(`Telemetry(OTLP): ${m}`), + }); + otlpExporter.start(); + log(`Telemetry: OTLP log export enabled → ${endpoint} (level ≥ ${minLevel})`); + return { ok: true }; + } + + function stopOtlpExporter(): void { + if (!otlpExporter) return; + otlpExporter.stop(); + otlpExporter = null; + log("Telemetry: OTLP log export disabled"); + } + + // OTel traces + metrics SDK (independent of the log-exporter path below). 
+ // Endpoints resolve env-first (standard OTEL_EXPORTER_OTLP_* names) then + // config; a signal registers ONLY when its endpoint resolves — never a + // guessed prod default. Idempotent (initTelemetry no-ops once configured), so + // it is safe to call both at boot AND from the runtime enable toggle. + function startOtelSdk(): void { + const { tracesEndpoint, metricsEndpoint, tracesOn, metricsOn } = resolveOtelSignals( + config.telemetry, + ); + if (!tracesOn && !metricsOn) return; + try { + initTelemetry({ + enabled: true, + resource: { + serviceName: "dkg-node", + serviceVersion: nodeVersion, + serviceInstanceId: config.name, + network: networkKey, + peerId: agent.peerId, + nodeName: config.name, + nodeRole: config.nodeRole ?? "edge", + commit: nodeCommit, + chainId: config.chain?.chainId, + }, + traces: tracesOn + ? { + endpoint: tracesEndpoint, + token: config.telemetry?.traces?.token, + sampleRatio: config.telemetry?.traces?.sampleRatio, + } + : undefined, + metrics: metricsOn + ? { + endpoint: metricsEndpoint, + token: config.telemetry?.metrics?.token, + exportIntervalMs: config.telemetry?.metrics?.exportIntervalMs, + } + : undefined, + }); + log( + `Telemetry: OTel SDK registered (traces=${tracesOn ? tracesEndpoint : "off"}, metrics=${metricsOn ? metricsEndpoint : "off"})`, + ); + } catch (err) { + // Telemetry must never block startup. + log(`Telemetry: OTel init failed (non-fatal): ${String(err)}`); + } + } + + // Start ALL telemetry signals under the master gate: OTel traces/metrics + // (SDK) AND the configured log exporter. Used at boot and from the runtime + // enable toggle so both behave identically (the bug fixed here: enabling from + // a boot-disabled state previously started only logs, never traces/metrics). + // The log-exporter result is returned separately — a failed log shipper must + // NOT tear down or disable the independent traces/metrics signals. 
+ function startTelemetry(): { ok: boolean; error?: string } { + startOtelSdk(); + // Dispatch to the configured log exporter. 'syslog' is the default when + // unset (preserves prior behaviour); 'otlp' is the recommended path; 'none' + // keeps logs local-only even while telemetry is enabled. + const mode = resolveLogExporterMode(config.telemetry); + if (mode === "none") return { ok: true }; + if (mode === "otlp") return startOtlpExporter(); + return startLogPusher(); + } + + // Stop ALL telemetry signals: log exporters AND the OTel SDK (flush + shut + // down + clear the API globals). Async so a runtime disable actually stops + // traces/metrics — not just logs — and callers can await an orderly teardown. + async function stopTelemetry(): Promise { + stopLogPusher(); + stopOtlpExporter(); + await shutdownTelemetry().catch(() => {}); + } + if (config.telemetry?.enabled) { - const r = startLogPusher(); + const r = startTelemetry(); if (!r.ok) { - log(`Telemetry: ${r.error}`); - config.telemetry.enabled = false; + // Log forwarding failed to start (e.g. mainnet syslog port 0). Disable + // ONLY the log signal — leave the master gate and any registered + // traces/metrics intact; they are independent signals. + log(`Telemetry: log exporter not started — ${r.error} (traces/metrics unaffected)`); } } @@ -2638,10 +2766,10 @@ export async function runDaemonInner( enabled: boolean, ): Promise<{ ok: boolean; error?: string }> => { if (enabled) { - const r = startLogPusher(); + const r = startTelemetry(); if (!r.ok) return r; } else { - stopLogPusher(); + await stopTelemetry(); } config.telemetry = { ...config.telemetry, enabled }; await saveConfig(config); @@ -3113,6 +3241,8 @@ export async function runDaemonInner( clearInterval(pruneTimer); rateLimiter.destroy(); metricsCollector.stop(); + // Stops log exporters AND flushes + shuts down the OTel SDK. 
+ await stopTelemetry(); natStatusWatcherStop?.(); resetNatStatus(); await publisherRuntime diff --git a/packages/cli/src/telemetry-config.ts b/packages/cli/src/telemetry-config.ts new file mode 100644 index 000000000..7c0ed75f0 --- /dev/null +++ b/packages/cli/src/telemetry-config.ts @@ -0,0 +1,51 @@ +import type { DkgConfig } from './config.js'; + +/** + * Pure resolution of the daemon's telemetry routing — extracted from + * `lifecycle.ts` so it is unit-testable (endpoint precedence, no-TBD-default, + * per-signal gating, log-exporter selection). + */ + +export type LogExporterMode = 'none' | 'otlp' | 'syslog'; + +/** + * Which log exporter the daemon should start. Only consulted when + * `telemetry.enabled`. Defaults to 'syslog' (preserves prior behaviour). + */ +export function resolveLogExporterMode(telemetry: DkgConfig['telemetry']): LogExporterMode { + return telemetry?.logs?.exporter ?? 'syslog'; +} + +export interface ResolvedOtelSignals { + tracesEndpoint?: string; + metricsEndpoint?: string; + /** Register traces only when an endpoint resolves AND not explicitly disabled. */ + tracesOn: boolean; + metricsOn: boolean; +} + +/** + * Resolve OTLP traces/metrics endpoints env-first (standard `OTEL_EXPORTER_OTLP_*` + * names) then config. A signal is "on" only when an endpoint resolves and it is + * not explicitly disabled — there is NO guessed/TBD production default. + */ +export function resolveOtelSignals( + telemetry: DkgConfig['telemetry'], + env: Record = process.env, +): ResolvedOtelSignals { + const base = env.OTEL_EXPORTER_OTLP_ENDPOINT?.replace(/\/$/, ''); + const tracesEndpoint = + env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT || + (base ? `${base}/v1/traces` : undefined) || + telemetry?.traces?.endpoint; + const metricsEndpoint = + env.OTEL_EXPORTER_OTLP_METRICS_ENDPOINT || + (base ? 
`${base}/v1/metrics` : undefined) || + telemetry?.metrics?.endpoint; + return { + tracesEndpoint, + metricsEndpoint, + tracesOn: !!tracesEndpoint && telemetry?.traces?.enabled !== false, + metricsOn: !!metricsEndpoint && telemetry?.metrics?.enabled !== false, + }; +} diff --git a/packages/cli/test/telemetry-config.test.ts b/packages/cli/test/telemetry-config.test.ts new file mode 100644 index 000000000..b49d995e0 --- /dev/null +++ b/packages/cli/test/telemetry-config.test.ts @@ -0,0 +1,69 @@ +import { describe, it, expect } from 'vitest'; +import { resolveOtelSignals, resolveLogExporterMode } from '../src/telemetry-config.js'; + +/** + * Daemon telemetry-routing resolution (the logic lifecycle.ts uses to pick the + * log exporter and to register OTLP traces/metrics). Verifies env precedence, + * the no-TBD-prod-default rule, and per-signal gating. + */ + +describe('resolveLogExporterMode', () => { + it('defaults to syslog when unset (preserves prior behaviour)', () => { + expect(resolveLogExporterMode(undefined)).toBe('syslog'); + expect(resolveLogExporterMode({ enabled: true })).toBe('syslog'); + }); + it('honors an explicit exporter', () => { + expect(resolveLogExporterMode({ enabled: true, logs: { exporter: 'otlp' } })).toBe('otlp'); + expect(resolveLogExporterMode({ enabled: true, logs: { exporter: 'none' } })).toBe('none'); + expect(resolveLogExporterMode({ enabled: true, logs: { exporter: 'syslog' } })).toBe('syslog'); + }); +}); + +describe('resolveOtelSignals', () => { + it('is OFF when no endpoint resolves — never a guessed prod default', () => { + const r = resolveOtelSignals({ enabled: true }, {}); + expect(r.tracesOn).toBe(false); + expect(r.metricsOn).toBe(false); + expect(r.tracesEndpoint).toBeUndefined(); + expect(r.metricsEndpoint).toBeUndefined(); + }); + + it('uses per-signal config endpoints', () => { + const r = resolveOtelSignals( + { enabled: true, traces: { endpoint: 'http://c/v1/traces' }, metrics: { endpoint: 'http://c/v1/metrics' } }, + 
{}, + ); + expect(r.tracesOn).toBe(true); + expect(r.tracesEndpoint).toBe('http://c/v1/traces'); + expect(r.metricsOn).toBe(true); + expect(r.metricsEndpoint).toBe('http://c/v1/metrics'); + }); + + it('derives per-signal paths from OTEL_EXPORTER_OTLP_ENDPOINT (base, trailing slash trimmed)', () => { + const r = resolveOtelSignals({ enabled: true }, { OTEL_EXPORTER_OTLP_ENDPOINT: 'http://base:4318/' }); + expect(r.tracesEndpoint).toBe('http://base:4318/v1/traces'); + expect(r.metricsEndpoint).toBe('http://base:4318/v1/metrics'); + expect(r.tracesOn).toBe(true); + expect(r.metricsOn).toBe(true); + }); + + it('precedence: signal-specific env > base env > config', () => { + const r = resolveOtelSignals( + { enabled: true, metrics: { endpoint: 'http://cfg/v1/metrics' } }, + { + OTEL_EXPORTER_OTLP_ENDPOINT: 'http://base:4318', + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: 'http://specific/v1/metrics', + }, + ); + expect(r.metricsEndpoint).toBe('http://specific/v1/metrics'); + }); + + it('a signal explicitly disabled stays OFF even with an endpoint', () => { + const r = resolveOtelSignals( + { enabled: true, traces: { endpoint: 'http://c/v1/traces', enabled: false } }, + {}, + ); + expect(r.tracesEndpoint).toBe('http://c/v1/traces'); + expect(r.tracesOn).toBe(false); + }); +}); diff --git a/packages/core/package.json b/packages/core/package.json index a4725f58c..2e6ee39cd 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -30,6 +30,7 @@ "@multiformats/multiaddr": "^13.0.3", "@noble/ed25519": "^3.1.0", "@noble/hashes": "^2.2.0", + "@opentelemetry/api": "^1.9.1", "js-yaml": "^4.1.1", "libp2p": "^3.3.1", "protobufjs": "^8.3.0", diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 03cbb02b5..374bbaba6 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -8,7 +8,12 @@ export * from './publisher-extension.js'; export * from './imported-artifact-bytes.js'; export * from './imported-artifact-metadata.js'; export * 
from './event-bus.js'; -export { Logger, createOperationContext, type OperationContext, type OperationName, type LogSink } from './logger.js'; +export { Logger, createOperationContext, type OperationContext, type OperationName, type LogSink, type LogRecord } from './logger.js'; +export { createLogRedactor, redactLogEntry, redactMessage, DEFAULT_SENSITIVE_KEYS, REDACTED } from './log-redaction.js'; +export { + getTracer, withSpan, linkedSpan, currentTraceIds, activeSpanContext, + getMetrics, rebuildMetrics, type WithSpanOpts, type DkgMetrics, +} from './telemetry-api.js'; export * from './crypto/index.js'; export * from './proto/index.js'; export { diff --git a/packages/core/src/log-redaction.ts b/packages/core/src/log-redaction.ts new file mode 100644 index 000000000..cc83024f3 --- /dev/null +++ b/packages/core/src/log-redaction.ts @@ -0,0 +1,129 @@ +/** + * At-source redaction of secrets from log records before they leave the node. + * + * Why this exists: V10 nodes are run by independent operators, and once a + * secret (a wallet private key, a mnemonic, an API token) is shipped to a + * remote collector it is irreversibly leaked. Redaction therefore runs on the + * node, on the copy of every log record that is about to be FORWARDED. The + * local dashboard DB keeps full-fidelity records for the operator's own + * debugging — redaction only protects data that crosses the trust boundary. + * + * Design choices (deliberately conservative to avoid mangling useful logs): + * - Structured "key: value" / key=value / "key":"value" shapes are redacted + * by KEY NAME (high precision). This is how DKG actually logs secrets + * (e.g. operationalWalletPrivateKey, mnemonic). + * - JWTs are redacted by shape (eyJ….….…) — effectively zero false positives. + * - We deliberately do NOT blanket-redact 0x-prefixed 64-hex strings: in DKG + * those are overwhelmingly Merkle roots, KC roots and tx hashes (public, + * non-secret) and nuking them would destroy debuggability. 
A bare private + * key with no key-name context is a residual gap best closed with a + * collector-side OTTL/regex backstop (see the PoC stack). + */ + +import type { LogRecord } from './logger.js'; + +/** + * Default sensitive key names whose values are scrubbed from log messages + * before forwarding. Matched case-insensitively. + */ +export const DEFAULT_SENSITIVE_KEYS: readonly string[] = [ + 'privateKey', + 'private_key', + 'privKey', + 'operationalWalletPrivateKey', + 'managementWalletPrivateKey', + 'mnemonic', + 'seedPhrase', + 'seed_phrase', + 'seed', + 'secret', + 'secretKey', + 'clientSecret', + 'password', + 'passphrase', + 'passwd', + 'pwd', + 'apiKey', + 'api_key', + 'apiToken', + 'accessToken', + 'access_token', + 'refreshToken', + 'refresh_token', + 'token', + 'authorization', + 'bearer', + 'sessionKey', + 'encryptionKey', +]; + +export const REDACTED = '[REDACTED]'; + +function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +/** + * Redact the free-form `message` of a log record. The two patterns are: + * 1. JWT-shaped tokens (header.payload.signature, base64url) — by shape. + * 2. `` — the value is replaced, the key kept. + * Quoted values (single/double/backtick) are redacted whole (so a quoted + * mnemonic with spaces is fully removed); bare values up to the next + * delimiter otherwise. + */ +export function redactMessage(message: string, keyRegex: RegExp, jwtRegex: RegExp): string { + if (!message) return message; + // Reset lastIndex defensively (these are global regexes reused across calls). 
+ jwtRegex.lastIndex = 0; + keyRegex.lastIndex = 0; + let out = message.replace(jwtRegex, REDACTED); + out = out.replace(keyRegex, (_full, keyAndSep: string) => `${keyAndSep}${REDACTED}`); + return out; +} + +function buildKeyRegex(keys: readonly string[]): RegExp { + const alt = keys.map(escapeRegExp).join('|'); + // group 1 = key (optionally quoted) + separator (kept verbatim) + // group 2 = value (redacted): a quoted run, or a bare token up to a delimiter + return new RegExp( + '(' + + '["\'`]?\\b(?:' + alt + ')\\b["\'`]?' + // key, optionally quoted + '\\s*[:=]\\s*' + // : or = + ')' + + '(' + + '"[^"]*"' + '|' + + "'[^']*'" + '|' + + '`[^`]*`' + '|' + + // auth-scheme + credential as ONE value, so `authorization: Bearer ` + // redacts the token too (not just the scheme word). + '(?:Bearer|Basic|Bot|Token|Digest|ApiKey)\\s+[^\\s,;}\\]\\)]+' + '|' + + '[^\\s,;}\\]\\)]+' + // bare token + ')', + 'gi', + ); +} + +// JWT: three base64url segments separated by dots, starting with the +// canonical `eyJ` ('{"' base64url-encoded). Conservative min lengths. +const JWT_SOURCE = '\\beyJ[A-Za-z0-9_-]{6,}\\.[A-Za-z0-9_-]{6,}\\.[A-Za-z0-9_-]{6,}\\b'; + +/** + * Compile a redactor once, then reuse it on the hot path (one per shipper). + * `extraKeys` are operator-configured additional sensitive key names. + */ +export function createLogRedactor(extraKeys: readonly string[] = []): (record: LogRecord) => LogRecord { + const keys = extraKeys.length ? [...DEFAULT_SENSITIVE_KEYS, ...extraKeys] : DEFAULT_SENSITIVE_KEYS; + const keyRegex = buildKeyRegex(keys); + const jwtRegex = new RegExp(JWT_SOURCE, 'g'); + return (record: LogRecord): LogRecord => { + if (!record || !record.message) return record; + const redacted = redactMessage(record.message, keyRegex, jwtRegex); + if (redacted === record.message) return record; // no change → no alloc + return { ...record, message: redacted }; + }; +} + +/** One-shot convenience (recompiles each call — do not use on the hot path). 
*/ +export function redactLogEntry(record: LogRecord, extraKeys: readonly string[] = []): LogRecord { + return createLogRedactor(extraKeys)(record); +} diff --git a/packages/core/src/logger.ts b/packages/core/src/logger.ts index d4ab14a71..f20b1d5c5 100644 --- a/packages/core/src/logger.ts +++ b/packages/core/src/logger.ts @@ -1,4 +1,5 @@ import { randomUUID } from 'node:crypto'; +import { currentTraceIds } from './telemetry-api.js'; export type OperationName = 'publish' | 'update' | 'query' | 'resolve' | 'connect' | 'sync' | 'system' | 'share' | 'publishFromSWM' | 'gossip' | 'ka-update' | 'reconstruct' | 'init' | 'verify' | 'migrate-swm-attr'; @@ -9,14 +10,25 @@ export interface OperationContext { sourceOperationId?: string; } -export type LogSink = (entry: { +/** + * The canonical structured log record emitted on every Logger call. This is + * the single shape that flows to the local dashboard DB and to any remote + * shipper (syslog, OTLP). Keep it stable — redaction and the OTLP exporter + * both consume it. + */ +export interface LogRecord { level: string; operationName: string; operationId: string; sourceOperationId?: string; module: string; message: string; -}) => void; + /** Hex W3C trace/span id of the active span when logged (when a span is recording), for trace↔log correlation. */ + traceId?: string; + spanId?: string; +} + +export type LogSink = (entry: LogRecord) => void; /** * Structured logger that prefixes every message with a timestamp, @@ -36,23 +48,41 @@ export class Logger { this.prefix = moduleName; } + /** + * Build the structured record and hand it to the sink. Attaches the active + * span's trace/span id when one is recording (no-op/empty otherwise), so logs + * emitted inside an instrumented boundary correlate to its trace. 
+ */ + private emit(level: string, ctx: OperationContext, message: string): void { + if (!Logger.sink) return; + Logger.sink({ + level, + operationName: ctx.operationName, + operationId: ctx.operationId, + sourceOperationId: ctx.sourceOperationId, + module: this.moduleName, + message, + ...currentTraceIds(), + }); + } + debug(ctx: OperationContext, message: string): void { - Logger.sink?.({ level: 'debug', operationName: ctx.operationName, operationId: ctx.operationId, sourceOperationId: ctx.sourceOperationId, module: this.moduleName, message }); + this.emit('debug', ctx, message); } info(ctx: OperationContext, message: string): void { process.stdout.write(`${this.format(ctx, message)}\n`); - Logger.sink?.({ level: 'info', operationName: ctx.operationName, operationId: ctx.operationId, sourceOperationId: ctx.sourceOperationId, module: this.moduleName, message }); + this.emit('info', ctx, message); } warn(ctx: OperationContext, message: string): void { process.stderr.write(`${this.format(ctx, message)} [WARN]\n`); - Logger.sink?.({ level: 'warn', operationName: ctx.operationName, operationId: ctx.operationId, sourceOperationId: ctx.sourceOperationId, module: this.moduleName, message }); + this.emit('warn', ctx, message); } error(ctx: OperationContext, message: string): void { process.stderr.write(`${this.format(ctx, message)} [ERROR]\n`); - Logger.sink?.({ level: 'error', operationName: ctx.operationName, operationId: ctx.operationId, sourceOperationId: ctx.sourceOperationId, module: this.moduleName, message }); + this.emit('error', ctx, message); } private format(ctx: OperationContext, message: string): string { diff --git a/packages/core/src/protocol-router.ts b/packages/core/src/protocol-router.ts index 72bff04a3..f97978feb 100644 --- a/packages/core/src/protocol-router.ts +++ b/packages/core/src/protocol-router.ts @@ -7,6 +7,7 @@ import { POOLED_MESSAGE_PROTOCOL, type MessageStreamPoolOptions, } from './message-stream-pool.js'; +import { withSpan, getMetrics } 
from './telemetry-api.js'; type AbortableByteStream = Stream | (AsyncIterable & { abort(reason?: unknown): void }); @@ -487,11 +488,38 @@ export class ProtocolRouter { } } + // Outbound P2P send — wrapped with a `protocol_router.send` span + the P2P + // send-duration metric. No-op when telemetry is disabled. The heavy + // pooled/one-shot logic lives in sendInner; this wrapper only measures: it counts the send as ok/error (rethrowing the error) and always records the wall-time. The span's `dkg.peer` attribute is the TRAILING 8 chars of the peer id, because base58 libp2p peer ids share a constant leading multihash prefix (e.g. `12D3KooW`) that would not tell peers apart. async send( peerIdStr: string, protocolId: string, data: Uint8Array, timeoutMsOrOpts: number | SendOptions = DEFAULT_SEND_TIMEOUT_MS, + ): Promise { + const startedAt = Date.now(); + const m = getMetrics(); + try { + const res = await withSpan( + 'protocol_router.send', + () => this.sendInner(peerIdStr, protocolId, data, timeoutMsOrOpts), + { attributes: { 'dkg.protocol_id': protocolId, 'dkg.peer': peerIdStr.slice(-8) } }, + ); + m.protocolSendTotal.add(1, { protocol_id: protocolId, outcome: 'ok' }); + return res; + } catch (err) { + m.protocolSendTotal.add(1, { protocol_id: protocolId, outcome: 'error' }); + throw err; + } finally { + m.protocolSendDuration.record(Date.now() - startedAt, { protocol_id: protocolId }); + } + } + + private async sendInner( + peerIdStr: string, + protocolId: string, + data: Uint8Array, + timeoutMsOrOpts: number | SendOptions = DEFAULT_SEND_TIMEOUT_MS, ): Promise { const opts: SendOptions = typeof timeoutMsOrOpts === 'number' ? { timeoutMs: timeoutMsOrOpts } : timeoutMsOrOpts; diff --git a/packages/core/src/telemetry-api.ts b/packages/core/src/telemetry-api.ts new file mode 100644 index 000000000..44cf59bcd --- /dev/null +++ b/packages/core/src/telemetry-api.ts @@ -0,0 +1,191 @@ +/** + * Thin call-site facade over the OpenTelemetry API. Used by agent / publisher / + * chain / sync code to open spans and read the active trace context WITHOUT + * importing the SDK or having a tracer threaded through constructors. + * + * It talks only to the ambient `@opentelemetry/api`, which resolves whatever + * provider is globally registered. 
When telemetry is disabled, NO provider is + * registered and the API returns its built-in no-op tracer: `withSpan` runs the + * body inside a non-recording span (zero cost, no export, no context switch). + * So instrumentation stays compiled-in but inert — there is never an + * `if (enabled)` guard at a call site. + * + * The SDK that makes these spans real is registered once at daemon boot by + * `initTelemetry()` in @origintrail-official/dkg-node-ui. + */ + +import { + trace, + context, + metrics, + SpanStatusCode, + type Span, + type Tracer, + type Attributes, + type SpanContext, + type Counter, + type Histogram, +} from '@opentelemetry/api'; + +const TRACER_NAME = '@origintrail-official/dkg'; +const METER_NAME = '@origintrail-official/dkg'; +const INVALID_TRACE_ID = '00000000000000000000000000000000'; + +export function getTracer(version?: string): Tracer { + return trace.getTracer(TRACER_NAME, version); +} + +export interface WithSpanOpts { + /** Initial span attributes (low-cardinality keys only; never secrets). */ + attributes?: Attributes; + /** Span links — e.g. the parent context of detached/queued work. */ + links?: SpanContext[]; +} + +/** + * Run `fn` inside an active span. Records the exception and sets ERROR status + * if `fn` throws, and always ends the span. Returns whatever `fn` returns. + * Because the SDK registers an AsyncLocalStorage context manager at boot, the + * span is active across awaits inside `fn`, so nested `withSpan` calls and the + * Logger pick up its context automatically. + */ +export async function withSpan( + name: string, + fn: (span: Span) => Promise | T, + opts?: WithSpanOpts, +): Promise { + return getTracer().startActiveSpan( + name, + { attributes: opts?.attributes, links: opts?.links?.map((c) => ({ context: c })) }, + async (span: Span): Promise => { + try { + return await fn(span); + } catch (err) { + span.recordException(err as Error); + span.setStatus({ + code: SpanStatusCode.ERROR, + message: err instanceof Error ? 
err.message : String(err), + }); + throw err; + } finally { + span.end(); + } + }, + ); +} + +/** + * Start a span linked to a (synchronously-captured) parent context. Use for + * detached/queued work whose parent span is no longer the active context + * (e.g. a Promise.race that settles after the caller has suspended). When `parent` is undefined the span is started without links. + */ +export async function linkedSpan( + name: string, + parent: SpanContext | undefined, + fn: (span: Span) => Promise | T, + attributes?: Attributes, +): Promise { + return withSpan(name, fn, { attributes, links: parent ? [parent] : undefined }); +} + +/** + * The active span's trace/span id (hex) for log correlation. Returns empty + * fields when no span is active, or when its trace id is missing or all-zero (invalid), + * so logging behaviour is unchanged in that case. NOTE(review): `isRecording()` is not checked here — confirm whether non-recording spans with a valid trace id should correlate. + */ +export function currentTraceIds(): { traceId?: string; spanId?: string } { + const span = trace.getSpan(context.active()); + if (!span) return {}; + const sc = span.spanContext(); + if (!sc || !sc.traceId || sc.traceId === INVALID_TRACE_ID) return {}; + return { traceId: sc.traceId, spanId: sc.spanId }; +} + +/** Synchronously capture the active span context (to link detached work later); undefined when no span is active. */ +export function activeSpanContext(): SpanContext | undefined { + return trace.getSpan(context.active())?.spanContext(); +} + +// ───────────────────────── metrics ───────────────────────── +// +// Low-cardinality invocation metrics. Instruments are created lazily from the +// global meter, so they bind to the real MeterProvider once the SDK registers +// it at boot (and to the API no-op meter — inert — when telemetry is off). +// Attributes are supplied at record time; keep them to the bounded keys +// documented per metric (never peer_id / cg_id / tx_hash / op_id as labels). + +/** Duration buckets (ms) for operation-level histograms. 
*/ +const OP_DURATION_BUCKETS = [50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000, 120000]; +/** Duration buckets (ms) for chain RPC histograms (faster floor). */ +const RPC_DURATION_BUCKETS = [25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000, 120000]; + +export interface DkgMetrics { + /** outcome={tentative|confirmed|failed}, source={direct|swm}, chain_id */ + publishTotal: Counter; + /** ms; outcome, source, chain_id */ + publishDuration: Histogram; + /** outcome={reached|timeout|impossible}, chain_id */ + ackQuorumTotal: Counter; + /** result={ack|decline|transport_error}, decline_code? (fixed enum) */ + ackPeerTotal: Counter; + /** result={valid|key-not-registered|not-in-sharding-table|rpc-error} */ + ackVerifyTotal: Counter; + /** outcome={ack|decline|reset}, decline_code?, chain_id */ + ackHandlerTotal: Counter; + /** rpc_method, outcome={ok|error|timeout}, retryable, chain_id */ + chainRpcTotal: Counter; + /** ms; rpc_method, chain_id */ + chainRpcDuration: Histogram; + /** rpc_method, chain_id, reason={exhausted|recovered} */ + chainRpcFailoverTotal: Counter; + /** outcome={ok|error}, protocol_id */ + syncRequestTotal: Counter; + /** outcome={ok|busy|limit|error} */ + syncResponseTotal: Counter; + /** outcome={ok|error}, protocol_id — outbound P2P protocol send */ + protocolSendTotal: Counter; + /** ms; protocol_id — P2P protocol send duration */ + protocolSendDuration: Histogram; +} + +function buildMetrics(): DkgMetrics { + const meter = metrics.getMeter(METER_NAME); + return { + publishTotal: meter.createCounter('dkg.publish.total', { description: 'Publish operations by outcome/source' }), + publishDuration: meter.createHistogram('dkg.publish.duration', { + unit: 'ms', description: 'Publish wall-time', advice: { explicitBucketBoundaries: OP_DURATION_BUCKETS }, + }), + ackQuorumTotal: meter.createCounter('dkg.ack.quorum.total', { description: 'ACK quorum outcomes' }), + ackPeerTotal: meter.createCounter('dkg.ack.peer.total', { description: 
'Per-peer ACK request results' }), + ackVerifyTotal: meter.createCounter('dkg.ack.verify.total', { description: 'ACK identity verification results' }), + ackHandlerTotal: meter.createCounter('dkg.ack.handler.total', { description: 'Inbound storage-ACK handler outcomes' }), + chainRpcTotal: meter.createCounter('dkg.chain.rpc.total', { description: 'Chain RPC calls by method/outcome' }), + chainRpcDuration: meter.createHistogram('dkg.chain.rpc.duration', { + unit: 'ms', description: 'Chain RPC wall-time', advice: { explicitBucketBoundaries: RPC_DURATION_BUCKETS }, + }), + chainRpcFailoverTotal: meter.createCounter('dkg.chain.rpc.failover.total', { description: 'Chain RPC endpoint failover events' }), + syncRequestTotal: meter.createCounter('dkg.sync.request.total', { description: 'Outbound sync requests' }), + syncResponseTotal: meter.createCounter('dkg.sync.response.total', { description: 'Inbound sync responses' }), + protocolSendTotal: meter.createCounter('dkg.protocol_router.send.total', { description: 'Outbound P2P protocol sends' }), + protocolSendDuration: meter.createHistogram('dkg.protocol_router.send.duration', { + unit: 'ms', description: 'P2P protocol send wall-time', advice: { explicitBucketBoundaries: RPC_DURATION_BUCKETS }, + }), + }; +} + +let metricsCache: DkgMetrics | undefined; + +/** Lazily-bound metric instruments (call this at every record site). */ +export function getMetrics(): DkgMetrics { + if (!metricsCache) metricsCache = buildMetrics(); + return metricsCache; +} + +/** + * Rebuild instruments against the currently-registered global meter. Called by + * `initTelemetry()` right after `metrics.setGlobalMeterProvider()` so the cache + * (which may have been created against the no-op meter) binds to the real one. 
+ */ +export function rebuildMetrics(): void { + metricsCache = buildMetrics(); +} diff --git a/packages/core/test/log-redaction.test.ts b/packages/core/test/log-redaction.test.ts new file mode 100644 index 000000000..6419b10e2 --- /dev/null +++ b/packages/core/test/log-redaction.test.ts @@ -0,0 +1,88 @@ +import { describe, it, expect } from 'vitest'; +import { createLogRedactor, redactLogEntry, REDACTED } from '../src/log-redaction.js'; +import type { LogRecord } from '../src/logger.js'; + +function rec(message: string): LogRecord { + return { level: 'info', operationName: 'publish', operationId: 'op-1', module: 'test', message }; +} + +describe('log redaction — secrets are scrubbed before logs leave the node', () => { + const redact = createLogRedactor(); + + it('redacts a wallet private key given by key name (any value shape)', () => { + const out = redact(rec('loaded operationalWalletPrivateKey=0xabc123def4567890abc123def4567890abc123def4567890abc123def4567890')); + expect(out.message).toContain('operationalWalletPrivateKey='); + expect(out.message).toContain(REDACTED); + expect(out.message).not.toMatch(/0xabc123def4567890/); + }); + + it('redacts privateKey in JSON-ish "key": "value" form', () => { + const out = redact(rec('signing with {"privateKey":"0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef"}')); + expect(out.message).toContain('"privateKey":'); + expect(out.message).toContain(REDACTED); + expect(out.message).not.toContain('deadbeef'); + }); + + it('redacts a quoted mnemonic INCLUDING its spaces', () => { + const out = redact(rec('restored mnemonic="legal winner thank year wave sausage worth useful legal winner thank yellow"')); + expect(out.message).toContain('mnemonic='); + expect(out.message).toContain(REDACTED); + expect(out.message).not.toMatch(/legal winner|sausage|yellow/); + }); + + it('redacts bearer tokens and api keys', () => { + const out = redact(rec('headers authorization=Bearer-abc.def token: sk_live_9f8e7d6c 
apiKey=AKIAEXAMPLE123')); + expect(out.message).not.toContain('sk_live_9f8e7d6c'); + expect(out.message).not.toContain('AKIAEXAMPLE123'); + expect((out.message.match(/\[REDACTED\]/g) ?? []).length).toBeGreaterThanOrEqual(2); + }); + + it('redacts the credential in a header-style "authorization: Bearer " (not just the scheme word)', () => { + const out = redact(rec('GET /x authorization: Bearer sk_live_9f8e7d6c5b4a received')); + expect(out.message).toContain('authorization:'); + expect(out.message).toContain(REDACTED); + expect(out.message).not.toContain('sk_live_9f8e7d6c5b4a'); + expect(out.message).not.toContain('Bearer sk_live'); // scheme+token redacted as one + }); + + it('redacts a JWT by shape even with no key name', () => { + const jwt = 'eyJhbGciOiJIUzI1Ni1.eyJzdWIiOiIxMjM0NTY3.SflKxwRJSMeKKF2QT4'; + const out = redact(rec(`auth header ${jwt} received`)); + expect(out.message).toContain(REDACTED); + expect(out.message).not.toContain('eyJhbGci'); + }); + + it('does NOT redact public 0x hashes / Merkle roots (no false positives)', () => { + const root = '0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef'; + const input = rec(`published KC root ${root} tx ${root} merkleRoot ${root}`); + const out = redact(input); + expect(out).toBe(input); // unchanged → same object, no redaction + expect(out.message).toContain(root); + }); + + it('leaves ordinary messages untouched (returns same object)', () => { + const original = rec('peer connected: 12D3KooWabc, 3 direct / 6 relayed'); + const out = redact(original); + expect(out).toBe(original); // no allocation when nothing matched + }); + + it('honors operator-configured extra sensitive keys', () => { + const withExtra = createLogRedactor(['walletPassword', 'customSecretField']); + const out = withExtra(rec('config customSecretField=hunter2 walletPassword="p@ss w0rd"')); + expect(out.message).not.toContain('hunter2'); + expect(out.message).not.toContain('p@ss w0rd'); + }); + + it('redactLogEntry 
one-shot matches the compiled redactor', () => { + const msg = 'token=secret-value-123'; + expect(redactLogEntry(rec(msg)).message).toBe(redact(rec(msg)).message); + }); + + it('never touches non-message fields', () => { + const out = redact(rec('privateKey=0xabc')); + expect(out.level).toBe('info'); + expect(out.operationId).toBe('op-1'); + expect(out.operationName).toBe('publish'); + expect(out.module).toBe('test'); + }); +}); diff --git a/packages/node-ui/package.json b/packages/node-ui/package.json index 22320f22b..1bdcf6a0f 100644 --- a/packages/node-ui/package.json +++ b/packages/node-ui/package.json @@ -19,6 +19,14 @@ "clean": "node -e \"const fs=require('fs'); ['dist','tsconfig.tsbuildinfo'].forEach((p)=>fs.rmSync(p,{recursive:true,force:true}))\"" }, "dependencies": { + "@opentelemetry/api": "^1.9.1", + "@opentelemetry/core": "^2.8.0", + "@opentelemetry/exporter-metrics-otlp-proto": "^0.219.0", + "@opentelemetry/exporter-trace-otlp-proto": "^0.219.0", + "@opentelemetry/resources": "^2.8.0", + "@opentelemetry/sdk-metrics": "^2.8.0", + "@opentelemetry/sdk-trace-node": "^2.8.0", + "@opentelemetry/semantic-conventions": "^1.41.1", "@openuidev/react-lang": "^0.2.3", "@origintrail-official/dkg-core": "workspace:*", "@origintrail-official/dkg-graph-viz": "workspace:*", diff --git a/packages/node-ui/src/index.ts b/packages/node-ui/src/index.ts index 779776616..36f4764c0 100644 --- a/packages/node-ui/src/index.ts +++ b/packages/node-ui/src/index.ts @@ -47,6 +47,8 @@ export type { export type { LlmSettingsCallbacks, TelemetrySettingsCallbacks } from './api.js'; export { LogPushWorker } from './gelf-push-worker.js'; export type { LogPushWorkerOptions } from './gelf-push-worker.js'; +export { OtlpLogWorker } from './otlp-log-worker.js'; +export type { OtlpLogWorkerOptions } from './otlp-log-worker.js'; export { ChatMemoryManager } from './chat-memory.js'; export type { MemoryToolContext, @@ -60,5 +62,5 @@ export type { export { LlmClient, LlmRequestError } from 
'./llm/client.js'; export { resolveCapabilities } from './llm/capability-resolver.js'; export type { LlmConfig, LlmChatRequest, LlmChatMessage, LlmStreamEvent, LlmCompletionResult, LlmCapabilities } from './llm/types.js'; -export { initTelemetry, recordGauge, setOperationSpan, isTelemetryConfigured } from './telemetry.js'; -export type { TelemetryConfig } from './telemetry.js'; +export { initTelemetry, shutdownTelemetry, isTelemetryConfigured } from './telemetry.js'; +export type { TelemetryInitConfig, TelemetryResource, OtlpSignalConfig } from './telemetry.js'; diff --git a/packages/node-ui/src/otlp-log-worker.ts b/packages/node-ui/src/otlp-log-worker.ts new file mode 100644 index 000000000..6360aeb7c --- /dev/null +++ b/packages/node-ui/src/otlp-log-worker.ts @@ -0,0 +1,302 @@ +/** + * OTLP/HTTP log exporter — ships structured log records to an OpenTelemetry + * collector (or any OTLP/HTTP logs endpoint) as a second, opt-in sink + * alongside the always-on local SQLite store. + * + * Why hand-rolled rather than @opentelemetry/sdk-logs: as of 2026 the OTel JS + * Logs SDK is still "Development" (Traces/Metrics are Stable). We emit the + * stable OTLP/HTTP JSON wire format (ExportLogsServiceRequest) directly and + * reuse the buffer/flush/backoff shape proven by the syslog LogPushWorker. + * + * Reliability contract (this must NEVER slow down or crash the node): + * - push() only appends to a bounded in-memory buffer (drop-oldest on + * overflow) and returns immediately; it never awaits the network. + * - flush() runs on a timer, is guarded against overlap, and swallows every + * error. On a retryable failure the batch is requeued and an exponential + * backoff (with jitter, honoring Retry-After) is applied. On a + * non-retryable failure the batch is dropped (logged once). + * - The flush timer is unref'd so it never keeps the process alive. + * - Transport is outbound HTTPS push only — no inbound scrape endpoint. 
+ */ + +import type { LogRecord } from '@origintrail-official/dkg-core'; + +// OpenTelemetry severity numbers (logs data model). +const OTEL_SEVERITY: Record = { + debug: { num: 5, text: 'DEBUG' }, + info: { num: 9, text: 'INFO' }, + warn: { num: 13, text: 'WARN' }, + error: { num: 17, text: 'ERROR' }, +}; + +export type LogLevel = 'debug' | 'info' | 'warn' | 'error'; +const LEVEL_RANK: Record = { debug: 0, info: 1, warn: 2, error: 3 }; + +const RETRYABLE_STATUS = new Set([408, 429, 500, 502, 503, 504]); + +const DEFAULTS = { + flushIntervalMs: 2_000, + maxBuffer: 500, + maxBatch: 256, + requestTimeoutMs: 10_000, + baseBackoffMs: 1_000, + maxBackoffMs: 60_000, + serviceName: 'dkg-node', + minLevel: 'info' as LogLevel, +}; + +export interface OtlpLogWorkerOptions { + /** Full OTLP/HTTP logs URL, e.g. http://localhost:4318/v1/logs */ + endpoint: string; + /** Bearer credential for the collector (sent as Authorization: Bearer …). */ + token?: string; + /** Extra static headers to attach to every request. */ + headers?: Record; + /** Network identifier: 'testnet' | 'mainnet' | 'devnet'. */ + network: string; + /** Node's libp2p peer ID. */ + peerId: string; + nodeName?: string; + version?: string; + commit?: string; + /** 'core' | 'edge' */ + role?: string; + /** Chain id string, e.g. 'base:8453' → resource attr dkg.chain (matches traces/metrics). */ + chainId?: string; + /** OTel resource service.name. Default 'dkg-node'. */ + serviceName?: string; + /** + * Per-node identifier. Becomes the OTel `service.instance.id`, which Loki + * promotes to the index label `service_instance_id` — this is what a Grafana + * "pick a node" dashboard variable selects on. Defaults to nodeName, then + * peerId. Hosted nodes should set a unique `name` in config. + */ + serviceInstanceId?: string; + /** + * Deployment environment (e.g. 'testnet' | 'mainnet'). Becomes the OTel + * `deployment.environment`, which Loki promotes to the label + * `deployment_environment`. 
Defaults to `network`. + */ + deploymentEnvironment?: string; + /** Minimum level forwarded remotely. Default 'info' (debug stays local). */ + minLevel?: LogLevel; + /** Bounded in-memory buffer; drop-oldest on overflow. Default 500. */ + bufferMaxEntries?: number; + flushIntervalMs?: number; + requestTimeoutMs?: number; + /** Initial retry backoff in ms (doubles up to maxBackoffMs). Default 1000. */ + baseBackoffMs?: number; + /** Ceiling for retry backoff in ms. Default 60000. */ + maxBackoffMs?: number; + /** Optional diagnostic sink (e.g. the daemon log). Never throws. */ + onError?: (message: string) => void; +} + +interface OtlpAttribute { + key: string; + value: { stringValue: string }; +} + +function attr(key: string, value: string | undefined): OtlpAttribute | null { + if (value == null || value === '') return null; + return { key, value: { stringValue: String(value) } }; +} + +export class OtlpLogWorker { + private buffer: Array<{ r: LogRecord; tsMs: number }> = []; + private timer: ReturnType | null = null; + private stopped = false; + private flushing = false; + private nextAttemptAt = 0; + private backoffMs = 0; + private droppedNonRetryable = 0; + private loggedNonRetryable = false; + + private readonly endpoint: string; + private readonly minRank: number; + private readonly maxBuffer: number; + private readonly flushIntervalMs: number; + private readonly requestTimeoutMs: number; + private readonly baseBackoffMs: number; + private readonly maxBackoffMs: number; + private readonly authHeaders: Record; + private readonly resourceAttrs: OtlpAttribute[]; + private readonly scopeVersion: string | undefined; + private readonly onError: (message: string) => void; + + constructor(opts: OtlpLogWorkerOptions) { + this.endpoint = opts.endpoint; + this.minRank = LEVEL_RANK[opts.minLevel ?? DEFAULTS.minLevel] ?? LEVEL_RANK.info; + this.maxBuffer = Math.max(1, opts.bufferMaxEntries ?? DEFAULTS.maxBuffer); + this.flushIntervalMs = opts.flushIntervalMs ?? 
DEFAULTS.flushIntervalMs; + this.requestTimeoutMs = opts.requestTimeoutMs ?? DEFAULTS.requestTimeoutMs; + this.baseBackoffMs = opts.baseBackoffMs ?? DEFAULTS.baseBackoffMs; + this.maxBackoffMs = opts.maxBackoffMs ?? DEFAULTS.maxBackoffMs; + this.scopeVersion = opts.version || undefined; + this.onError = opts.onError ?? (() => {}); + + this.authHeaders = { 'content-type': 'application/json', ...(opts.headers ?? {}) }; + if (opts.token) this.authHeaders['authorization'] = `Bearer ${opts.token}`; + + this.resourceAttrs = [ + attr('service.name', opts.serviceName ?? DEFAULTS.serviceName), + attr('service.version', opts.version), + // Promoted to Loki index labels by default → drive Grafana dashboard + // variables (node selector + environment) off these. + attr('service.instance.id', opts.serviceInstanceId ?? opts.nodeName ?? opts.peerId), + attr('deployment.environment', opts.deploymentEnvironment ?? opts.network), + // Kept as structured metadata for richer filtering / correlation. + attr('dkg.network', opts.network), + attr('dkg.peer_id', opts.peerId), + attr('dkg.node.name', opts.nodeName), + attr('dkg.node.role', opts.role), + attr('dkg.chain', opts.chainId), + attr('dkg.commit', opts.commit), + ].filter((a): a is OtlpAttribute => a !== null); + } + + /** Append a record. Filters below minLevel; never awaits the network. */ + push(record: LogRecord): void { + if ((LEVEL_RANK[record.level as LogLevel] ?? LEVEL_RANK.info) < this.minRank) return; + if (this.buffer.length >= this.maxBuffer) this.buffer.shift(); + this.buffer.push({ r: record, tsMs: Date.now() }); + } + + start(): void { + if (this.timer) return; + this.stopped = false; + this.timer = setInterval(() => { + void this.flush(); + }, this.flushIntervalMs); + this.timer.unref(); + } + + stop(): void { + this.stopped = true; + if (this.timer) { + clearInterval(this.timer); + this.timer = null; + } + // Best-effort final flush; failures are swallowed. + void this.flush(true); + } + + /** Test/diagnostic hook. 
*/ + pending(): number { + return this.buffer.length; + } + + private async flush(force = false): Promise { + if (this.flushing) return; + if (this.stopped && !force) return; + if (this.buffer.length === 0) return; + if (!force && Date.now() < this.nextAttemptAt) return; + + this.flushing = true; + const batch = this.buffer.splice(0, DEFAULTS.maxBatch); + try { + const body = this.buildPayload(batch); + const res = await this.post(body); + if (res.ok) { + this.backoffMs = 0; + this.nextAttemptAt = 0; + } else if (RETRYABLE_STATUS.has(res.status)) { + this.requeue(batch); + this.scheduleBackoff(res.retryAfterMs); + } else { + // Non-retryable (e.g. 400/401/403): dropping is correct — retrying + // a malformed/unauthorized request forever would just grow the buffer. + this.droppedNonRetryable += batch.length; + if (!this.loggedNonRetryable) { + this.loggedNonRetryable = true; + this.onError( + `dropping logs — collector returned ${res.status} (non-retryable); check endpoint/token. ` + + `${this.droppedNonRetryable} record(s) dropped so far.`, + ); + } + } + } catch (err) { + // Network/timeout error → treat as retryable. + this.requeue(batch); + this.scheduleBackoff(); + void err; + } finally { + this.flushing = false; + } + } + + private requeue(batch: Array<{ r: LogRecord; tsMs: number }>): void { + this.buffer.unshift(...batch); + while (this.buffer.length > this.maxBuffer) this.buffer.shift(); + } + + private scheduleBackoff(retryAfterMs?: number): void { + const next = this.backoffMs === 0 ? this.baseBackoffMs : Math.min(this.backoffMs * 2, this.maxBackoffMs); + this.backoffMs = next; + const jitter = next * 0.2 * Math.random(); + const delay = retryAfterMs != null ? 
Math.max(retryAfterMs, next) : next + jitter; + this.nextAttemptAt = Date.now() + delay; + } + + private async post(body: string): Promise<{ ok: boolean; status: number; retryAfterMs?: number }> { + const ac = new AbortController(); + const timer = setTimeout(() => ac.abort(), this.requestTimeoutMs); + try { + const res = await fetch(this.endpoint, { + method: 'POST', + headers: this.authHeaders, + body, + signal: ac.signal, + }); + // Drain the body so the socket can be reused / freed. + await res.arrayBuffer().catch(() => {}); + let retryAfterMs: number | undefined; + const ra = res.headers.get('retry-after'); + if (ra) { + const secs = Number(ra); + if (!Number.isNaN(secs)) retryAfterMs = secs * 1000; + } + return { ok: res.ok, status: res.status, retryAfterMs }; + } finally { + clearTimeout(timer); + } + } + + private buildPayload(batch: Array<{ r: LogRecord; tsMs: number }>): string { + const logRecords = batch.map(({ r, tsMs }) => { + const sev = OTEL_SEVERITY[r.level] ?? OTEL_SEVERITY.info; + const nano = String(tsMs * 1_000_000); + return { + timeUnixNano: nano, + observedTimeUnixNano: nano, + severityNumber: sev.num, + severityText: sev.text, + body: { stringValue: r.message }, + // W3C trace correlation — OTLP top-level fields (hex), not attributes. + // Present only when the log was emitted inside a recording span. + ...(r.traceId ? { traceId: r.traceId } : {}), + ...(r.spanId ? 
{ spanId: r.spanId } : {}), + attributes: [ + attr('dkg.operation_id', r.operationId), + attr('dkg.operation_name', r.operationName), + attr('dkg.source_operation_id', r.sourceOperationId), + attr('dkg.module', r.module), + ].filter((a): a is OtlpAttribute => a !== null), + }; + }); + + return JSON.stringify({ + resourceLogs: [ + { + resource: { attributes: this.resourceAttrs }, + scopeLogs: [ + { + scope: { name: 'dkg-node', version: this.scopeVersion }, + logRecords, + }, + ], + }, + ], + }); + } +} diff --git a/packages/node-ui/src/telemetry.ts b/packages/node-ui/src/telemetry.ts index 9e4ffbf41..552a35463 100644 --- a/packages/node-ui/src/telemetry.ts +++ b/packages/node-ui/src/telemetry.ts @@ -1,54 +1,178 @@ /** - * Optional OpenTelemetry integration for the Node UI. - * When enabled (via config), exports metrics and optionally traces to an OTLP endpoint. - * operationId is set as the trace/span attribute for correlation with the dashboard. + * OpenTelemetry SDK bootstrap for a DKG node (boot side, daemon-only consumer). * - * To enable: add @opentelemetry/api, @opentelemetry/sdk-metrics, and - * @opentelemetry/exporter-metrics-otlp-http; then implement registerMeter() and - * use the same metric names below. For traces, use @opentelemetry/sdk-trace-base - * and set span attributes { 'dkg.operation_id': operationId }. + * Registers the global Tracer + Meter providers ONCE at daemon startup so that + * the call-site facade in `@origintrail-official/dkg-core` (getTracer/withSpan/ + * getMetrics) — used across agent/publisher/chain/sync — produces real spans and + * metrics. When telemetry is disabled, or a signal has no endpoint, this + * registers NOTHING: the core facade then talks to the API's built-in no-op + * providers (zero cost, no outbound calls). + * + * Logs are NOT handled here — they stay on the hand-rolled `OtlpLogWorker` + * (bounded buffer + retry + at-source redaction); the OTel Logs SDK is still + * "Development". 
This module only wires traces + metrics, and shares ONE + * Resource with the log worker so all three signals describe the same node. + * + * This file is server-only (it pulls the Node OTel SDK). It must never be + * imported into the browser UI bundle. */ -export interface TelemetryConfig { +import { metrics as otelMetrics, trace as otelTrace } from '@opentelemetry/api'; +import { resourceFromAttributes } from '@opentelemetry/resources'; +import { + NodeTracerProvider, + BatchSpanProcessor, + ParentBasedSampler, + TraceIdRatioBasedSampler, +} from '@opentelemetry/sdk-trace-node'; +import { MeterProvider, PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics'; +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-proto'; +import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-proto'; +import { rebuildMetrics } from '@origintrail-official/dkg-core'; + +/** Stable resource identity, shared by logs + traces + metrics. */ +export interface TelemetryResource { + serviceName?: string; // default 'dkg-node' + serviceVersion?: string; + /** Per-node id → service.instance.id (the Grafana node selector). */ + serviceInstanceId?: string; + /** testnet | mainnet | devnet → deployment.environment + dkg.network */ + network?: string; + peerId?: string; + nodeName?: string; + nodeRole?: string; + commit?: string; + /** e.g. 'base:8453' → dkg.chain */ + chainId?: string; +} + +export interface OtlpSignalConfig { + endpoint?: string; // full signal URL, e.g. http://localhost:4318/v1/traces + /** Bearer token → Authorization header. */ + token?: string; + headers?: Record; +} + +export interface TelemetryInitConfig { + /** Master gate. When false, nothing is registered. */ enabled?: boolean; - /** OTLP HTTP endpoint for metrics (e.g. 
http://localhost:4318/v1/metrics) */ - metricsEndpoint?: string; - /** Service name for resource attributes */ - serviceName?: string; + resource?: TelemetryResource; + traces?: OtlpSignalConfig & { sampleRatio?: number }; + metrics?: OtlpSignalConfig & { exportIntervalMs?: number }; } +let tracerProvider: NodeTracerProvider | null = null; +let meterProvider: MeterProvider | null = null; let configured = false; -/** - * Initialize telemetry. No-op if disabled or OTel packages not installed. - * Call once at daemon startup. - */ -export function initTelemetry(_config: TelemetryConfig): void { - if (!_config.enabled || !_config.metricsEndpoint) return; - configured = true; - // When OTel is added: create MeterProvider, OTLP exporter, register gauges - // meter.createObservableGauge('dkg.network.peers', ...), etc. +function buildResource(r: TelemetryResource = {}) { + const attrs: Record = { + 'service.name': r.serviceName ?? 'dkg-node', + }; + if (r.serviceVersion) attrs['service.version'] = r.serviceVersion; + if (r.serviceInstanceId) attrs['service.instance.id'] = r.serviceInstanceId; + if (r.network) { + attrs['deployment.environment'] = r.network; + attrs['dkg.network'] = r.network; + } + if (r.peerId) attrs['dkg.peer_id'] = r.peerId; + if (r.nodeName) attrs['dkg.node.name'] = r.nodeName; + if (r.nodeRole) attrs['dkg.node.role'] = r.nodeRole; + if (r.commit) attrs['dkg.commit'] = r.commit; + if (r.chainId) attrs['dkg.chain'] = r.chainId; + return resourceFromAttributes(attrs); } -/** - * Record a gauge value for export. No-op when telemetry is disabled. - * Metric names match dashboard: dkg.network.peers, dkg.knowledge.triples, dkg.system.cpu_percent, etc. - */ -export function recordGauge(_name: string, _value: number): void { - if (!configured) return; - // When OTel is added: update the observable gauge callback or record value +function authHeaders(sig: OtlpSignalConfig): Record | undefined { + const headers = { ...(sig.headers ?? 
{}) }; + if (sig.token) headers['Authorization'] = `Bearer ${sig.token}`; + return Object.keys(headers).length ? headers : undefined; } /** - * Start a span for an operation (for trace correlation). - * operationId should be set as span attribute so traces match the Operations panel. - * No-op when telemetry is disabled. + * Initialize traces + metrics. No-op when disabled or when a signal has no + * endpoint. Safe to call once at daemon boot. Idempotent (subsequent calls are + * ignored once configured). */ -export function setOperationSpan(_operationId: string, _operationName: string): void { - if (!configured) return; - // When OTel is added: tracer.startSpan(operationName, { attributes: { 'dkg.operation_id': operationId } }) +export function initTelemetry(cfg: TelemetryInitConfig): void { + if (configured) return; + if (!cfg.enabled) return; + + const resource = buildResource(cfg.resource); + + // ── Traces ── + if (cfg.traces?.endpoint) { + const exporter = new OTLPTraceExporter({ + url: cfg.traces.endpoint, + headers: authHeaders(cfg.traces), + }); + const ratio = cfg.traces.sampleRatio ?? 1; + tracerProvider = new NodeTracerProvider({ + resource, + sampler: new ParentBasedSampler({ root: new TraceIdRatioBasedSampler(ratio) }), + spanProcessors: [new BatchSpanProcessor(exporter)], + }); + // register() installs the global tracer provider + an AsyncLocalStorage + // context manager (so spans flow across awaits) + W3C trace-context propagator. + tracerProvider.register(); + configured = true; + } + + // ── Metrics ── + if (cfg.metrics?.endpoint) { + const exporter = new OTLPMetricExporter({ + url: cfg.metrics.endpoint, + headers: authHeaders(cfg.metrics), + }); + meterProvider = new MeterProvider({ + resource, + readers: [ + new PeriodicExportingMetricReader({ + exporter, + exportIntervalMillis: cfg.metrics.exportIntervalMs ?? 
30_000, + }), + ], + }); + otelMetrics.setGlobalMeterProvider(meterProvider); + // Re-bind the core facade's instrument cache to the real meter. + rebuildMetrics(); + configured = true; + } } export function isTelemetryConfigured(): boolean { return configured; } + +/** + * Flush + shut down providers. Used both at daemon teardown AND when telemetry + * is turned off via the runtime master gate, so it must FULLY reverse + * `initTelemetry`: stop the exporters, then clear the OTel API globals so a + * later `initTelemetry` (live re-enable) can register fresh providers — without + * the `disable()` calls, the API keeps the first (now shut-down) provider and a + * re-enable would silently no-op. Safe if never initialized; idempotent. + */ +export async function shutdownTelemetry(): Promise { + const tasks: Promise[] = []; + if (tracerProvider) { + tasks.push(tracerProvider.forceFlush().catch(() => {})); + tasks.push(tracerProvider.shutdown().catch(() => {})); + } + if (meterProvider) { + tasks.push(meterProvider.forceFlush().catch(() => {})); + tasks.push(meterProvider.shutdown().catch(() => {})); + } + await Promise.all(tasks); + // Reset the global API so a subsequent initTelemetry() can re-register + // (setGlobal*Provider only takes effect once until the slot is disabled). + if (tracerProvider) otelTrace.disable(); + if (meterProvider) { + otelMetrics.disable(); + // Rebind the core facade's instrument cache back to the no-op meter so + // getMetrics() after disable is inert rather than holding dead instruments. 
+ rebuildMetrics(); + } + tracerProvider = null; + meterProvider = null; + configured = false; +} diff --git a/packages/node-ui/test/otlp-log-worker.test.ts b/packages/node-ui/test/otlp-log-worker.test.ts new file mode 100644 index 000000000..073ec675e --- /dev/null +++ b/packages/node-ui/test/otlp-log-worker.test.ts @@ -0,0 +1,240 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { createServer, type Server } from 'node:http'; +import { AddressInfo } from 'node:net'; +import { OtlpLogWorker, type OtlpLogWorkerOptions } from '../src/otlp-log-worker.js'; +import type { LogRecord } from '@origintrail-official/dkg-core'; + +/** + * Real end-to-end exercise of the OTLP/HTTP log exporter against a live local + * HTTP server (no mocks) — proves the wire format, batching, level filtering, + * buffer bounds, and retry/backoff behaviour. + */ + +interface Captured { + body: any; + headers: Record; + /** Status the server RESPONDED with for this request — lets a test tell a + * failed (503) attempt apart from the successful (200) delivery. 
*/ + status: number; +} + +type Responder = (reqIndex: number) => { status: number; headers?: Record }; + +function startServer(responder: Responder): Promise<{ url: string; received: Captured[]; close: () => Promise; server: Server }> { + const received: Captured[] = []; + let reqIndex = 0; + const server = createServer((req, res) => { + const chunks: Buffer[] = []; + req.on('data', (c) => chunks.push(c)); + req.on('end', () => { + const raw = Buffer.concat(chunks).toString('utf8'); + let body: unknown = raw; + try { body = JSON.parse(raw); } catch { /* keep raw */ } + const { status, headers } = responder(reqIndex++); + received.push({ body, headers: req.headers, status }); + if (headers) for (const [k, v] of Object.entries(headers)) res.setHeader(k, v); + res.statusCode = status; + res.end('{}'); + }); + }); + return new Promise((resolve) => { + server.listen(0, '127.0.0.1', () => { + const port = (server.address() as AddressInfo).port; + resolve({ + url: `http://127.0.0.1:${port}/v1/logs`, + received, + server, + close: () => new Promise((r) => server.close(() => r())), + }); + }); + }); +} + +function rec(over: Partial = {}): LogRecord { + return { + level: 'info', + operationName: 'publish', + operationId: 'op-abc', + module: 'publisher', + message: 'published KC 42', + ...over, + }; +} + +function baseOpts(url: string, over: Partial = {}): OtlpLogWorkerOptions { + return { + endpoint: url, + network: 'devnet', + peerId: '12D3KooWtest', + nodeName: 'test-node', + version: '10.0.0', + commit: 'abc1234', + role: 'core', + chainId: 'base:8453', + flushIntervalMs: 20, + baseBackoffMs: 40, + maxBackoffMs: 200, + ...over, + }; +} + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); +async function waitFor(cond: () => boolean, timeoutMs = 3000): Promise { + const start = Date.now(); + while (!cond()) { + if (Date.now() - start > timeoutMs) throw new Error('waitFor timed out'); + await sleep(10); + } +} + +describe('OtlpLogWorker — OTLP/HTTP log 
export', () => { + const workers: OtlpLogWorker[] = []; + const servers: Array<() => Promise> = []; + afterEach(async () => { + for (const w of workers.splice(0)) w.stop(); + for (const close of servers.splice(0)) await close(); + }); + + it('ships a well-formed OTLP request with resource + record attributes and severity', async () => { + const srv = await startServer(() => ({ status: 200 })); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url)); + workers.push(w); + w.start(); + + w.push(rec({ level: 'error', message: 'boom', operationId: 'op-1', sourceOperationId: 'op-src' })); + await waitFor(() => srv.received.length >= 1); + + const { body, headers } = srv.received[0]; + expect(headers['content-type']).toContain('application/json'); + const resourceLogs = body.resourceLogs; + expect(Array.isArray(resourceLogs)).toBe(true); + + const resourceAttrs: any[] = resourceLogs[0].resource.attributes; + const attrMap = Object.fromEntries(resourceAttrs.map((a) => [a.key, a.value.stringValue])); + expect(attrMap['service.name']).toBe('dkg-node'); + expect(attrMap['dkg.network']).toBe('devnet'); + // Dotted OTel-style keys, matching the traces/metrics resource (telemetry.ts). + expect(attrMap['dkg.node.role']).toBe('core'); + expect(attrMap['dkg.node.name']).toBe('test-node'); + expect(attrMap['dkg.chain']).toBe('base:8453'); + expect(attrMap['dkg.peer_id']).toBe('12D3KooWtest'); + // Label-promoted identity for the Grafana node selector. 
+ expect(attrMap['service.instance.id']).toBe('test-node'); // defaults to nodeName + expect(attrMap['deployment.environment']).toBe('devnet'); // defaults to network + + const logRecord = resourceLogs[0].scopeLogs[0].logRecords[0]; + expect(logRecord.severityNumber).toBe(17); // error + expect(logRecord.severityText).toBe('ERROR'); + expect(logRecord.body.stringValue).toBe('boom'); + expect(typeof logRecord.timeUnixNano).toBe('string'); + const recAttrs = Object.fromEntries(logRecord.attributes.map((a: any) => [a.key, a.value.stringValue])); + expect(recAttrs['dkg.operation_id']).toBe('op-1'); + expect(recAttrs['dkg.source_operation_id']).toBe('op-src'); + expect(recAttrs['dkg.module']).toBe('publisher'); + }); + + it('batches multiple records into a single request', async () => { + const srv = await startServer(() => ({ status: 200 })); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url)); + workers.push(w); + w.start(); + for (let i = 0; i < 5; i++) w.push(rec({ message: `m${i}` })); + await waitFor(() => srv.received.length >= 1); + await sleep(60); // allow any stragglers + const totalRecords = srv.received.reduce( + (n, r) => n + r.body.resourceLogs[0].scopeLogs[0].logRecords.length, + 0, + ); + expect(totalRecords).toBe(5); + expect(srv.received.length).toBe(1); // one flush, one POST + }); + + it('filters records below minLevel (debug stays local)', async () => { + const srv = await startServer(() => ({ status: 200 })); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url, { minLevel: 'info' })); + workers.push(w); + w.start(); + w.push(rec({ level: 'debug', message: 'noisy' })); + w.push(rec({ level: 'info', message: 'kept' })); + await waitFor(() => srv.received.length >= 1); + await sleep(60); + const bodies = srv.received.flatMap((r) => + r.body.resourceLogs[0].scopeLogs[0].logRecords.map((lr: any) => lr.body.stringValue), + ); + expect(bodies).toContain('kept'); + expect(bodies).not.toContain('noisy'); + }); 
+ + it('sends Authorization bearer header when a token is configured', async () => { + const srv = await startServer(() => ({ status: 200 })); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url, { token: 'sekret-token' })); + workers.push(w); + w.start(); + w.push(rec()); + await waitFor(() => srv.received.length >= 1); + expect(srv.received[0].headers['authorization']).toBe('Bearer sekret-token'); + }); + + it('on overflow keeps the NEWEST entries and drops the OLDEST (bounded memory)', async () => { + const srv = await startServer(() => ({ status: 200 })); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url, { bufferMaxEntries: 3 })); + workers.push(w); + // Overflow the bound by 10→3 BEFORE starting, then flush and inspect which survived. + for (let i = 0; i < 10; i++) w.push(rec({ message: `m${i}` })); + expect(w.pending()).toBe(3); // capped + w.start(); + await waitFor(() => srv.received.length >= 1); + await sleep(60); + const delivered = srv.received + .flatMap((r) => r.body.resourceLogs[0].scopeLogs[0].logRecords.map((lr: any) => lr.body.stringValue)) + .sort(); + // drop-OLDEST ⇒ the survivors are the last three pushed (m7,m8,m9), NOT m0..m2. + expect(delivered).toEqual(['m7', 'm8', 'm9']); + }); + + it('retries on a retryable 503 then succeeds — the batch is preserved into the 200', async () => { + let calls = 0; + const srv = await startServer((i) => { + calls = i + 1; + return { status: i === 0 ? 503 : 200 }; + }); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url)); + workers.push(w); + w.start(); + w.push(rec({ message: 'must-arrive' })); + // First attempt 503, then backoff (~40ms), then a 200. 
+ await waitFor(() => srv.received.length >= 2, 4000); + const msgsOf = (r: Captured): string[] => + r.body.resourceLogs[0].scopeLogs[0].logRecords.map((lr: any) => lr.body.stringValue); + // The 503 attempt must NOT be counted as delivery evidence: assert the + // record arrived specifically on the SUCCESSFUL (200) request, proving the + // requeued batch survived the retry rather than being lost or replaced. + const okReq = srv.received.find((r) => r.status === 200); + const failReq = srv.received.find((r) => r.status === 503); + expect(failReq, 'expected a failed 503 attempt').toBeDefined(); + expect(okReq, 'expected a successful 200 retry').toBeDefined(); + expect(msgsOf(okReq!)).toContain('must-arrive'); + expect(calls).toBeGreaterThanOrEqual(2); + }); + + it('drops batch and reports once on a non-retryable 400 (no infinite retry)', async () => { + const errors: string[] = []; + const srv = await startServer(() => ({ status: 400 })); + servers.push(srv.close); + const w = new OtlpLogWorker(baseOpts(srv.url, { onError: (m) => errors.push(m) })); + workers.push(w); + w.start(); + w.push(rec({ message: 'bad' })); + await waitFor(() => srv.received.length >= 1); + await sleep(120); // give it time to (not) retry + expect(srv.received.length).toBe(1); // dropped, not retried + expect(w.pending()).toBe(0); + expect(errors.length).toBe(1); // reported exactly once + }); +}); diff --git a/packages/node-ui/test/telemetry.test.ts b/packages/node-ui/test/telemetry.test.ts new file mode 100644 index 000000000..18507d4df --- /dev/null +++ b/packages/node-ui/test/telemetry.test.ts @@ -0,0 +1,220 @@ +import { describe, it, expect, afterEach } from 'vitest'; +import { createServer, type Server } from 'node:http'; +import type { AddressInfo } from 'node:net'; +import { trace, metrics, SpanStatusCode } from '@opentelemetry/api'; +import { NodeTracerProvider, InMemorySpanExporter, SimpleSpanProcessor } from '@opentelemetry/sdk-trace-node'; +import { + MeterProvider, + 
PeriodicExportingMetricReader, + InMemoryMetricExporter, + AggregationTemporality, +} from '@opentelemetry/sdk-metrics'; +import { withSpan, getMetrics, rebuildMetrics, Logger, type OperationContext } from '@origintrail-official/dkg-core'; +import { initTelemetry, isTelemetryConfigured, shutdownTelemetry } from '../src/telemetry.js'; + +/** + * Acceptance-criteria tests for the OTel observability foundation: + * disabled ⇒ no-op, span error status, metric label cardinality, and + * log↔trace correlation. Uses in-memory OTel exporters (no network). + */ + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); +async function waitFor(cond: () => boolean, timeoutMs = 5000): Promise { + const start = Date.now(); + while (!cond()) { + if (Date.now() - start > timeoutMs) throw new Error('waitFor timed out'); + await sleep(20); + } +} + +function registerInMemoryTracer(): InMemorySpanExporter { + const exporter = new InMemorySpanExporter(); + const provider = new NodeTracerProvider({ spanProcessors: [new SimpleSpanProcessor(exporter)] }); + provider.register(); + return exporter; +} + +afterEach(async () => { + await shutdownTelemetry(); + trace.disable(); + metrics.disable(); + rebuildMetrics(); // rebind the facade metric cache to the (now no-op) global meter + Logger.setSink(null); +}); + +describe('telemetry — disabled is a total no-op', () => { + it('withSpan returns the body value with no provider registered', async () => { + trace.disable(); + expect(await withSpan('agent.publish', () => 42)).toBe(42); + }); + + it('initTelemetry({enabled:false}) registers nothing', () => { + initTelemetry({ enabled: false }); + expect(isTelemetryConfigured()).toBe(false); + }); + + it('initTelemetry with no endpoints registers nothing even when enabled', () => { + initTelemetry({ enabled: true, resource: { serviceInstanceId: 'n1' } }); + expect(isTelemetryConfigured()).toBe(false); + }); + + it('metric instruments are usable (no throw) when telemetry is off', () => 
{ + expect(() => getMetrics().publishTotal.add(1, { outcome: 'failed', source: 'direct' })).not.toThrow(); + expect(() => getMetrics().chainRpcDuration.record(12, { rpc_method: 'eth_call' })).not.toThrow(); + }); +}); + +describe('withSpan — status + attributes + error recording', () => { + it('success: returns value, records attributes, status not ERROR', async () => { + const exporter = registerInMemoryTracer(); + const out = await withSpan( + 'agent.publish', + (span) => { + span.setAttribute('dkg.publish_status', 'tentative'); + return 'ok'; + }, + { attributes: { 'dkg.context_graph_id': 'cg-1' } }, + ); + expect(out).toBe('ok'); + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].name).toBe('agent.publish'); + expect(spans[0].attributes['dkg.context_graph_id']).toBe('cg-1'); + expect(spans[0].attributes['dkg.publish_status']).toBe('tentative'); + expect(spans[0].status.code).not.toBe(SpanStatusCode.ERROR); + }); + + it('throw: sets ERROR status, records the exception, and rethrows', async () => { + const exporter = registerInMemoryTracer(); + await expect( + withSpan('publisher.ack_collect', () => { + throw new Error('quorum unmet'); + }), + ).rejects.toThrow('quorum unmet'); + const spans = exporter.getFinishedSpans(); + expect(spans).toHaveLength(1); + expect(spans[0].status.code).toBe(SpanStatusCode.ERROR); + expect(spans[0].events.some((e) => e.name === 'exception')).toBe(true); + }); +}); + +describe('initTelemetry — real OTLP exporter bootstrap exports traces + metrics', () => { + // Proves the actual boot path (OTLP/proto exporters, BatchSpanProcessor, + // tracerProvider.register(), setGlobalMeterProvider + rebuildMetrics) really + // ships data — the prior suite only covered the disabled / no-endpoint cases + // and manually-registered in-memory providers, so a regression that dropped + // register()/rebuildMetrics() or the span processor went unnoticed. 
+ it('a withSpan + getMetrics emission reaches a local OTLP collector', async () => { + const hits: Record = { '/v1/traces': 0, '/v1/metrics': 0 }; + const server: Server = createServer((req, res) => { + const chunks: Buffer[] = []; + req.on('data', (c) => chunks.push(c)); + req.on('end', () => { + const path = req.url ?? ''; + if (path in hits && Buffer.concat(chunks).length > 0) hits[path] += 1; + res.statusCode = 200; + res.end(); + }); + }); + await new Promise((r) => server.listen(0, '127.0.0.1', () => r())); + const port = (server.address() as AddressInfo).port; + const base = `http://127.0.0.1:${port}`; + try { + initTelemetry({ + enabled: true, + resource: { serviceInstanceId: 'n1', network: 'devnet', chainId: 'base:8453' }, + traces: { endpoint: `${base}/v1/traces` }, + metrics: { endpoint: `${base}/v1/metrics`, exportIntervalMs: 60_000 }, + }); + expect(isTelemetryConfigured()).toBe(true); + + // Emit through the SAME facade the production call sites use. + await withSpan('chain.tx_send', () => 'done', { + attributes: { 'rpc.method': 'eth_sendRawTransaction', 'dkg.chain_id': 'base:8453' }, + }); + getMetrics().chainRpcTotal.add(1, { + rpc_method: 'eth_call', outcome: 'ok', retryable: false, chain_id: 'base:8453', + }); + + // shutdownTelemetry force-flushes the span batch + the metric reader. + await shutdownTelemetry(); + + await waitFor(() => hits['/v1/traces'] >= 1 && hits['/v1/metrics'] >= 1, 5000); + expect(hits['/v1/traces']).toBeGreaterThanOrEqual(1); + expect(hits['/v1/metrics']).toBeGreaterThanOrEqual(1); + } finally { + await new Promise((r) => server.close(() => r())); + } + }); +}); + +describe('metrics — bounded, low-cardinality attributes only', () => { + // NOTE: the END-TO-END proof that a REAL instrumented call site emits bounded + // labels lives in packages/chain/test/chain-rpc-telemetry.unit.test.ts (it + // drives the actual contractReadWithFailover seam). This test pins the + // allow-list contract at the facade level. 
+ it('emits counters/histograms with only allow-listed attribute keys', async () => { + const exporter = new InMemoryMetricExporter(AggregationTemporality.CUMULATIVE); + const mp = new MeterProvider({ + readers: [new PeriodicExportingMetricReader({ exporter, exportIntervalMillis: 60_000 })], + }); + metrics.setGlobalMeterProvider(mp); + rebuildMetrics(); + + const m = getMetrics(); + m.publishTotal.add(1, { outcome: 'failed', source: 'direct', chain_id: 'base:8453' }); + m.publishDuration.record(1234, { outcome: 'failed', source: 'direct', chain_id: 'base:8453' }); + m.chainRpcTotal.add(1, { rpc_method: 'eth_call', outcome: 'error', retryable: true, chain_id: 'base:8453' }); + m.ackPeerTotal.add(1, { result: 'decline', decline_code: 'NO_DATA_IN_SWM' }); + m.ackQuorumTotal.add(1, { outcome: 'timeout', chain_id: 'base:8453' }); + m.syncRequestTotal.add(1, { outcome: 'ok', protocol_id: '/dkg/10.0.2/sync' }); + m.protocolSendTotal.add(1, { outcome: 'ok', protocol_id: '/dkg/10.0.2/sync' }); + m.protocolSendDuration.record(5, { protocol_id: '/dkg/10.0.2/sync' }); + + await mp.forceFlush(); + + const keys = new Set(); + for (const rm of exporter.getMetrics()) + for (const sm of rm.scopeMetrics) + for (const metric of sm.metrics) + for (const dp of metric.dataPoints) for (const k of Object.keys(dp.attributes)) keys.add(k); + + const ALLOWED = new Set([ + 'outcome', 'source', 'chain_id', 'rpc_method', 'retryable', 'result', + 'decline_code', 'protocol_id', 'method', 'module', 'role', 'reason', 'error_type', + ]); + expect([...keys].filter((k) => !ALLOWED.has(k))).toEqual([]); + // high-cardinality keys must never be metric labels + for (const bad of ['peer_id', 'context_graph_id', 'tx_hash', 'operation_id', 'assertion_id', 'kaId', 'rpc_url']) { + expect(keys.has(bad)).toBe(false); + } + }); +}); + +describe('log ↔ trace correlation', () => { + it('Logger attaches the active span trace_id/span_id to the record', async () => { + const exporter = registerInMemoryTracer(); + 
const captured: Array<{ traceId?: string; spanId?: string }> = []; + Logger.setSink((e) => captured.push(e)); + const logger = new Logger('test'); + const ctx: OperationContext = { operationId: 'op-1', operationName: 'publish' }; + + await withSpan('agent.publish', () => { + logger.info(ctx, 'inside the span'); + }); + + expect(captured).toHaveLength(1); + expect(captured[0].traceId).toMatch(/^[0-9a-f]{32}$/); + expect(captured[0].spanId).toMatch(/^[0-9a-f]{16}$/); + expect(captured[0].traceId).toBe(exporter.getFinishedSpans()[0].spanContext().traceId); + }); + + it('Logger emits no trace ids when no span is active', () => { + trace.disable(); + const captured: Array<{ traceId?: string }> = []; + Logger.setSink((e) => captured.push(e)); + new Logger('test').info({ operationId: 'op-2', operationName: 'query' }, 'no span here'); + expect(captured).toHaveLength(1); + expect(captured[0].traceId).toBeUndefined(); + }); +}); diff --git a/packages/publisher/src/ack-collector.ts b/packages/publisher/src/ack-collector.ts index 731e8d4cd..71a14d3ad 100644 --- a/packages/publisher/src/ack-collector.ts +++ b/packages/publisher/src/ack-collector.ts @@ -9,6 +9,8 @@ import { isStorageACKDecline, isTransientStorageACKDeclineCode, isSubscriptionSource, + withSpan, + getMetrics, type PublishIntentMsg, type UpdateIntentMsg, type StorageACKMsg, @@ -115,6 +117,19 @@ function transientDeclineBackoffMs(retry: number): number { const MAX_DECLINE_CODE_CHARS = 64; const MAX_DECLINE_MESSAGE_CHARS = 240; +/** + * Map a terminal {@link QuorumUnmetError} to the low-cardinality + * `ackQuorumTotal` outcome label. The collector encodes the failure kind + * only in the embedded `legacyMessage` prefix (no structured `kind` + * field): `storage_ack_timeout*` ⇒ the round ran out of time, every other + * quorum-fail throw (`no connected core peers`, `quorum impossible`, + * `storage_ack_insufficient`) ⇒ quorum was/became impossible. 
+ */ +function quorumOutcomeFromError(err: QuorumUnmetError): 'timeout' | 'impossible' { + const msg = err.legacyMessage ?? err.message ?? ''; + return msg.includes('storage_ack_timeout') ? 'timeout' : 'impossible'; +} + function sanitizeDeclineField(value: string, maxChars: number): string { const compacted = value.replace(/[\u0000-\u001f\u007f]+/g, ' ').replace(/\s+/g, ' ').trim(); if (compacted.length <= maxChars) return compacted; @@ -223,6 +238,66 @@ export class ACKCollector { kaCount, rootEntities, chainId, kav10Address, } = params; const REQUIRED_ACKS = params.requiredACKs ?? DEFAULT_REQUIRED_ACKS; + const chainIdLabel = chainId != null ? chainId.toString() : undefined; + + return withSpan('publisher.ack_collect', async (span) => { + span.setAttributes({ + 'dkg.context_graph_id': contextGraphIdStr, + 'dkg.required_acks': REQUIRED_ACKS, + }); + try { + const result = await this.collectInner({ + merkleRoot, contextGraphId, contextGraphIdStr, + publisherPeerId, publicByteSize, isPrivate, + kaCount, rootEntities, chainId, kav10Address, + REQUIRED_ACKS, + params, + }); + span.setAttribute('dkg.collected_acks', result.acks.length); + getMetrics().ackQuorumTotal.add(1, { + outcome: 'reached', + ...(chainIdLabel ? { chain_id: chainIdLabel } : {}), + }); + return result; + } catch (err) { + // QuorumUnmetError throw is auto-recorded by withSpan (ERROR status). + // Map its failure kind to the terminal quorum metric outcome. + if (err instanceof QuorumUnmetError) { + getMetrics().ackQuorumTotal.add(1, { + outcome: quorumOutcomeFromError(err), + ...(chainIdLabel ? { chain_id: chainIdLabel } : {}), + }); + } + throw err; + } + }); + } + + /** + * Body of {@link collect}, split out so the public method is a thin + * `withSpan('publisher.ack_collect')` wrapper. Preserves the exact + * control flow + error propagation of the original method body. 
+ */ + private async collectInner(ctx: { + merkleRoot: Uint8Array; + contextGraphId: bigint; + contextGraphIdStr: string; + publisherPeerId: string; + publicByteSize: bigint; + isPrivate: boolean; + kaCount: number; + rootEntities: string[]; + chainId: bigint; + kav10Address: string; + REQUIRED_ACKS: number; + params: Parameters[0]; + }): Promise { + const { + merkleRoot, contextGraphId, contextGraphIdStr, + publisherPeerId, publicByteSize, isPrivate, + kaCount, rootEntities, chainId, kav10Address, + REQUIRED_ACKS, params, + } = ctx; const log = this.deps.log ?? (() => {}); if (!Number.isInteger(params.merkleLeafCount) || params.merkleLeafCount < 1) { @@ -656,8 +731,37 @@ export class ACKCollector { let declineRetries = 0; for (;;) { try { - const response = await this.deps.sendP2P(peerId, ackProtocolId, intentBytes); - const ack: StorageACKMsg = decodeStorageACK(response); + // publisher.ack_peer_request — one per-peer sendP2P attempt. + // Transport throw → withSpan auto-records ERROR; the catch below + // tags ackPeerTotal{result:'transport_error'}. A decline response + // is an expected negative (status stays OK, dkg.decline_code set). + const ack: StorageACKMsg = await withSpan( + 'publisher.ack_peer_request', + async (span) => { + const response = await this.deps.sendP2P(peerId, ackProtocolId, intentBytes); + const decoded: StorageACKMsg = decodeStorageACK(response); + if (isStorageACKDecline(decoded)) { + const declineCode = sanitizeDeclineField( + decoded.declineCode ?? 
'UNKNOWN', + MAX_DECLINE_CODE_CHARS, + ) || 'UNKNOWN'; + span.setAttribute('dkg.decline_code', declineCode); + getMetrics().ackPeerTotal.add(1, { + result: 'decline', + decline_code: declineCode, + }); + } else { + getMetrics().ackPeerTotal.add(1, { result: 'ack' }); + } + return decoded; + }, + { + attributes: { + 'dkg.protocol_id': ackProtocolId, + 'dkg.peer': peerId.slice(0, 8), + }, + }, + ); if (isStorageACKDecline(ack)) { const code = sanitizeDeclineField( @@ -747,7 +851,20 @@ export class ACKCollector { // the chain to check" (infra-side, retryable). Pre-PR every // failure surfaced as the same "not registered" string. if (this.deps.verifyIdentityDetailed) { - const verdict = await this.deps.verifyIdentityDetailed(recoveredAddress, identityId); + const verdict = await withSpan( + 'publisher.ack_verify_identity', + async (span) => { + const v = await this.deps.verifyIdentityDetailed!(recoveredAddress, identityId); + // A rejected identity is an expected negative, not a span + // error (status stays OK). withSpan only flips ERROR on throw. + span.setAttribute('dkg.verify_result', v.valid ? 'valid' : (v.reason ?? 'unknown')); + getMetrics().ackVerifyTotal.add(1, { + result: v.valid ? 'valid' : (v.reason ?? 'key-not-registered'), + }); + return v; + }, + { attributes: { 'dkg.identity_id': String(identityId) } }, + ); if (!verdict.valid) { const reason = sanitizeDeclineField(verdict.reason ?? 'unknown', MAX_DECLINE_CODE_CHARS) || 'unknown'; recordACKFailure(peerId, `ACK_VERIFY:${reason}`); @@ -758,7 +875,20 @@ export class ACKCollector { return null; } } else if (this.deps.verifyIdentity) { - const valid = await this.deps.verifyIdentity(recoveredAddress, identityId); + const valid = await withSpan( + 'publisher.ack_verify_identity', + async (span) => { + const ok = await this.deps.verifyIdentity!(recoveredAddress, identityId); + // Legacy boolean verifier surfaces no structured reason; an + // invalid result maps to the generic registration gate. 
+ span.setAttribute('dkg.verify_result', ok ? 'valid' : 'key-not-registered'); + getMetrics().ackVerifyTotal.add(1, { + result: ok ? 'valid' : 'key-not-registered', + }); + return ok; + }, + { attributes: { 'dkg.identity_id': String(identityId) } }, + ); if (!valid) { recordACKFailure(peerId, 'ACK_VERIFY:key-not-registered'); log(`[ACKCollector] Signer ${recoveredAddress.slice(0, 10)}... not registered for identity ${identityId} — rejecting ACK from ${peerId.slice(-8)}`); @@ -794,6 +924,7 @@ export class ACKCollector { } catch (err) { const msg = err instanceof Error ? err.message : String(err); transportAttempts += 1; + getMetrics().ackPeerTotal.add(1, { result: 'transport_error' }); if (transportAttempts < MAX_RETRIES) { if (roundIsOver()) { log( diff --git a/packages/publisher/src/storage-ack-handler.ts b/packages/publisher/src/storage-ack-handler.ts index 05a40229c..631c21c16 100644 --- a/packages/publisher/src/storage-ack-handler.ts +++ b/packages/publisher/src/storage-ack-handler.ts @@ -4,6 +4,10 @@ import { decodePublishIntent, decodeUpdateIntent, encodeStorageACK, + decodeStorageACK, + isStorageACKDecline, + withSpan, + getMetrics, computePublishACKDigest, computeUpdateACKDigest, assertSafeIri, @@ -300,8 +304,68 @@ export class StorageACKHandler { /** * Protocol stream handler for `/dkg/10.0.1/storage-ack`. * Receives PublishIntent, returns StorageACK. + * + * Wrapped in a fresh ROOT span (`publisher.storage_ack_handler`) — this + * is an inbound libp2p callback with no cross-node trace context. Kept + * MINIMAL (no per-step child spans) because it runs under libp2p stream + * backpressure. Classifies the terminal outcome (ack / decline / reset) + * for the `ackHandlerTotal` metric; a thrown error resets the stream and + * is auto-recorded as a span ERROR by withSpan. + */ + handler = async (data: Uint8Array, peerId: PeerId): Promise => { + const chainIdLabel = this.config.chainId != null + ? 
this.config.chainId.toString() + : undefined; + return withSpan('publisher.storage_ack_handler', async (span) => { + let cgIdAttr: string | undefined; + try { + // contextGraphId is cheap to read off the decoded intent for the span + // attribute; the full classification rides the encoded response below. + const intentPreview = decodePublishIntent(data); + cgIdAttr = intentPreview.contextGraphId; + if (cgIdAttr) span.setAttribute('dkg.context_graph_id', cgIdAttr); + } catch { + // Malformed request — handlePublishIntent will throw + reset below. + } + try { + const result = await this.handlePublishIntent(data, peerId); + const decoded = decodeStorageACK(result); + if (isStorageACKDecline(decoded)) { + const declineCode = decoded.declineCode || 'UNKNOWN'; + span.setAttribute('dkg.ack_outcome', 'decline'); + span.setAttribute('dkg.decline_code', declineCode); + getMetrics().ackHandlerTotal.add(1, { + outcome: 'decline', + decline_code: declineCode, + ...(chainIdLabel ? { chain_id: chainIdLabel } : {}), + }); + } else { + span.setAttribute('dkg.ack_outcome', 'ack'); + getMetrics().ackHandlerTotal.add(1, { + outcome: 'ack', + ...(chainIdLabel ? { chain_id: chainIdLabel } : {}), + }); + } + return result; + } catch (err) { + // Throw resets the libp2p stream — withSpan records ERROR. Tag the + // terminal outcome + metric, then re-throw to preserve control flow. + span.setAttribute('dkg.ack_outcome', 'reset'); + getMetrics().ackHandlerTotal.add(1, { + outcome: 'reset', + ...(chainIdLabel ? { chain_id: chainIdLabel } : {}), + }); + throw err; + } + }); + }; + + /** + * Original publish-intent handling body. Split out from {@link handler} + * so the public entry point is a thin `withSpan` wrapper; the logic here + * is byte-for-byte the pre-instrumentation behaviour. 
*/ - handler = async (data: Uint8Array, _peerId: PeerId): Promise => { + private handlePublishIntent = async (data: Uint8Array, _peerId: PeerId): Promise => { if (this.config.nodeRole !== 'core') { throw new Error('Only core nodes can issue StorageACKs'); } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index fda7dc775..5ad4d99ec 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -48,7 +48,7 @@ importers: version: 22.19.11 '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) esbench: specifier: ^0.8.1 version: 0.8.1(esbuild@0.27.7)(playwright-core@1.59.1)(rollup@4.60.4)(sucrase@3.35.1)(typescript@5.9.3)(vite@7.3.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) @@ -66,7 +66,7 @@ importers: version: 5.9.3 vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) demo: dependencies: @@ -100,7 +100,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/conviction-lazy-settle: dependencies: @@ -110,7 +110,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 
4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/core-peers-features: dependencies: @@ -120,13 +120,13 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/edge-update-flow: devDependencies: vitest: specifier: 4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/greenfield-10min: dependencies: @@ -142,7 +142,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/rfc51-publishing-allocation: dependencies: @@ -152,7 +152,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/rich-scenario: dependencies: @@ -165,7 +165,7 @@ importers: devDependencies: vitest: specifier: 
^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/v10-core-flows: dependencies: @@ -175,7 +175,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/v10-end-to-end: dependencies: @@ -185,7 +185,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/v10-rs-prune: dependencies: @@ -195,7 +195,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) devnet/v10-stress: dependencies: @@ -205,7 +205,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/adapter-elizaos: dependencies: @@ -215,10 +215,10 @@ 
importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/adapter-hermes: dependencies: @@ -228,10 +228,10 @@ importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/adapter-openclaw: dependencies: @@ -241,10 +241,10 @@ importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 
4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/agent: dependencies: @@ -263,6 +263,9 @@ importers: '@noble/hashes': specifier: ^2.2.0 version: 2.2.0 + '@opentelemetry/api': + specifier: ^1.9.1 + version: 1.9.1 '@origintrail-official/dkg-chain': specifier: workspace:* version: link:../chain @@ -296,10 +299,10 @@ importers: version: 4.0.9 '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/chain: dependencies: @@ -310,12 +313,18 @@ importers: specifier: ^6 version: 6.16.0(bufferutil@4.1.0)(utf-8-validate@5.0.10) devDependencies: + '@opentelemetry/api': + specifier: ^1.9.1 + version: 1.9.1 + '@opentelemetry/sdk-metrics': + specifier: ^2.8.0 + version: 2.8.0(@opentelemetry/api@1.9.1) '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 
4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) optionalDependencies: '@origintrail-official/dkg-evm-module': specifier: workspace:* @@ -386,10 +395,10 @@ importers: version: 1.26.1 '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/cli/test-fixtures/sample-kafka-extension: dependencies: @@ -473,6 +482,9 @@ importers: '@noble/hashes': specifier: ^2.2.0 version: 2.2.0 + '@opentelemetry/api': + specifier: ^1.9.1 + version: 1.9.1 js-yaml: specifier: ^4.1.1 version: 4.1.1 @@ -491,10 +503,10 @@ importers: version: 4.0.9 '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 
4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/epcis: dependencies: @@ -510,10 +522,10 @@ importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/evm-module: dependencies: @@ -620,7 +632,7 @@ importers: version: 19.2.14 '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) three: specifier: ^0.184.0 version: 0.184.0 @@ -635,7 +647,7 @@ importers: version: 6.4.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) optionalDependencies: jsonld: specifier: ^8.3.3 @@ -655,10 +667,10 @@ importers: version: link:../chain '@vitest/coverage-v8': specifier: ^4.0.18 - version: 
4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/mcp-dkg: dependencies: @@ -674,7 +686,7 @@ importers: devDependencies: vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.8.3) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.8.3) packages/network-sim: dependencies: @@ -696,7 +708,7 @@ importers: version: 4.7.0(vite@6.4.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) typescript: specifier: ^5.7.0 version: 5.9.3 @@ -705,10 +717,34 @@ importers: version: 6.4.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 
4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/node-ui: dependencies: + '@opentelemetry/api': + specifier: ^1.9.1 + version: 1.9.1 + '@opentelemetry/core': + specifier: ^2.8.0 + version: 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/exporter-metrics-otlp-proto': + specifier: ^0.219.0 + version: 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/exporter-trace-otlp-proto': + specifier: ^0.219.0 + version: 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': + specifier: ^2.8.0 + version: 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-metrics': + specifier: ^2.8.0 + version: 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-trace-node': + specifier: ^2.8.0 + version: 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/semantic-conventions': + specifier: ^1.41.1 + version: 1.41.1 '@openuidev/react-lang': specifier: ^0.2.3 version: 0.2.3(@modelcontextprotocol/sdk@1.27.1(zod@3.25.76))(react@19.2.4)(zod@3.25.76) @@ -769,7 +805,7 @@ importers: version: 4.7.0(vite@6.4.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) cross-env: specifier: ^10.1.0 version: 10.1.0 @@ -799,7 +835,7 @@ importers: version: 6.4.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 
4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/publisher: dependencies: @@ -824,10 +860,10 @@ importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/query: dependencies: @@ -840,10 +876,10 @@ importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/random-sampling: dependencies: @@ -862,13 +898,13 @@ importers: version: link:../publisher '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 
4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) ethers: specifier: ^6 version: 6.16.0(bufferutil@4.1.0)(utf-8-validate@5.0.10) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages/storage: dependencies: @@ -881,10 +917,10 @@ importers: devDependencies: '@vitest/coverage-v8': specifier: ^4.0.18 - version: 4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) + version: 4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) vitest: specifier: ^4.0.18 - version: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + version: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) packages: @@ -1919,6 +1955,90 @@ packages: resolution: {integrity: sha512-q4n32/FNKIhQ3zQGGw5CvPF6GTvDCpYwIf7bEY/dZTZbgfDsHyjJwURxUJf3VQuuJj+fDIFl4+KkBVbw4Ef6jA==} engines: {node: '>= 12'} + '@opentelemetry/api-logs@0.219.0': + resolution: {integrity: sha512-FFx7YnaYJlIjqWW/AG/yAZ0L/NEY724PipXXXQLdtZPbLwBGbUMTGL1i/esI56TWfTUXxhLfpgrnWJCG8aUJyg==} + engines: {node: '>=8.0.0'} + + '@opentelemetry/api@1.9.1': + resolution: {integrity: sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==} + engines: {node: '>=8.0.0'} + + '@opentelemetry/context-async-hooks@2.8.0': + resolution: {integrity: 
sha512-/3FIraneMcng67SUJCxvyInk/oxzwsxyadufk0wwfOBLf5wqtAGX4MoQASwSbndBPeARzBryUM9Azr5kHIdWLw==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/core@2.8.0': + resolution: {integrity: sha512-hd1Lfh8p545nNz+jq1Ejfz+Mn1hyLuxYn1YzTfFNrxr8urEWMNQLPf1Th8kjOH+HxwawCrtgBp8JpBUR4ZSgww==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/exporter-metrics-otlp-http@0.219.0': + resolution: {integrity: sha512-6CaDRbMVHZSDWzNXwrR8y/H4B/Z1eMNnkHiPQlTx3Ojz2OHY4X/aff/UC4P/3pHUQSuTfi3oh2UsPPZppw+Vrg==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/exporter-metrics-otlp-proto@0.219.0': + resolution: {integrity: sha512-DUS7XyIiEnoeccQUvuKy0G2/YqeKhpN8FVIrGbrLNIVMj10yeIFLRzRv0tibCI2kXXvlTTABVexGAk78wHk2ug==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/exporter-trace-otlp-proto@0.219.0': + resolution: {integrity: sha512-lF/LUBfhOFmxJa+SQsLN7ziV4MHa2pyKgOM6JNehSOfU+npjM4gwm9oIKEJrzrWcexMcqydiyoFy0XCb1Ql3wQ==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/otlp-exporter-base@0.219.0': + resolution: {integrity: sha512-zvIxQX/AZUVKDU+hCuYx+7UkiP7GRdnk1ZbFQRYzHvYp47cAWR4j3IhoPhV9KaeXEv2xdGq3IA6PnpzDmLcmSA==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/otlp-transformer@0.219.0': + resolution: {integrity: sha512-aaYKAyXhw9VchKZVGOopD3Gw/kPsyrX2c6IQ0AW32mTjqmZOh5Y6Gf5OYqTNqVktAeBjmFinhyFaCwW6GYK9YQ==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + + '@opentelemetry/resources@2.8.0': + resolution: {integrity: sha512-qmXQ27ilDbUK/vGMqwL8D4/rhn76C+sherM4wTbjlfknR8Nvfc/hCxjRJPhkzZzUsPiNg16SA31NxMabwttRjg==} + engines: {node: ^18.19.0 || >=20.6.0} + 
peerDependencies: + '@opentelemetry/api': '>=1.3.0 <1.10.0' + + '@opentelemetry/sdk-logs@0.219.0': + resolution: {integrity: sha512-s6lTKRakaPClvKoWHRChxnXjDMkM/TQ30ff78jN6EBGf7MI7VzANE5PU3f4z9qDUudWjvZjOLHG0rBnBKYvoXA==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': '>=1.4.0 <1.10.0' + + '@opentelemetry/sdk-metrics@2.8.0': + resolution: {integrity: sha512-UDBGaj6W0Rgy5rTTaoxs8gVGF/aGkAKyjurJv7se6wjRxJu7FoquTLT/vt54DZfo4crbprYfhX/SOK9+BPw1qg==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': '>=1.9.0 <1.10.0' + + '@opentelemetry/sdk-trace-base@2.8.0': + resolution: {integrity: sha512-mhU4jp+vW0mGbFRd+GeXHvmfA4aDqWjBjLC3pE5XMpLs0IE2ryYb019Ts2AQrOq67gaTF25D91+fgvEHDZEnuQ==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': '>=1.3.0 <1.10.0' + + '@opentelemetry/sdk-trace-node@2.8.0': + resolution: {integrity: sha512-nZt9OGufioAc3AfoLTqA9bsAeaMJAictYDdI2VcNQ+PmT+3rfKjAZDZvgPfd8VPX0O5Bw1hdQF6kDK8VSpZiWg==} + engines: {node: ^18.19.0 || >=20.6.0} + peerDependencies: + '@opentelemetry/api': '>=1.0.0 <1.10.0' + + '@opentelemetry/semantic-conventions@1.41.1': + resolution: {integrity: sha512-/UhIkaZgPutTFmQ7RnIJGgDXZmtEJ7Dvi86xNTFWcnRxVRNk/aotsqDJYeEvDP+FSMB2SdW+pQzNMcWP0rwuNA==} + engines: {node: '>=14'} + '@openuidev/lang-core@0.2.2': resolution: {integrity: sha512-0er0fsoesuWR06OemF7Gsd2CzW22nEZ+8TQfRNcwp3qe74U2cuD5XD1qHKLUM+rkdLxeMaUoVm5h1wlnJYDU0Q==} peerDependencies: @@ -7929,6 +8049,101 @@ snapshots: '@nomicfoundation/solidity-analyzer-linux-x64-musl': 0.1.2 '@nomicfoundation/solidity-analyzer-win32-x64-msvc': 0.1.2 + '@opentelemetry/api-logs@0.219.0': + dependencies: + '@opentelemetry/api': 1.9.1 + + '@opentelemetry/api@1.9.1': {} + + '@opentelemetry/context-async-hooks@2.8.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + + '@opentelemetry/core@2.8.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + 
'@opentelemetry/semantic-conventions': 1.41.1 + + '@opentelemetry/exporter-metrics-otlp-http@0.219.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-exporter-base': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-transformer': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-metrics': 2.8.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/exporter-metrics-otlp-proto@0.219.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/exporter-metrics-otlp-http': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-exporter-base': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-transformer': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-metrics': 2.8.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/exporter-trace-otlp-proto@0.219.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-exporter-base': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-transformer': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-trace-base': 2.8.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/otlp-exporter-base@0.219.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/otlp-transformer': 0.219.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/otlp-transformer@0.219.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/api-logs': 0.219.0 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 
2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-logs': 0.219.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-metrics': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-trace-base': 2.8.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/resources@2.8.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/semantic-conventions': 1.41.1 + + '@opentelemetry/sdk-logs@0.219.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/api-logs': 0.219.0 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/semantic-conventions': 1.41.1 + + '@opentelemetry/sdk-metrics@2.8.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/sdk-trace-base@2.8.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/resources': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/semantic-conventions': 1.41.1 + + '@opentelemetry/sdk-trace-node@2.8.0(@opentelemetry/api@1.9.1)': + dependencies: + '@opentelemetry/api': 1.9.1 + '@opentelemetry/context-async-hooks': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/core': 2.8.0(@opentelemetry/api@1.9.1) + '@opentelemetry/sdk-trace-base': 2.8.0(@opentelemetry/api@1.9.1) + + '@opentelemetry/semantic-conventions@1.41.1': {} + '@openuidev/lang-core@0.2.2(@modelcontextprotocol/sdk@1.27.1(zod@3.25.76))(zod@3.25.76)': dependencies: zod: 3.25.76 @@ -8615,7 +8830,7 @@ snapshots: transitivePeerDependencies: - supports-color - '@vitest/coverage-v8@4.0.18(vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0))': + 
'@vitest/coverage-v8@4.0.18(vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0))': dependencies: '@bcoe/v8-coverage': 1.0.2 '@vitest/utils': 4.0.18 @@ -8627,7 +8842,7 @@ snapshots: obug: 2.1.1 std-env: 3.10.0 tinyrainbow: 3.0.3 - vitest: 4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) + vitest: 4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) '@vitest/expect@4.0.18': dependencies: @@ -12996,7 +13211,7 @@ snapshots: tsx: 4.21.0 yaml: 2.9.0 - vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.8.3): + vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.8.3): dependencies: '@vitest/expect': 4.0.18 '@vitest/mocker': 4.0.18(vite@7.3.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) @@ -13019,6 +13234,7 @@ snapshots: vite: 7.3.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.8.3) why-is-node-running: 2.3.0 optionalDependencies: + '@opentelemetry/api': 1.9.1 '@types/node': 22.19.11 happy-dom: 20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10) transitivePeerDependencies: @@ -13034,7 +13250,7 @@ snapshots: - tsx - yaml - vitest@4.0.18(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0): + vitest@4.0.18(@opentelemetry/api@1.9.1)(@types/node@22.19.11)(happy-dom@20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10))(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0): dependencies: '@vitest/expect': 4.0.18 '@vitest/mocker': 4.0.18(vite@7.3.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0)) @@ -13057,6 +13273,7 @@ snapshots: vite: 
7.3.2(@types/node@22.19.11)(jiti@2.7.0)(tsx@4.21.0)(yaml@2.9.0) why-is-node-running: 2.3.0 optionalDependencies: + '@opentelemetry/api': 1.9.1 '@types/node': 22.19.11 happy-dom: 20.8.9(bufferutil@4.1.0)(utf-8-validate@5.0.10) transitivePeerDependencies: diff --git a/tools/log-collection-poc/OPERATOR-GUIDE.md b/tools/log-collection-poc/OPERATOR-GUIDE.md new file mode 100644 index 000000000..6e762cbf3 --- /dev/null +++ b/tools/log-collection-poc/OPERATOR-GUIDE.md @@ -0,0 +1,67 @@ +# Forwarding your DKG node logs (opt-in) + +Every DKG node keeps full local logs by default (SQLite + `~/.dkg/daemon.log`). +If you *also* want to forward them to your own log backend (or an +OriginTrail-provided collector), enable the OTLP exporter. **Forwarding is off +until you turn it on**, and secrets (wallet keys, mnemonics, tokens) are +redacted on the node before anything is sent. + +## Enable it +Add to your `config.json`: + +```json +"name": "my-node-01", +"telemetry": { + "enabled": true, + "logs": { + "exporter": "otlp", + "endpoint": "https://<your-collector-host>/v1/logs", + "token": "", + "level": "info" + } +} +``` + +| Field | Meaning | +|---|---| +| `name` | Your node's name — becomes the `service_instance_id` label, i.e. how you pick this node in Grafana. Use a unique value. | +| `telemetry.enabled` | Master switch. `false` (default) = nothing leaves the node. | +| `logs.exporter` | `otlp` (recommended), `syslog` (legacy Graylog), or `none` (local only). **If omitted it defaults to `syslog`** — set `otlp` explicitly to forward via OTLP. | +| `logs.endpoint` | Your OTLP/HTTP logs URL (an OpenTelemetry Collector, Grafana Alloy, or Loki ≥3.0 `/otlp/v1/logs`). | +| `logs.token` | Optional bearer token sent as `Authorization: Bearer …`. | +| `logs.level` | Minimum level forwarded (`debug`/`info`/`warn`/`error`). Default `info` — `debug` stays local. | +| `logs.redact` | Extra sensitive key names to scrub from messages, on top of the built-in set. | + +Restart the node.
It now pushes redacted, structured logs to your collector; +local logging is unchanged. The exporter is non-blocking and buffered — if your +collector is down, the node keeps running and the oldest buffered logs are dropped, never +queued unboundedly. + +## What gets sent +- **Resource labels:** `service.name=dkg-node`, `service.instance.id=<node name>`, `deployment.environment=<network>`, `dkg.node.role`, `dkg.chain` (matches the traces/metrics resource). Loki sanitizes dots to underscores, so these appear as `service_name`, `service_instance_id`, `deployment_environment`, `dkg_node_role`, `dkg_chain`. +- **Per-record attributes:** `dkg.operation_id`, `dkg.operation_name`, `dkg.source_operation_id`, `dkg.module`, severity, plus `trace_id`/`span_id` when emitted inside a span. +- **Body:** the log message, with secrets already redacted. + +## Traces & metrics (optional) +Logs go through a hand-rolled OTLP/HTTP exporter; **traces and metrics use the +stable OpenTelemetry SDK** exporters and are configured the same way under +`telemetry`: + +```json +"telemetry": { + "enabled": true, + "logs": { "exporter": "otlp", "endpoint": "https://<your-collector-host>/v1/logs", "level": "info" }, + "traces": { "endpoint": "https://<your-collector-host>/v1/traces", "sampleRatio": 1 }, + "metrics": { "endpoint": "https://<your-collector-host>/v1/metrics", "exportIntervalMs": 30000 } +} +``` + +Each signal is independent and stays off until it has an endpoint (or set +`"enabled": false` to disable one explicitly). Point traces at Tempo/any OTLP +traces backend and metrics at a Prometheus/Mimir-backed collector. + +## Viewing +Point Grafana at your log store and import `production/grafana-dashboard-dkg-node-logs.json` +(per-node) and/or `production/grafana-dashboard-dkg-fleet-logs.json` (fleet), +then pick your node and a time range. If your store is Loki < 3.0, front it with +Grafana Alloy (see `production/RUNBOOK.md`).
diff --git a/tools/log-collection-poc/README.md b/tools/log-collection-poc/README.md new file mode 100644 index 000000000..295383fc4 --- /dev/null +++ b/tools/log-collection-poc/README.md @@ -0,0 +1,93 @@ +# DKG V10 core-node log collection — local PoC stack + +Reference self-host backend for the **"Enable log collection on core nodes"** +design: an **OpenTelemetry Collector** (the engine Grafana Alloy wraps) in front +of **Grafana Loki**, viewed in **Grafana**. + +``` +DKG node ──OTLP/HTTP :4318──▶ OTel Collector ──OTLP──▶ Loki ◀──query── Grafana :3000 + (at-source redaction) (batch, opt. backstop redact) (store) (Explore) +``` + +The node side is implemented in this branch: + +| Concern | Where | +|---|---| +| Canonical structured record | `packages/core/src/logger.ts` (`LogRecord`, `Logger.setSink`) | +| At-source secret redaction | `packages/core/src/log-redaction.ts` (`createLogRedactor`) | +| OTLP/HTTP exporter (buffer, backoff, non-blocking) | `packages/node-ui/src/otlp-log-worker.ts` (`OtlpLogWorker`) | +| Fan-out + config + toggles | `packages/cli/src/daemon/lifecycle.ts`, `packages/cli/src/config.ts` | + +## 1. Bring up the stack + +```bash +cd tools/log-collection-poc +docker compose up -d +# Grafana: http://localhost:3000 (anonymous admin) +# Collector OTLP/HTTP: http://localhost:4318 +# Loki API: http://localhost:3100 +``` + +## 2a. Send sample logs (no node needed) + +Drives the **real** redactor + exporter exactly as the daemon does (one sample +carries a fake private key + mnemonic to prove redaction): + +```bash +# from repo root, after `pnpm turbo run build --filter=@origintrail-official/dkg-node-ui...` +node tools/log-collection-poc/send-sample-logs.mjs +``` + +## 2b. 
…or point a real node at it + +Add to your `config.json` (or `/config.json`): + +```json +"telemetry": { + "enabled": true, + "logs": { + "exporter": "otlp", + "endpoint": "http://localhost:4318/v1/logs", + "level": "info" + } +} +``` + +`enabled` is the master gate (off by default → nothing leaves the node). +`exporter: "otlp"` selects this path (`"syslog"` = legacy Graylog, `"none"` = +local only). Local SQLite + `daemon.log` keep full-fidelity logs regardless. + +## 3. View in Grafana + +Open **http://localhost:3000 → Explore → Loki** and run: + +```logql +{service_name="dkg-node"} +``` + +DKG fields ride along as **structured metadata** — filter with e.g.: + +```logql +{service_name="dkg-node"} | dkg_network = `devnet` | dkg_operation_name = `publish` +``` + +Correlate a cross-node operation by its id: + +```logql +{service_name="dkg-node"} | dkg_operation_id = `op-pub-1` +``` + +You can also confirm ingest straight from the collector log +(`docker compose logs otel-collector`) — the `debug` exporter prints every +received record. **The wallet sample line must show `[REDACTED]`** — no +`0xdeadbeef…`, no mnemonic words — proving redaction happened on the node. + +## Notes + +- **Alloy** is the production-grade swap for the OTel Collector here (same OTLP + receiver → Loki path); Promtail is EOL (March 2026) — don't use it. +- The collector config has a commented-out **OTTL backstop redaction** block — + defense-in-depth for any bare key material that reaches the body without a + key-name (the node redactor is conservative by design to avoid nuking public + 0x hashes / Merkle roots). +- Tear down: `docker compose down -v`. diff --git a/tools/log-collection-poc/docker-compose.yml b/tools/log-collection-poc/docker-compose.yml new file mode 100644 index 000000000..67e824eb3 --- /dev/null +++ b/tools/log-collection-poc/docker-compose.yml @@ -0,0 +1,43 @@ +# Local reference log-collection stack for DKG V10 core nodes. 
+# +# node --OTLP/HTTP--> OTel Collector --OTLP--> Loki <--query-- Grafana +# +# This is the recommended self-host backend from the "Enable log collection on +# core nodes" design: an OpenTelemetry Collector (the engine Grafana Alloy +# wraps) in front of Loki, viewed in Grafana. Bring it up with: +# +# docker compose up -d +# +# Then point a node at it (see README.md) or run `node send-sample-logs.mjs`. +services: + loki: + image: grafana/loki:3.3.2 + command: -config.file=/etc/loki/loki-config.yaml + ports: + - "3100:3100" + volumes: + - ./loki-config.yaml:/etc/loki/loki-config.yaml:ro + + otel-collector: + image: otel/opentelemetry-collector-contrib:0.117.0 + command: ["--config=/etc/otelcol/config.yaml"] + ports: + - "4318:4318" # OTLP/HTTP — point the node here + - "4317:4317" # OTLP/gRPC + volumes: + - ./otel-collector-config.yaml:/etc/otelcol/config.yaml:ro + depends_on: + - loki + + grafana: + image: grafana/grafana:11.4.0 + ports: + - "3000:3000" + environment: + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_AUTH_DISABLE_LOGIN_FORM=true + volumes: + - ./grafana-datasources.yaml:/etc/grafana/provisioning/datasources/ds.yaml:ro + depends_on: + - loki diff --git a/tools/log-collection-poc/grafana-dashboard-dkg-node-logs.json b/tools/log-collection-poc/grafana-dashboard-dkg-node-logs.json new file mode 100644 index 000000000..c40ec2395 --- /dev/null +++ b/tools/log-collection-poc/grafana-dashboard-dkg-node-logs.json @@ -0,0 +1,101 @@ +{ + "title": "DKG Node Logs", + "uid": "dkg-node-logs", + "tags": ["dkg", "logs"], + "schemaVersion": 39, + "version": 1, + "editable": true, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": { + "refresh_intervals": ["10s", "30s", "1m", "5m", "15m"], + "time_options": ["15m", "1h", "3h", "6h", "12h", "24h", "2d", "7d"] + }, + "templating": { + "list": [ + { + "name": "loki", + "label": "Loki", + "type": "datasource", + "query": "loki", + "refresh": 1, + "hide": 0, + "current": {} + }, + 
{ + "name": "env", + "label": "Environment", + "type": "query", + "datasource": { "type": "loki", "uid": "${loki}" }, + "definition": "label_values(deployment_environment)", + "query": "label_values(deployment_environment)", + "refresh": 2, + "sort": 1, + "includeAll": true, + "allValue": ".+", + "multi": false, + "current": { "text": "All", "value": "$__all" } + }, + { + "name": "node", + "label": "Node", + "type": "query", + "datasource": { "type": "loki", "uid": "${loki}" }, + "definition": "label_values({deployment_environment=~\"$env\"}, service_instance_id)", + "query": "label_values({deployment_environment=~\"$env\"}, service_instance_id)", + "refresh": 2, + "sort": 1, + "includeAll": false, + "multi": false, + "current": {} + }, + { + "name": "search", + "label": "Filter (regex)", + "type": "textbox", + "query": "", + "current": { "text": "", "value": "" } + } + ] + }, + "panels": [ + { + "type": "logs", + "title": "Logs — $node ($env)", + "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 21, "w": 24, "x": 0, "y": 1 }, + "options": { + "showTime": true, + "showLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "${loki}" }, + "queryType": "range", + "expr": "{service_instance_id=\"$node\"} |~ `(?i)$search`" + } + ] + }, + { + "type": "timeseries", + "title": "Log volume by level — $node", + "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 22 }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "bars", "fillOpacity": 60, "stacking": { "mode": "normal" } } }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "${loki}" }, + "queryType": "range", + "expr": "sum by (detected_level) (count_over_time({service_instance_id=\"$node\"} |~ `(?i)$search` [$__auto]))", 
+ "legendFormat": "{{detected_level}}" + } + ] + } + ] +} diff --git a/tools/log-collection-poc/grafana-datasources.yaml b/tools/log-collection-poc/grafana-datasources.yaml new file mode 100644 index 000000000..079406c1c --- /dev/null +++ b/tools/log-collection-poc/grafana-datasources.yaml @@ -0,0 +1,10 @@ +apiVersion: 1 +datasources: + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + jsonData: + # Surface OTLP structured metadata (dkg.*) as derived fields in Explore. + derivedFields: [] diff --git a/tools/log-collection-poc/loki-config.yaml b/tools/log-collection-poc/loki-config.yaml new file mode 100644 index 000000000..39565ab43 --- /dev/null +++ b/tools/log-collection-poc/loki-config.yaml @@ -0,0 +1,42 @@ +# Minimal single-binary Loki (filesystem storage) for local PoC use only. +# NOT a production config — no retention/compaction tuning, in-memory ring. +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: warn + +common: + instance_addr: 127.0.0.1 + path_prefix: /tmp/loki + storage: + filesystem: + chunks_directory: /tmp/loki/chunks + rules_directory: /tmp/loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + # Required so OTLP resource/log attributes land as structured metadata. + allow_structured_metadata: true + volume_enabled: true + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 diff --git a/tools/log-collection-poc/mock-otlp-collector.mjs b/tools/log-collection-poc/mock-otlp-collector.mjs new file mode 100644 index 000000000..e50344d7e --- /dev/null +++ b/tools/log-collection-poc/mock-otlp-collector.mjs @@ -0,0 +1,31 @@ +/** + * Minimal OTLP/HTTP collector for local verification (no Docker). 
Accepts the + * three signal endpoints, records raw payloads to /tmp/otlp_*.bin, and logs + * receipts. Protobuf encodes string fields inline as UTF-8, so span/metric + * names are greppable in the captured bytes. + */ +import http from 'node:http'; +import fs from 'node:fs'; + +for (const s of ['traces', 'metrics', 'logs']) { + try { fs.rmSync(`/tmp/otlp_${s}.bin`); } catch {} +} +const counts = { traces: 0, metrics: 0, logs: 0, other: 0 }; + +const server = http.createServer((req, res) => { + const chunks = []; + req.on('data', (c) => chunks.push(c)); + req.on('end', () => { + const buf = Buffer.concat(chunks); + const sig = req.url.includes('/v1/traces') ? 'traces' + : req.url.includes('/v1/metrics') ? 'metrics' + : req.url.includes('/v1/logs') ? 'logs' : 'other'; + counts[sig]++; + if (sig !== 'other') fs.appendFileSync(`/tmp/otlp_${sig}.bin`, buf); + console.log(`[otlp] ${sig.padEnd(7)} POST ${String(buf.length).padStart(6)}B (total ${sig}=${counts[sig]})`); + res.statusCode = 200; + res.setHeader('content-type', 'application/x-protobuf'); + res.end(); + }); +}); +server.listen(4318, '127.0.0.1', () => console.log('mock OTLP collector listening on http://127.0.0.1:4318')); diff --git a/tools/log-collection-poc/otel-collector-config.yaml b/tools/log-collection-poc/otel-collector-config.yaml new file mode 100644 index 000000000..7ea2883f3 --- /dev/null +++ b/tools/log-collection-poc/otel-collector-config.yaml @@ -0,0 +1,46 @@ +# Receives OTLP logs from DKG nodes and forwards them to Loki's native OTLP +# endpoint. `debug` exporter prints received records to the collector log so +# you can confirm ingest without opening Grafana. +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + batch: {} + # OPTIONAL collector-side backstop redaction for the free-form log body + # (defense-in-depth on top of the node's at-source redaction). 
Uncomment to + # scrub any bare key material that slipped through without a key-name. + # transform: + # log_statements: + # - context: log + # statements: + # - replace_pattern(body.string, "0x[0-9a-fA-F]{64}", "[REDACTED-HEX]") + +exporters: + otlphttp/loki: + endpoint: http://loki:3100/otlp + debug: + verbosity: detailed + +service: + pipelines: + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp/loki, debug] + # Traces + metrics from the node's OTel SDK. The `debug` exporter prints + # them to the collector log (docker compose logs otel-collector) so you can + # confirm export without a Tempo/Prometheus backend. Swap in otlphttp → + # Tempo (traces) and a prometheus exporter/remote-write (metrics) for prod. + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [debug] diff --git a/tools/log-collection-poc/production/MANAGER-HANDOFF.md b/tools/log-collection-poc/production/MANAGER-HANDOFF.md new file mode 100644 index 000000000..8a331c546 --- /dev/null +++ b/tools/log-collection-poc/production/MANAGER-HANDOFF.md @@ -0,0 +1,35 @@ +# Core-node log collection — status & what's needed to finish + +**Goal:** in Grafana (polaris), pick a hosted node and see its logs over the last X hours. + +## TL;DR +The feature is **built, merged-ready, and verified end-to-end against a real node and a real Loki 2.5.0** (same version polaris runs). Everything that can be done without production access is done. **To go live we need three access-gated actions on the polaris/Loki host + Cloudflare + node deploys** — listed at the bottom. None require further code. + +## What's DONE (code) — PR #1317 (`feat/core-node-log-collection`) +- Nodes can ship logs via **OpenTelemetry (OTLP/HTTP)**, opt-in, **off by default**. Local logging (SQLite + daemon.log) is unchanged. 
+- **Secret redaction at the source** — wallet keys, mnemonics, tokens, JWTs become `[REDACTED]` before any log leaves the node; public hashes/Merkle roots are kept. +- **Per-node identity** emitted as labels (`service_instance_id` = node name, `deployment_environment` = testnet/mainnet) → drives the Grafana "pick a node" dropdown. +- Non-blocking exporter (bounded buffer, retry/backoff) — telemetry can never slow or crash a node. +- Reference ingest for the existing **Loki 2.5.0**: a **Grafana Alloy** bridge (Loki 2.5.0 predates native OTLP) + a ready dashboard + full runbook. + +## What's VERIFIED (evidence) +- ✅ 17 unit/integration tests (redaction + OTLP exporter over a real HTTP server) + full core suite (1114) green; whole monorepo builds. +- ✅ **Real `dkg` daemon** booted with telemetry on → emitted real operational logs (`Syncing from peer…`, `Reconnect-on-gossip…`) that landed in **Loki 2.5.0** via Alloy, queryable by node. +- ✅ Redaction confirmed through the Alloy→Loki-2.5.0 bridge (a planted wallet key + mnemonic arrived `[REDACTED]`). +- ✅ Per-node + per-environment + level labels confirmed; the dashboard's exact LogQL returns clean message lines + a level-volume breakdown. +- ✅ **"DKG Node Logs" dashboard already imported into polaris** (`/d/dkg-node-logs`, bound to the `LOKI` datasource). Empty only until a node ships. + +## What's BLOCKED — needs access we don't have (the ask) +1. **Loki host (the polaris box):** run Grafana Alloy there (Docker or systemd — files provided), pointed at the local Loki `127.0.0.1:3100`. → `docker-compose.alloy.yml` / `alloy.systemd.service` + `config.alloy`. +2. **Cloudflare:** publish an ingest hostname (e.g. `logs-ingest.xtrmstrngth.com`) → `localhost:4318` via a tunnel, **no Access policy**, with a bearer-token WAF rule. → `cloudflared-config.example.yml` + token (generated separately, kept out of git). +3. 
**Node configs / deploy pipeline:** add the `telemetry.logs` block (unique `name` per node) to each hosted testnet+mainnet node on a build that includes PR #1317, and restart. → `node-config.example.json`. + +After 1–3, run `smoke-test.sh` to confirm, then open `/d/dkg-node-logs` and pick a node. + +## Decisions we made (so managers don't have to) +- **OTLP over rebuilding Graylog** — vendor-neutral, redaction + structured fields, reuses existing seams. +- **Alloy bridge, not a Loki upgrade** — avoids touching the shared production Loki 2.5.0. +- **Auth at the Cloudflare edge**, not in Alloy — simpler and version-robust. +- **Off by default + opt-in + redaction** — correct for a network where nodes are independently operated (and good hygiene for our own fleet). + +_Files in this folder: `config.alloy`, `docker-compose.alloy.yml`, `alloy.systemd.service`, `cloudflared-config.example.yml`, `node-config.example.json`, `smoke-test.sh`, `RUNBOOK.md` (step-by-step), `docker-compose.sim.yml` (local validation only)._ diff --git a/tools/log-collection-poc/production/RUNBOOK.md b/tools/log-collection-poc/production/RUNBOOK.md new file mode 100644 index 000000000..67286fba2 --- /dev/null +++ b/tools/log-collection-poc/production/RUNBOOK.md @@ -0,0 +1,74 @@ +# Production runbook — DKG node logs → existing Grafana (polaris) + +Goal: in Grafana, pick a node and see its logs for the last X hours. + +**What already exists on your side** (verified 2026-06-24): +- Grafana at `https://polaris.xtrmstrngth.com` with a **Loki 2.5.0** datasource (`LOKI`, uid `RuMYFlL7z`) running on the **same host** (`http://127.0.0.1:3100`), already fed by file-tailing (job=dkg-engine, etc.). +- The **"DKG Node Logs"** dashboard is already imported (`/d/dkg-node-logs`), bound to that Loki. It's empty until a node ships. 
+ +**Architecture** (chosen because Loki 2.5.0 can't ingest OTLP natively): +``` +DKG node ──OTLP/HTTP(+token)──▶ Cloudflare hostname ──▶ Alloy (on Loki host) ──Loki push──▶ Loki 2.5.0 ──▶ Grafana + at-source redaction OTLP→Loki, promotes node labels +``` +Verified locally end-to-end against Loki 2.5.0: per-node label `service_instance_id`, `deployment_environment`, `level`; secrets arrive `[REDACTED]`; the dashboard's queries return clean lines + level volume. + +--- + +## Step 1 — run Alloy on the Loki host (the polaris box) +Copy `config.alloy` + `docker-compose.alloy.yml` to the host, then: +```bash +docker compose -f docker-compose.alloy.yml up -d +docker logs --tail 20 <alloy-container> # should say "now listening", no errors +``` +Alloy now listens on `0.0.0.0:4318` (OTLP/HTTP) — as set in `config.alloy` — and writes to the local Loki. (Uses `network_mode: host` to reach `127.0.0.1:3100`.) **Firewall TCP 4318 from the public internet**; only `cloudflared` on localhost (Step 2) should reach it. Both a reverse proxy and the tunnel connect to it via `localhost:4318`. + +## Step 2 — expose an ingest hostname via Cloudflare (no Access) +Nodes can't do interactive SSO, so this hostname must NOT have a Cloudflare Access policy. Recommended: a **Cloudflare Tunnel** (`cloudflared`) on the host: +```bash +cloudflared tunnel create dkg-logs +# route a hostname → the local Alloy OTLP port: +cloudflared tunnel route dns dkg-logs logs-ingest.xtrmstrngth.com +# ingress config: hostname logs-ingest.xtrmstrngth.com → service http://localhost:4318 +cloudflared tunnel run dkg-logs +``` +- **Do NOT** put a Cloudflare Access policy on `logs-ingest.xtrmstrngth.com`. +- **Firewall** TCP 4318 from the public internet (only `cloudflared` on localhost reaches it).
+- **Auth:** pick a long random token (`openssl rand -hex 32`) and add a Cloudflare **WAF custom rule** (Security → WAF → Custom rules) — **exact expression** (Block action): + ``` + (http.host eq "logs-ingest.xtrmstrngth.com" and not any(http.request.headers["authorization"][*] eq "Bearer <TOKEN>")) + ``` + This blocks every request to the ingest hostname that doesn't carry the exact bearer token. Put the same token in each node's `telemetry.logs.token`. (Verified end-to-end in your real Loki/Grafana via a proxy push on 2026-06-24 — the dashboard renders per-node, redacted lines.) + +(Alternative without Cloudflare Tunnel: terminate TLS at your existing reverse proxy and `proxy_pass` the hostname to `127.0.0.1:4318`, enforcing the bearer header there.) + +## Step 3 — point each hosted node at it +On every OriginTrail-hosted node (running a build with PR #1317), add the block from `node-config.example.json` to its `config.json`: +- `name`: **unique per node** (e.g. `testnet-core-01`, `mainnet-core-02`) — this is the Grafana Node selector value. +- `telemetry.logs.endpoint`: `https://logs-ingest.xtrmstrngth.com/v1/logs` +- `telemetry.logs.token`: the `<TOKEN>` from Step 2. + +> **⚠️ You MUST set `telemetry.logs.exporter: "otlp"`.** If `logs.exporter` is +> left unset, a node defaults to the **legacy syslog/Graylog** exporter, not +> OTLP — so nothing reaches Alloy/Loki and the dashboards stay empty. The infra +> templates must emit `"logs": { "exporter": "otlp", … }` explicitly on every +> hosted node. + +Restart the node. Local logging (SQLite + daemon.log) is unaffected; this only adds the redacted OTLP copy. + +**Logs vs traces/metrics (different transports, same endpoint host):** logs ship via a hand-rolled **OTLP/HTTP JSON** exporter (the OTel Logs SDK is still "Development"), while **traces and metrics use the stable OTel SDK** OTLP/protobuf exporters.
The polaris setup today only has a **logs** backend (Loki via Alloy), so leave `telemetry.traces`/`telemetry.metrics` out (or set `enabled: false`) until a traces backend (Tempo) and metrics backend (Mimir/Prometheus) are provisioned — the `node-config.example.json` shows the full three-signal shape and `config.alloy` has the matching commented routing. + +## Step 4 — view in Grafana +- **Per-node:** `https://polaris.xtrmstrngth.com/d/dkg-node-logs` → pick a **Node** → set the time range (top-right) → logs appear. `Level` and `Filter (regex)` narrow further; the bottom panel is volume-by-level. +- **Fleet overview:** `https://polaris.xtrmstrngth.com/d/dkg-fleet-logs` → active-node count, log volume per node, errors per node, recent fleet-wide errors (filter by `Environment`). + +Both dashboards are already imported. Optional alerts: `example-alerts.md`. Node-operator self-serve guide (any operator, their own backend): `../OPERATOR-GUIDE.md`. + +--- + +## Notes / decisions +- **Redaction** runs on the node before anything leaves (wallet keys, mnemonics, tokens, JWTs → `[REDACTED]`); public 0x hashes/Merkle roots are kept. +- **Labels kept low-cardinality** on purpose: `service_name`, `service_instance_id` (node), `deployment_environment` (network), `dkg_node_role`. `operation_id` etc. stay inside the JSON line — filter with `| json | dkg_operation_id="..."`. +- **Don't add `operation_id`/`peer_id` as Loki labels** — high cardinality will hurt Loki 2.5.0. +- If you ever upgrade Loki to ≥3.0, you can drop Alloy's `| json | line_format` step in the dashboard and push OTLP straight to Loki's `/otlp` endpoint. +- Local validation stack (do not deploy): `docker-compose.sim.yml` (Loki 2.5.0 + Alloy) + `../send-sample-logs.mjs`. 
diff --git a/tools/log-collection-poc/production/alloy.systemd.service b/tools/log-collection-poc/production/alloy.systemd.service new file mode 100644 index 000000000..8365db0a4 --- /dev/null +++ b/tools/log-collection-poc/production/alloy.systemd.service @@ -0,0 +1,21 @@ +# systemd alternative to Docker for running Alloy on the Loki host. +# Install Alloy (https://grafana.com/docs/alloy/latest/set-up/install/), then: +# sudo cp config.alloy /etc/alloy/config.alloy +# sudo cp alloy.systemd.service /etc/systemd/system/dkg-alloy.service +# sudo systemctl daemon-reload && sudo systemctl enable --now dkg-alloy +# journalctl -u dkg-alloy -f # should show "now listening", no errors +[Unit] +Description=Grafana Alloy — DKG OTLP log ingest bridge → local Loki +After=network-online.target +Wants=network-online.target + +[Service] +Environment=LOKI_PUSH_URL=http://127.0.0.1:3100/loki/api/v1/push +ExecStart=/usr/bin/alloy run /etc/alloy/config.alloy --server.http.listen-addr=127.0.0.1:12345 +Restart=on-failure +RestartSec=5 +# Alloy's OTLP receiver binds 0.0.0.0:4318 — keep 4318 firewalled from the +# public internet; only cloudflared (localhost) should reach it. + +[Install] +WantedBy=multi-user.target diff --git a/tools/log-collection-poc/production/cloudflared-config.example.yml b/tools/log-collection-poc/production/cloudflared-config.example.yml new file mode 100644 index 000000000..ac84b2413 --- /dev/null +++ b/tools/log-collection-poc/production/cloudflared-config.example.yml @@ -0,0 +1,16 @@ +# cloudflared ingress for the node→Alloy ingest hostname. +# IMPORTANT: do NOT attach a Cloudflare Access (SSO) policy to this hostname — +# nodes cannot do interactive login. Authenticate with the bearer token instead +# (a Cloudflare WAF custom rule on the Authorization header — see RUNBOOK.md). 
+# +# Install: place at /etc/cloudflared/config.yml on the Loki host, then: +# cloudflared tunnel create dkg-logs +# cloudflared tunnel route dns dkg-logs logs-ingest.xtrmstrngth.com +# cloudflared service install # or: cloudflared tunnel run dkg-logs +tunnel: <TUNNEL-UUID> +credentials-file: /etc/cloudflared/<TUNNEL-UUID>.json + +ingress: + - hostname: logs-ingest.xtrmstrngth.com + service: http://localhost:4318 # Alloy OTLP/HTTP receiver on the same host + - service: http_status:404 diff --git a/tools/log-collection-poc/production/config.alloy b/tools/log-collection-poc/production/config.alloy new file mode 100644 index 000000000..c64bca1a1 --- /dev/null +++ b/tools/log-collection-poc/production/config.alloy @@ -0,0 +1,103 @@ +// Grafana Alloy — OTLP ingest bridge for DKG V10 node telemetry. +// +// DKG nodes --OTLP/HTTP (bearer token)--> Alloy --backend--> Grafana +// +// SIGNAL STATUS for the polaris setup today: this config is LOGS-ONLY, +// because the only telemetry backend currently provisioned is Loki 2.5.0. +// The OTLP receiver below accepts all three signals on one port, but only +// the logs pipeline is wired to an exporter. Traces and metrics are shown +// in the commented "FULL-SIGNAL" section at the bottom — uncomment them once +// a traces backend (Tempo/any OTLP) and a metrics backend (Mimir/Prometheus +// remote-write) exist. Nodes emit logs via a hand-rolled OTLP/HTTP exporter +// and traces+metrics via the OTel SDK; on the wire all three are plain OTLP, +// so Alloy treats them uniformly. +// +// Why Alloy (not native OTLP→Loki): the production Loki is 2.5.0, which has no +// OTLP endpoint and no structured metadata. Alloy receives OTLP and writes via +// the classic /loki/api/v1/push, promoting the node-identity resource +// attributes to index labels (service_name, service_instance_id, +// deployment_environment, dkg_node_role) so a Grafana dashboard can pick a node. +// +// Env: +// LOKI_PUSH_URL — e.g. http://localhost:3100/loki/api/v1/push (Loki on host).
+// +// AUTH: the OTLP port MUST NOT be exposed directly. It is reached only through the +// Cloudflare-proxied ingest hostname, which enforces the bearer token (a +// Cloudflare WAF rule on the Authorization header, or a Cloudflare service +// token). The receiver below binds 0.0.0.0:4318 — firewall it; nodes hit Cloudflare → origin. +// (Per-version Alloy receiver auth is finicky; gating at the edge is simpler +// and equally strong.) + +otelcol.receiver.otlp "default" { + http { + endpoint = "0.0.0.0:4318" + } + output { + logs = [otelcol.processor.batch.default.input] + // metrics / traces: see the FULL-SIGNAL section below to enable. + } +} + +otelcol.processor.batch "default" { + output { + logs = [otelcol.processor.transform.loki_labels.input] + } +} + +// Tell the Loki exporter which resource attributes become Loki INDEX labels. +// Keep this list SMALL and low-cardinality — never add operation_id here. +// These keys must match the OTLP resource-attribute keys the node emits +// (dotted, OTel-style); Loki sanitizes the dots to underscores, so the +// resulting labels are service_name, service_instance_id, +// deployment_environment, dkg_node_role (dashboards key off those). +otelcol.processor.transform "loki_labels" { + error_mode = "ignore" + log_statements { + context = "resource" + statements = [ + `set(attributes["loki.resource.labels"], "service.name, service.instance.id, deployment.environment, dkg.node.role")`, + ] + } + output { + logs = [otelcol.exporter.loki.default.input] + } +} + +otelcol.exporter.loki "default" { + forward_to = [loki.write.default.receiver] +} + +loki.write "default" { + endpoint { + url = sys.env("LOKI_PUSH_URL") + } +} + +// ───────────────────────────────────────────────────────────────────────── +// FULL-SIGNAL SECTION (commented — enable when the backends exist) +// +// The receiver above already accepts metrics + traces on the same port.
To +// route them, add these lines to `otelcol.receiver.otlp "default"`'s output: +// metrics = [otelcol.processor.batch.metrics.input] +// traces = [otelcol.processor.batch.traces.input] +// then uncomment the matching pipeline below. +// +// --- TRACES → Tempo (or any OTLP traces backend) --- +// otelcol.processor.batch "traces" { +// output { traces = [otelcol.exporter.otlp.tempo.input] } +// } +// otelcol.exporter.otlp "tempo" { +// client { endpoint = sys.env("TEMPO_OTLP_ENDPOINT") } // e.g. tempo:4317 +// } +// +// --- METRICS → Prometheus/Mimir remote-write --- +// otelcol.processor.batch "metrics" { +// output { metrics = [otelcol.exporter.prometheus.mimir.input] } +// } +// otelcol.exporter.prometheus "mimir" { +// forward_to = [prometheus.remote_write.mimir.receiver] +// } +// prometheus.remote_write "mimir" { +// endpoint { url = sys.env("PROM_REMOTE_WRITE_URL") } // e.g. http://mimir:9009/api/v1/push +// } +// ───────────────────────────────────────────────────────────────────────── diff --git a/tools/log-collection-poc/production/docker-compose.alloy.yml b/tools/log-collection-poc/production/docker-compose.alloy.yml new file mode 100644 index 000000000..a256ab83d --- /dev/null +++ b/tools/log-collection-poc/production/docker-compose.alloy.yml @@ -0,0 +1,21 @@ +# PRODUCTION — run ONLY Alloy on the host that runs Loki (the polaris box). +# Alloy receives OTLP from DKG nodes and writes to the existing local Loki 2.5.0. +# +# deploy: docker compose -f docker-compose.alloy.yml up -d +# +# network_mode: host lets Alloy reach Loki at 127.0.0.1:3100. The OTLP port +# (4318) MUST NOT be open to the internet directly — expose it only through the +# Cloudflare tunnel/hostname (see RUNBOOK.md), and firewall 4318 from public. 
+services: + alloy: + image: grafana/alloy:v1.5.1 + restart: unless-stopped + network_mode: host + command: + - run + - /etc/alloy/config.alloy + - --server.http.listen-addr=127.0.0.1:12345 + environment: + - LOKI_PUSH_URL=http://127.0.0.1:3100/loki/api/v1/push + volumes: + - ./config.alloy:/etc/alloy/config.alloy:ro diff --git a/tools/log-collection-poc/production/docker-compose.sim.yml b/tools/log-collection-poc/production/docker-compose.sim.yml new file mode 100644 index 000000000..8d23c5793 --- /dev/null +++ b/tools/log-collection-poc/production/docker-compose.sim.yml @@ -0,0 +1,27 @@ +# LOCAL VALIDATION ONLY — reproduces the production topology +# (DKG node → Alloy → Loki 2.5.0) on your laptop to verify config.alloy before +# deploying. NOT for production. Production runs only Alloy (see RUNBOOK.md), +# pointed at the existing host Loki. +services: + loki25: + image: grafana/loki:2.5.0 + command: -config.file=/etc/loki/local-config.yaml + ports: + - "3105:3100" # query API (avoids clashing with the 3.x PoC Loki on 3100) + + alloy: + image: grafana/alloy:v1.5.1 + command: + - run + - /etc/alloy/config.alloy + - --server.http.listen-addr=0.0.0.0:12345 + environment: + - INGEST_TOKEN=test-ingest-token + - LOKI_PUSH_URL=http://loki25:3100/loki/api/v1/push + ports: + - "4328:4318" # OTLP/HTTP in (4318 is used by the PoC otel-collector) + - "12346:12345" # Alloy UI + volumes: + - ./config.alloy:/etc/alloy/config.alloy:ro + depends_on: + - loki25 diff --git a/tools/log-collection-poc/production/example-alerts.md b/tools/log-collection-poc/production/example-alerts.md new file mode 100644 index 000000000..aa82621c5 --- /dev/null +++ b/tools/log-collection-poc/production/example-alerts.md @@ -0,0 +1,33 @@ +# Example Grafana alert rules (optional, once logs are flowing) + +Set these up in **Alerting → Alert rules** in Grafana (data source = your Loki). 
+They're documented here rather than auto-created so you control thresholds and +where notifications go (contact point / notification policy). + +## 1. Node went quiet (likely down or not shipping) +A node that normally logs has produced **0 log lines in 10 minutes**. + +- Query (Loki, instant): `sum by (service_instance_id) (count_over_time({service_name="dkg-node"} [10m]))` +- Condition: `IS BELOW 1` +- Evaluate every `1m`, for `10m`. +- Annotation: `Node {{ $labels.service_instance_id }} has logged nothing for 10m.` + +> Note: this fires only for nodes that have logged before (it won't invent +> series for nodes that never shipped). For hard "node down" detection also +> watch the node's metrics/health endpoint. Caveat: Loki returns no series (rather than a 0) for a stream with no lines in the window, so confirm the rule really fires for a silent node (e.g. stop a test node) and review the rule's no-data handling before relying on it. + +## 2. Error spike on a node +A node logged **more than 20 ERROR lines in 5 minutes**. + +- Query (Loki, instant): `sum by (service_instance_id) (count_over_time({service_name="dkg-node", level="ERROR"} [5m]))` +- Condition: `IS ABOVE 20` (tune to your baseline) +- Evaluate every `1m`, for `5m`. +- Annotation: `Node {{ $labels.service_instance_id }} error rate high ({{ $values.A }} in 5m).` + +## 3. (Optional) A secret pattern slipped through redaction +Defense-in-depth — alert if anything that looks like an un-redacted key reaches Loki. + +- Query (Loki, instant): `` sum(count_over_time({service_name="dkg-node"} |~ `(?i)(privatekey|mnemonic)\s*[:=]\s*[^\[\s]` [10m])) `` + (matches a sensitive key followed by a value that is NOT `[REDACTED]`) +- Condition: `IS ABOVE 0` +- This should always be 0; if it fires, investigate the redactor / add the field to `logs.redact`. 
diff --git a/tools/log-collection-poc/production/grafana-dashboard-dkg-fleet-logs.json b/tools/log-collection-poc/production/grafana-dashboard-dkg-fleet-logs.json new file mode 100644 index 000000000..7055c3e0f --- /dev/null +++ b/tools/log-collection-poc/production/grafana-dashboard-dkg-fleet-logs.json @@ -0,0 +1,52 @@ +{ + "title": "DKG Fleet — Logs Overview", + "uid": "dkg-fleet-logs", + "tags": ["dkg", "logs"], + "schemaVersion": 39, + "version": 1, + "editable": true, + "time": { "from": "now-6h", "to": "now" }, + "templating": { + "list": [ + { "name": "loki", "label": "Loki", "type": "datasource", "query": "loki", "refresh": 1, "current": {} }, + { + "name": "env", "label": "Environment", "type": "query", + "datasource": { "type": "loki", "uid": "${loki}" }, + "definition": "label_values(deployment_environment)", "query": "label_values(deployment_environment)", + "refresh": 2, "sort": 1, "includeAll": true, "allValue": ".+", "multi": false, + "current": { "text": "All", "value": "$__all" } + } + ] + }, + "panels": [ + { + "type": "stat", "title": "Active nodes ($env)", "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 5, "w": 6, "x": 0, "y": 0 }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"] }, "colorMode": "value" }, + "targets": [{ "refId": "A", "datasource": { "type": "loki", "uid": "${loki}" }, "queryType": "instant", + "expr": "count(count by (service_instance_id) (count_over_time({service_name=\"dkg-node\", deployment_environment=~\"$env\"} [$__range])))" }] + }, + { + "type": "timeseries", "title": "Log volume per node", "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 9, "w": 18, "x": 6, "y": 0 }, + "targets": [{ "refId": "A", "datasource": { "type": "loki", "uid": "${loki}" }, "queryType": "range", + "expr": "sum by (service_instance_id) (count_over_time({service_name=\"dkg-node\", deployment_environment=~\"$env\"} [$__auto]))", + "legendFormat": "{{service_instance_id}}" }] + }, + { + 
"type": "timeseries", "title": "Errors per node", "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 9 }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "bars", "fillOpacity": 70, "stacking": { "mode": "normal" } } }, "overrides": [] }, + "targets": [{ "refId": "A", "datasource": { "type": "loki", "uid": "${loki}" }, "queryType": "range", + "expr": "sum by (service_instance_id) (count_over_time({service_name=\"dkg-node\", level=\"ERROR\", deployment_environment=~\"$env\"} [$__auto]))", + "legendFormat": "{{service_instance_id}}" }] + }, + { + "type": "logs", "title": "Recent errors (all nodes)", "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 17 }, + "options": { "showTime": true, "showLabels": true, "wrapLogMessage": true, "sortOrder": "Descending", "enableLogDetails": true }, + "targets": [{ "refId": "A", "datasource": { "type": "loki", "uid": "${loki}" }, "queryType": "range", + "expr": "{service_name=\"dkg-node\", level=\"ERROR\", deployment_environment=~\"$env\"} | json | line_format `{{.body}}`" }] + } + ] +} diff --git a/tools/log-collection-poc/production/grafana-dashboard-dkg-node-logs.json b/tools/log-collection-poc/production/grafana-dashboard-dkg-node-logs.json new file mode 100644 index 000000000..e2a407eb7 --- /dev/null +++ b/tools/log-collection-poc/production/grafana-dashboard-dkg-node-logs.json @@ -0,0 +1,100 @@ +{ + "title": "DKG Node Logs", + "uid": "dkg-node-logs", + "tags": ["dkg", "logs"], + "schemaVersion": 39, + "version": 1, + "editable": true, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": { + "refresh_intervals": ["10s", "30s", "1m", "5m", "15m"], + "time_options": ["15m", "1h", "3h", "6h", "12h", "24h", "2d", "7d", "30d"] + }, + "templating": { + "list": [ + { + "name": "loki", + "label": "Loki", + "type": "datasource", + "query": "loki", + "refresh": 1, + "current": {} + }, + { + "name": "node", + "label": "Node", + 
"type": "query", + "datasource": { "type": "loki", "uid": "${loki}" }, + "definition": "label_values(service_instance_id)", + "query": "label_values(service_instance_id)", + "refresh": 2, + "sort": 1, + "includeAll": false, + "multi": false, + "current": {} + }, + { + "name": "level", + "label": "Level", + "type": "query", + "datasource": { "type": "loki", "uid": "${loki}" }, + "definition": "label_values(level)", + "query": "label_values(level)", + "refresh": 2, + "sort": 1, + "includeAll": true, + "allValue": ".+", + "multi": true, + "current": { "text": "All", "value": "$__all" } + }, + { + "name": "search", + "label": "Filter (regex)", + "type": "textbox", + "query": "", + "current": { "text": "", "value": "" } + } + ] + }, + "panels": [ + { + "type": "logs", + "title": "Logs — $node", + "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 22, "w": 24, "x": 0, "y": 1 }, + "options": { + "showTime": true, + "showLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "enableLogDetails": true, + "dedupStrategy": "none", + "sortOrder": "Descending" + }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "${loki}" }, + "queryType": "range", + "expr": "{service_instance_id=\"$node\", level=~\"$level\"} |~ `(?i)$search` | json | line_format `{{.body}}`" + } + ] + }, + { + "type": "timeseries", + "title": "Log volume by level — $node", + "datasource": { "type": "loki", "uid": "${loki}" }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 23 }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "bars", "fillOpacity": 70, "stacking": { "mode": "normal" } } }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "loki", "uid": "${loki}" }, + "queryType": "range", + "expr": "sum by (level) (count_over_time({service_instance_id=\"$node\", level=~\"$level\"} |~ `(?i)$search` [$__auto]))", + "legendFormat": "{{level}}" + } + ] + } + ] +} diff --git 
a/tools/log-collection-poc/production/node-config.example.json b/tools/log-collection-poc/production/node-config.example.json
new file mode 100644
index 000000000..cd2dd5069
--- /dev/null
+++ b/tools/log-collection-poc/production/node-config.example.json
@@ -0,0 +1,27 @@
+{
+  "_comment": "Add these keys to each OriginTrail-hosted node's config.json. `name` MUST be unique per node — it becomes the Grafana Node selector value. `network` is auto-derived by the node (testnet/mainnet) → the Environment label. Requires a node build that includes the OTLP telemetry feature (PR #1317).",
+
+  "_comment_signals": "Three independent signals, all OFF until telemetry.enabled=true AND the signal has an endpoint. LOGS use a hand-rolled OTLP/HTTP JSON exporter (the OTel Logs SDK is still 'Development'); set logs.exporter to 'otlp' — if you leave it unset on a hosted node it defaults to legacy syslog/Graylog, NOT OTLP. TRACES and METRICS use the stable OTel SDK exporters (OTLP/protobuf). For the polaris setup today only logs have a backend (Loki via Alloy); keep traces/metrics here so the config is ready when Tempo/Mimir are provisioned, or set traces.enabled/metrics.enabled to false to leave them off explicitly.",
+
+  "name": "testnet-core-01",
+
+  "telemetry": {
+    "enabled": true,
+    "logs": {
+      "exporter": "otlp",
+      "endpoint": "https://logs-ingest.xtrmstrngth.com/v1/logs",
+      "token": "",
+      "level": "info"
+    },
+    "traces": {
+      "endpoint": "https://logs-ingest.xtrmstrngth.com/v1/traces",
+      "token": "",
+      "sampleRatio": 1
+    },
+    "metrics": {
+      "endpoint": "https://logs-ingest.xtrmstrngth.com/v1/metrics",
+      "token": "",
+      "exportIntervalMs": 30000
+    }
+  }
+}
diff --git a/tools/log-collection-poc/production/smoke-test.sh b/tools/log-collection-poc/production/smoke-test.sh
new file mode 100755
index 000000000..ff3e1b514
--- /dev/null
+++ b/tools/log-collection-poc/production/smoke-test.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# Post-deploy smoke test for the DKG log-ingest path.
+# Pushes one OTLP log through the public ingest endpoint and confirms it lands
+# in Loki, then checks that a tokenless push is rejected. Run on the Loki host.
+# Exits non-zero when any check fails, so the script can gate a deploy.
+#
+#   INGEST_URL=https://logs-ingest.xtrmstrngth.com/v1/logs \
+#   INGEST_TOKEN= \
+#   LOKI=http://127.0.0.1:3100 \
+#   ./smoke-test.sh
+set -euo pipefail
+: "${INGEST_URL:?set INGEST_URL, e.g. https://logs-ingest.xtrmstrngth.com/v1/logs}"
+: "${INGEST_TOKEN:?set INGEST_TOKEN}"
+LOKI="${LOKI:-http://127.0.0.1:3100}"
+# The PID makes the instance id unique per run, so the Loki query below can
+# only match the record pushed by this invocation.
+NODE="smoke-test-$$"
+# OTLP timeUnixNano: current time in nanoseconds.
+NS=$(( $(date +%s) * 1000000000 ))
+failures=0
+
+payload='{"resourceLogs":[{"resource":{"attributes":[
+  {"key":"service.name","value":{"stringValue":"dkg-node"}},
+  {"key":"service.instance.id","value":{"stringValue":"'"$NODE"'"}},
+  {"key":"deployment.environment","value":{"stringValue":"testnet"}}]},
+  "scopeLogs":[{"scope":{"name":"dkg-node"},"logRecords":[
+  {"timeUnixNano":"'"$NS"'","severityNumber":9,"severityText":"INFO","body":{"stringValue":"smoke-test ok"}}]}]}]}'
+
+echo "→ pushing test log as $NODE ..."
+code=$(curl -s -o /dev/null -w '%{http_code}' -X POST "$INGEST_URL" \
+  -H "authorization: Bearer $INGEST_TOKEN" -H 'content-type: application/json' -d "$payload")
+echo " ingest HTTP $code (expect 2xx)"
+if [[ "$code" != 2?? ]]; then
+  echo " ❌ FAIL — authenticated push was not accepted."
+  failures=$((failures + 1))
+fi
+
+sleep 5
+echo "→ querying Loki for it ..."
+# BSD date syntax first (macOS), GNU date as the fallback.
+START=$(date -u -v-5M +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -d '5 min ago' +%Y-%m-%dT%H:%M:%SZ)
+END=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+# Capture the response and match it in-shell: with `pipefail`, piping curl
+# straight into `grep -q` can report a failure when grep exits early on a match.
+response=$(curl -s -G "$LOKI/loki/api/v1/query_range" \
+  --data-urlencode "query={service_instance_id=\"$NODE\"}" \
+  --data-urlencode "start=$START" --data-urlencode "end=$END" || true)
+if [[ "$response" == *"smoke-test ok"* ]]; then
+  echo " ✅ PASS — log reached Loki (pick '$NODE' in the dashboard to see it)"
+else
+  echo " ❌ FAIL — not found in Loki. Check Alloy logs + Loki push URL."
+  failures=$((failures + 1))
+fi
+
+echo "→ negative check: tokenless push should be rejected ..."
+noauth=$(curl -s -o /dev/null -w '%{http_code}' -X POST "$INGEST_URL" \
+  -H 'content-type: application/json' -d "$payload")
+echo " no-token HTTP $noauth (expect 401/403 from the Cloudflare WAF rule)"
+if [[ "$noauth" == 2?? ]]; then
+  echo " ❌ FAIL — tokenless push was accepted; the ingest endpoint is unauthenticated."
+  failures=$((failures + 1))
+fi
+
+if (( failures > 0 )); then
+  echo "smoke test FAILED: $failures check(s) did not pass"
+  exit 1
+fi
+echo "smoke test passed"
diff --git a/tools/log-collection-poc/send-sample-logs.mjs b/tools/log-collection-poc/send-sample-logs.mjs
new file mode 100644
index 000000000..562db57fc
--- /dev/null
+++ b/tools/log-collection-poc/send-sample-logs.mjs
@@ -0,0 +1,60 @@
+/**
+ * Drive the REAL DKG log-collection pipeline end-to-end against the local
+ * stack, exactly as the daemon does:
+ *
+ *   createLogRedactor() → OtlpLogWorker → OTel Collector → Loki
+ *
+ * One of the sample records contains a fake private key + mnemonic to prove
+ * redaction happens at-source (the collector/Loki never see the secret).
+ *
+ * Usage (after `docker compose up -d` and a workspace build):
+ *   node send-sample-logs.mjs [endpoint] [nodeName] [network]
+ *   # defaults: http://localhost:4318/v1/logs poc-node devnet
+ *   # run twice with different node names to populate the Grafana node selector.
+ */
+import { createLogRedactor } from '../../packages/core/dist/log-redaction.js';
+import { OtlpLogWorker } from '../../packages/node-ui/dist/otlp-log-worker.js';
+
+const endpoint = process.argv[2] || 'http://localhost:4318/v1/logs';
+const nodeName = process.argv[3] || 'poc-node';
+const network = process.argv[4] || 'devnet';
+const redact = createLogRedactor();
+
+const worker = new OtlpLogWorker({
+  endpoint,
+  token: process.env.OTLP_TOKEN, // bearer token (e.g. when pushing through Alloy)
+  network,
+  peerId: `12D3KooW-${nodeName}`,
+  nodeName, // becomes service.instance.id → the Grafana node-selector label
+  version: '10.0.0',
+  commit: 'poc0001',
+  role: 'core',
+  minLevel: 'info',
+  flushIntervalMs: 500,
+  onError: (m) => console.error('[otlp]', m),
+});
+worker.start();
+
+const now = () => new Date().toISOString();
+const samples = [
+  { level: 'info', operationName: 'connect', operationId: 'op-conn-1', module: 'p2p', message: `node up — 8 peers (2 direct / 6 relayed) @ ${now()}` },
+  { level: 'info', operationName: 'publish', operationId: 'op-pub-1', module: 'publisher', message: 'published KC 42 root 0x1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef (NOT a secret — should survive)' },
+  { level: 'warn', operationName: 'sync', operationId: 'op-sync-1', sourceOperationId: 'op-pub-1', module: 'agent', message: 'peer 12D3KooWxyz slow to ACK, retrying' },
+  { level: 'error', operationName: 'query', operationId: 'op-qry-1', module: 'query', message: 'SPARQL timeout after 30s' },
+  // SECRET — must be redacted before it leaves the node:
+  { level: 'info', operationName: 'init', operationId: 'op-init-1', module: 'wallet', message: 'loaded operationalWalletPrivateKey=0xdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef mnemonic="legal winner thank year wave sausage worth useful legal winner thank yellow"' },
+];
+
+for (const s of samples) worker.push(redact(s));
+
+console.log(`Pushed ${samples.length} records (1 containing a secret, pre-redacted) → ${endpoint}`);
+console.log('Flushing…');
+
+// Allow a couple of flush cycles, then stop (final flush) and exit.
+setTimeout(() => {
+  worker.stop();
+  console.log('Done. In Grafana (http://localhost:3000) → Explore → Loki, query:');
+  console.log('  {service_name="dkg-node"}');
+  console.log('Confirm the wallet line shows [REDACTED] (no 0xdeadbeef…, no mnemonic words).');
+  setTimeout(() => process.exit(0), 300);
+}, 1500);