-
Notifications
You must be signed in to change notification settings - Fork 23
Add OpenTelemetry tracing across the backbeat pipeline #2733
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: development/9.4
Are you sure you want to change the base?
Changes from all commits
a257804
84e60f3
f83f623
bc9f53f
5a85cb0
6357120
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -29,6 +29,11 @@ const { | |||||||||
| startCircuitBreakerMetricsExport, | ||||||||||
| updateCircuitBreakerConfigForImplicitOutputQueue, | ||||||||||
| } = require('../../../lib/CircuitBreaker'); | ||||||||||
| const { context: otelContext, trace, SpanKind, SpanStatusCode, ROOT_CONTEXT } = | ||||||||||
| require('@opentelemetry/api'); | ||||||||||
| const { traceHeadersFromCurrentContext } = | ||||||||||
| require('../../../lib/tracing/kafkaTraceContext'); | ||||||||||
| const { isEnabled } = require('../../../lib/tracing'); | ||||||||||
|
|
||||||||||
| const DEFAULT_CRON_RULE = '* * * * *'; | ||||||||||
| const DEFAULT_CONCURRENCY = 10; | ||||||||||
|
|
@@ -340,7 +345,8 @@ class LifecycleConductor { | |||||||||
| } | ||||||||||
|
|
||||||||||
| _taskToMessage(task, taskVersion, log) { | ||||||||||
| return { | ||||||||||
| const headers = traceHeadersFromCurrentContext(); | ||||||||||
| const kafkaEntry = { | ||||||||||
| message: JSON.stringify({ | ||||||||||
| action: 'processObjects', | ||||||||||
| contextInfo: { | ||||||||||
|
|
@@ -355,6 +361,8 @@ class LifecycleConductor { | |||||||||
| details: {}, | ||||||||||
| }), | ||||||||||
| }; | ||||||||||
| if (headers) {kafkaEntry.headers = headers;} | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
🙏 |
||||||||||
| return kafkaEntry; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| _getAccountIds(unknownCanonicalIds, log, cb) { | ||||||||||
|
|
@@ -402,6 +410,40 @@ class LifecycleConductor { | |||||||||
| } | ||||||||||
|
|
||||||||||
| processBuckets(cb) { | ||||||||||
| if (!isEnabled()) { | ||||||||||
| this._processBucketsInternal((err, res) => { | ||||||||||
| if (cb) {cb(err, res);} | ||||||||||
| }); | ||||||||||
| return; | ||||||||||
| } | ||||||||||
| // Root INTERNAL trace per cron firing (no upstream parent); the | ||||||||||
| // in-process scan work (Mongo bucket listing) nests under it. | ||||||||||
| const tracer = trace.getTracer('backbeat'); | ||||||||||
| const span = tracer.startSpan('lifecycle.conductor.scan', { | ||||||||||
| kind: SpanKind.INTERNAL, | ||||||||||
| }, ROOT_CONTEXT); | ||||||||||
| const ctx = trace.setSpan(ROOT_CONTEXT, span); | ||||||||||
| otelContext.with(ctx, () => { | ||||||||||
| try { | ||||||||||
| this._processBucketsInternal((err, res) => { | ||||||||||
| if (err) { | ||||||||||
| span.recordException(err); | ||||||||||
| span.setStatus({ code: SpanStatusCode.ERROR }); | ||||||||||
| } | ||||||||||
| span.end(); | ||||||||||
| if (cb) {cb(err, res);} | ||||||||||
| }); | ||||||||||
| } catch (err) { | ||||||||||
| // sync throw: end span (don't leak), then rethrow | ||||||||||
| span.recordException(err); | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have the same logic somewhere else, we should factorise |
||||||||||
| span.setStatus({ code: SpanStatusCode.ERROR }); | ||||||||||
| span.end(); | ||||||||||
| throw err; | ||||||||||
| } | ||||||||||
| }); | ||||||||||
|
delthas marked this conversation as resolved.
|
||||||||||
| } | ||||||||||
|
delthas marked this conversation as resolved.
|
||||||||||
|
|
||||||||||
| _processBucketsInternal(cb) { | ||||||||||
| const log = this.logger.newRequestLogger(); | ||||||||||
| const start = new Date(); | ||||||||||
| let nBucketsQueued = 0; | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -100,7 +100,19 @@ class KafkaNotificationDestination extends NotificationDestination { | |
| */ | ||
| send(messages, done) { | ||
| const starTime = Date.now(); | ||
| this._notificationProducer.send(messages, error => { | ||
| // Trust boundary: drop trace headers before producing to the external | ||
| // customer Kafka (no protocol-level ingress strip like nginx for HTTP). | ||
| const safeMessages = Array.isArray(messages) | ||
|
delthas marked this conversation as resolved.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why |
||
| ? messages.map(m => { | ||
| if (m && m.headers) { | ||
| // eslint-disable-next-line no-unused-vars | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can just |
||
| const { headers, ...rest } = m; | ||
|
delthas marked this conversation as resolved.
|
||
| return rest; | ||
| } | ||
| return m; | ||
| }) | ||
| : messages; | ||
| this._notificationProducer.send(safeMessages, error => { | ||
| if (error) { | ||
| const { host, topic } = this._destinationConfig; | ||
| this._log.error('error in message delivery to external Kafka destination', { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
|
|
||
| const ActionQueueEntry = require('../../lib/models/ActionQueueEntry'); | ||
| const ReplicationMetrics = require('./ReplicationMetrics'); | ||
| const { traceHeadersFromCurrentContext } = require('../../lib/tracing/kafkaTraceContext'); | ||
|
|
||
| let { dataMoverTopic } = config.extensions.replication; | ||
| const { coldStorageArchiveTopicPrefix } = config.extensions.lifecycle; | ||
|
|
@@ -78,6 +79,8 @@ | |
| key: `${bucket}/${key}`, | ||
| message: action.toKafkaMessage(), | ||
| }; | ||
| const traceHeaders = traceHeadersFromCurrentContext(); | ||
| if (traceHeaders) kafkaEntry.headers = traceHeaders; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Quick up. I think we don't allow that in our codebase ? |
||
| let topic = dataMoverTopic; | ||
| const toLocation = action.getAttribute('toLocation'); | ||
| const locationConfig = locations[toLocation]; | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -23,6 +23,10 @@ const { | |||||
| } | ||||||
| } = require('./constants'); | ||||||
|
|
||||||
|
delthas marked this conversation as resolved.
|
||||||
| const { context: otelContext, SpanStatusCode } = require('@opentelemetry/api'); | ||||||
| const { isEnabled } = require('./tracing'); | ||||||
| const { startLinkedSpanFromKafkaEntry } = require('./tracing/kafkaTraceContext'); | ||||||
|
|
||||||
| const CLIENT_ID = 'BackbeatConsumer'; | ||||||
| const { withTopicPrefix } = require('./util/topic'); | ||||||
|
|
||||||
|
|
@@ -530,7 +534,38 @@ class BackbeatConsumer extends EventEmitter { | |||||
| const { topic, partition } = entry; | ||||||
| KafkaBacklogMetrics.onTaskStarted(topic, partition, this._groupId); | ||||||
|
|
||||||
| this._queueProcessor(entry, (err, completionArgs) => done(err, completionArgs, finishProcessingTask)); | ||||||
| // When OTEL is off, skip the span machinery entirely (no OTEL API on | ||||||
|
delthas marked this conversation as resolved.
|
||||||
| // the per-message hot path) and keep the original processing shape. | ||||||
|
Comment on lines
+537
to
+538
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
The condition is pretty explicit ? |
||||||
| if (!isEnabled()) { | ||||||
| this._queueProcessor(entry, (err, completionArgs) => | ||||||
| done(err, completionArgs, finishProcessingTask)); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
|
delthas marked this conversation as resolved.
|
||||||
| const { ctx, span } = startLinkedSpanFromKafkaEntry(entry, `${topic}.process`); | ||||||
| span.setAttribute('messaging.system', 'kafka'); | ||||||
| span.setAttribute('messaging.destination.name', topic); | ||||||
| span.setAttribute('messaging.destination.partition.id', `${partition}`); | ||||||
| span.setAttribute('messaging.consumer.group.name', this._groupId); | ||||||
|
|
||||||
|
delthas marked this conversation as resolved.
|
||||||
| otelContext.with(ctx, () => { | ||||||
| try { | ||||||
| this._queueProcessor(entry, (err, completionArgs) => { | ||||||
| if (err) { | ||||||
| span.recordException(err); | ||||||
| span.setStatus({ code: SpanStatusCode.ERROR }); | ||||||
| } | ||||||
| span.end(); | ||||||
| done(err, completionArgs, finishProcessingTask); | ||||||
| }); | ||||||
| } catch (err) { | ||||||
| // sync throw before the callback fired: end span (don't leak), then rethrow | ||||||
| span.recordException(err); | ||||||
| span.setStatus({ code: SpanStatusCode.ERROR }); | ||||||
| span.end(); | ||||||
|
delthas marked this conversation as resolved.
delthas marked this conversation as resolved.
|
||||||
| throw err; | ||||||
| } | ||||||
| }); | ||||||
|
delthas marked this conversation as resolved.
delthas marked this conversation as resolved.
|
||||||
| } | ||||||
|
|
||||||
| /** | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we need to require it twice ?