From 785797806ae9370e79b04973eb33f98bc39453a5 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Wed, 29 Apr 2026 18:32:44 +0200 Subject: [PATCH 01/10] Add lifecycle conductor scan metrics. Issue: BB-740 --- extensions/lifecycle/LifecycleMetrics.js | 166 +++++++++++++++++- .../LifecycleBucketProcessor.js | 14 +- .../lifecycle/conductor/LifecycleConductor.js | 100 ++++++++--- 3 files changed, 251 insertions(+), 29 deletions(-) diff --git a/extensions/lifecycle/LifecycleMetrics.js b/extensions/lifecycle/LifecycleMetrics.js index eac0d1346..ac793b816 100644 --- a/extensions/lifecycle/LifecycleMetrics.js +++ b/extensions/lifecycle/LifecycleMetrics.js @@ -5,12 +5,53 @@ const LIFECYCLE_LABEL_OP = 'op'; const LIFECYCLE_LABEL_STATUS = 'status'; const LIFECYCLE_LABEL_LOCATION = 'location'; const LIFECYCLE_LABEL_TYPE = 'type'; +const LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID = 'conductor_scan_id'; const LIFECYCLE_MARKER_METRICS_LOCATION = '-delete-marker-'; +// Keep per-scan series long enough for scraping and debugging recent overlap, +// but remove them from prom-client eventually to avoid unbounded process memory. +const BUCKET_PROCESSOR_SCAN_METRIC_RETENTION_MS = 24 * 60 * 60 * 1000; +const BUCKET_PROCESSOR_ORIGIN = 'bucket_processor'; + +// Conductor scheduling heartbeat: timestamp (ms since epoch) of the +// instant the conductor most recently *started* a scan. Use this to +// detect "the conductor is no longer scheduling scans" via the +// LifecycleLateScan alert; do NOT subtract it from latest_batch_end_time +// to derive the scan duration: while a scan is in progress, end_time is +// from the previous run and start_time has just been refreshed, so the +// difference is negative. Use s3_lifecycle_conductor_last_batch_duration_seconds +// instead. const conductorLatestBatchStartTime = ZenkoMetrics.createGauge({ name: 's3_lifecycle_latest_batch_start_time', - help: 'Timestamp of latest lifecycle batch start time', + help: 'Conductor scheduling heartbeat: ms-since-epoch timestamp of ' + + 'the most recent scan start. Use to detect that the conductor is ' + + 'still scheduling scans (LifecycleLateScan alert). Do NOT use to ' + + 'derive scan duration; use ' + + 's3_lifecycle_conductor_last_batch_duration_seconds for that.', + labelNames: [LIFECYCLE_LABEL_ORIGIN], +}); + +// Conductor scan-completion timestamp (ms since epoch) of the last +// successfully completed scan. Useful as a "scan completed at all" +// signal; combine with conductor_last_batch_duration_seconds to know +// "the most recent scan finished N seconds ago and took M seconds". +const conductorLatestBatchEndTime = ZenkoMetrics.createGauge({ + name: 's3_lifecycle_latest_batch_end_time', + help: 'Timestamp (ms since epoch) of the most recent successful ' + + 'lifecycle conductor scan completion.', + labelNames: [LIFECYCLE_LABEL_ORIGIN], +}); + +// Duration of the latest conductor scan, computed by the conductor itself +// at scan completion. Exposed as a gauge so dashboards can render the most +// recent batch duration directly, without computing end - start in PromQL +// (which would yield negative values mid-scan, when end is from the +// previous batch and start has just been refreshed). +const conductorLastBatchDurationSeconds = ZenkoMetrics.createGauge({ + name: 's3_lifecycle_conductor_last_batch_duration_seconds', + help: 'Duration in seconds of the latest lifecycle conductor batch, ' + + 'as measured by the conductor at scan completion.', labelNames: [LIFECYCLE_LABEL_ORIGIN], }); @@ -50,6 +91,48 @@ const lifecycleLegacyTask = ZenkoMetrics.createCounter({ labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_STATUS], }); +const conductorLatestBatchBucketCount = ZenkoMetrics.createGauge({ + name: 's3_lifecycle_latest_batch_bucket_count', + help: 'Number of buckets listed in the latest lifecycle conductor batch', + labelNames: [LIFECYCLE_LABEL_ORIGIN], +}); + +const bucketProcessorScanMessagesProcessed = ZenkoMetrics.createCounter({ + name: 's3_lifecycle_bucket_processor_scan_messages_processed_total', + help: 'Total number of bucket-tasks topic messages successfully processed ' + + 'by this bucket processor, grouped by conductor scan id. Each message ' + + 'corresponds to a single listing slice (initial or continuation), not ' + + 'a unique bucket: a bucket with multiple listings (truncated v1, or ' + + 'current/noncurrent/orphan splits in v2) increments this counter once ' + + 'per slice. Multiple conductor_scan_id label values appearing at the ' + + 'same time indicate overlapping scans. Scan-id series are retained ' + + 'locally for 24 hours, so cardinality scales with scan frequency and ' + + 'bucket processor pod count.', + labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID], +}); +const bucketProcessorScanMetricLastUpdated = new Map(); + +function removeStaleBucketProcessorScanMetrics(log, now) { + bucketProcessorScanMetricLastUpdated.forEach((lastUpdated, conductorScanId) => { + if (now - lastUpdated > BUCKET_PROCESSOR_SCAN_METRIC_RETENTION_MS) { + try { + bucketProcessorScanMessagesProcessed.remove({ + [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN, + [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId, + }); + bucketProcessorScanMetricLastUpdated.delete(conductorScanId); + } catch (err) { + if (log) { + log.error('failed to remove stale bucket processor scan metric', { + error: err.toString(), + conductorScanId, + }); + } + } + } + }); +} + const lifecycleS3Operations = ZenkoMetrics.createCounter({ name: 's3_lifecycle_s3_operations_total', help: 'Total number of S3 operations by the lifecycle processes', @@ -113,6 +196,16 @@ class LifecycleMetrics { } } + /** + * Update the conductor scheduling heartbeat. Called at the start of + * every conductor scan; consumed by the LifecycleLateScan alert to + * detect that the conductor has stopped scheduling. Does NOT mark a + * scan as in progress and is NOT meant to be subtracted from + * latest_batch_end_time to derive a duration: use + * onConductorScanComplete's durationSeconds for that. + * + * @param {Object} log - logger + */ static onProcessBuckets(log) { try { conductorLatestBatchStartTime.set({ origin: 'conductor' }, Date.now()); @@ -172,6 +265,77 @@ class LifecycleMetrics { } } + /** + * Record metrics at the end of a full conductor scan. + * @param {Object} log - logger + * @param {number} bucketCount - total buckets listed + * @param {number} [durationSeconds] - duration of the scan in seconds, + * as measured by the conductor. When provided and finite, sets the + * s3_lifecycle_conductor_last_batch_duration_seconds gauge. Optional + * for forward-compatibility with callers that do not measure it. + */ + static onConductorScanComplete(log, bucketCount, durationSeconds) { + try { + conductorLatestBatchEndTime.set( + { [LIFECYCLE_LABEL_ORIGIN]: 'conductor' }, + Date.now()); + conductorLatestBatchBucketCount.set({ + [LIFECYCLE_LABEL_ORIGIN]: 'conductor', + }, bucketCount); + if (typeof durationSeconds === 'number' && + Number.isFinite(durationSeconds) && + durationSeconds >= 0) { + conductorLastBatchDurationSeconds.set({ + [LIFECYCLE_LABEL_ORIGIN]: 'conductor', + }, durationSeconds); + } + } catch (err) { + LifecycleMetrics.handleError( + log, err, 'LifecycleMetrics.onConductorScanComplete', { + bucketCount, + durationSeconds, + } + ); + } + } + + /** + * Increment the count of bucket-tasks topic messages successfully + * processed by this bucket processor for a specific conductor scan. + * Called after the bucket task completes successfully (not when it is + * dispatched to the scheduler), once per Kafka message regardless of how + * many objects it covers. + * + * Note: this counts messages (initial + continuation/listing slices), + * not unique buckets. Keep one time series per conductor_scan_id so that + * overlapping scans remain visible. Old scan series are removed + * opportunistically after BUCKET_PROCESSOR_SCAN_METRIC_RETENTION_MS to + * avoid unbounded prom-client memory growth. + * + * @param {Object} log - logger + * @param {string} conductorScanId - conductor scan id from contextInfo + */ + static onBucketProcessorScanMessageProcessed(log, conductorScanId) { + if (!conductorScanId) { + return; + } + try { + const now = Date.now(); + bucketProcessorScanMetricLastUpdated.set(conductorScanId, now); + bucketProcessorScanMessagesProcessed.inc({ + [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN, + [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId, + }); + removeStaleBucketProcessorScanMetrics(log, now); + } catch (err) { + LifecycleMetrics.handleError( + log, err, + 'LifecycleMetrics.onBucketProcessorScanMessageProcessed', + { conductorScanId } + ); + } + } + static onLifecycleTriggered(log, process, type, location, latencyMs) { try { lifecycleTriggerLatency.observe({ diff --git a/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js b/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js index 6135b4186..c1e2e7191 100644 --- a/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js +++ b/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js @@ -26,6 +26,7 @@ const { extractBucketProcessorCircuitBreakerConfigs, } = require('../CircuitBreakerGroup'); const { lifecycleTaskVersions } = require('../../../lib/constants'); +const { LifecycleMetrics } = require('../LifecycleMetrics'); const locations = require('../../../conf/locationConfig.json'); const PROCESS_OBJECTS_ACTION = 'processObjects'; @@ -277,12 +278,15 @@ class LifecycleBucketProcessor { return process.nextTick(() => cb(errors.InternalError)); } const { bucket, owner, accountId, taskVersion } = result.target; + const conductorScanId = result.contextInfo && result.contextInfo.conductorScanId; + if (!bucket || !owner || (!accountId && this._authConfig.type === authTypeAssumeRole)) { this._log.error('kafka bucket entry missing required fields', { method: 'LifecycleBucketProcessor._processBucketEntry', bucket, owner, accountId, + conductorScanId, }); return process.nextTick(() => cb(errors.InternalError)); } @@ -291,6 +295,7 @@ class LifecycleBucketProcessor { bucket, owner, accountId, + conductorScanId, }); const s3Client = this.clientManager.getS3Client(accountId); @@ -345,6 +350,7 @@ class LifecycleBucketProcessor { owner, details: result.details, taskName: task.constructor.name, + conductorScanId, }); return this._internalTaskScheduler.push({ task, @@ -352,7 +358,13 @@ class LifecycleBucketProcessor { value: result, s3target: s3Client, backbeatMetadataProxy, - }, cb); + }, err => { + if (!err) { + LifecycleMetrics.onBucketProcessorScanMessageProcessed( + this._log, conductorScanId); + } + return cb(err); + }); }); } diff --git a/extensions/lifecycle/conductor/LifecycleConductor.js b/extensions/lifecycle/conductor/LifecycleConductor.js index fbbee05ba..2922d835c 100644 --- a/extensions/lifecycle/conductor/LifecycleConductor.js +++ b/extensions/lifecycle/conductor/LifecycleConductor.js @@ -3,6 +3,7 @@ const async = require('async'); const schedule = require('node-schedule'); const zookeeper = require('node-zookeeper-client'); +const { v4: uuid } = require('uuid'); const { constants, errors } = require('arsenal'); const Logger = require('werelogs').Logger; @@ -105,6 +106,7 @@ class LifecycleConductor { this._vaultClientCache = null; this._initialized = false; this._batchInProgress = false; + this._currentScanId = null; // this cache only needs to be the size of one listing. // worst case scenario is 1 account per bucket: @@ -314,6 +316,7 @@ class LifecycleConductor { action: 'processObjects', contextInfo: { reqId: log.getSerializedUids(), + conductorScanId: this._currentScanId, }, target: { bucket: task.bucketName, @@ -356,7 +359,8 @@ class LifecycleConductor { _createBucketTaskMessages(tasks, log, cb) { if (this.lcConfig.forceLegacyListing) { - return process.nextTick(cb, null, tasks.map(t => this._taskToMessage(t, lifecycleTaskVersions.v1, log))); + return process.nextTick(cb, null, tasks.map(t => + this._taskToMessage(t, lifecycleTaskVersions.v1, log))); } return async.mapLimit(tasks, 10, (t, taskDone) => @@ -372,8 +376,11 @@ class LifecycleConductor { processBuckets(cb) { const log = this.logger.newRequestLogger(); - const start = new Date(); + const start = Date.now(); + const scanId = uuid(); + let scanStarted = false; let nBucketsQueued = 0; + let totalBucketsListed = 0; let messageDeliveryReports = 0; const messageSendQueue = async.cargo((tasks, done) => { @@ -400,16 +407,18 @@ class LifecycleConductor { } return true; }))), - (tasksWithAccountId, next) => this._createBucketTaskMessages(tasksWithAccountId, log, next), + (tasksWithAccountId, next) => this._createBucketTaskMessages( + tasksWithAccountId, log, next), ], (err, messages) => { nBucketsQueued += tasks.length; log.info('bucket push progress', { + conductorScanId: scanId, nBucketsQueued, bucketsInCargo: tasks.length, kafkaBucketMessagesDeliveryReports: messageDeliveryReports, - kafkaEnqueueRateHz: Math.round(nBucketsQueued * 1000 / (new Date() - start)), + kafkaEnqueueRateHz: Math.round(nBucketsQueued * 1000 / (Date.now() - start)), }); this._accountIdCache.expireOldest(); @@ -428,22 +437,36 @@ class LifecycleConductor { // fallback to V1 listings next => this._indexesGetInProgressJobs(log, () => next(null)), next => { + scanStarted = true; + this._currentScanId = scanId; + log.addDefaultFields({ conductorScanId: scanId }); + LifecycleMetrics.onProcessBuckets(log); this._batchInProgress = true; - log.info('starting new lifecycle batch', { bucketSource: this._bucketSource }); - this.listBuckets(messageSendQueue, log, (err, nBucketsListed) => { + log.info('starting new lifecycle batch', { + bucketSource: this._bucketSource, + conductorScanId: scanId, + }); + this.listBuckets(messageSendQueue, scanId, log, (err, listedBucketsCount) => { LifecycleMetrics.onBucketListing(log, err); - return next(err, nBucketsListed); + totalBucketsListed = listedBucketsCount; + return next(err); }); }, - (nBucketsListed, next) => { + next => { async.until( - () => nBucketsQueued === nBucketsListed, + () => nBucketsQueued === totalBucketsListed, unext => setTimeout(unext, 1000), next); }, ], err => { + const elapsedMs = Date.now() - start; + const unknownCanonicalIds = this._accountIdCache.getMisses(); if (err && err.Throttling) { - log.info('not starting new lifecycle batch', { reason: err }); + log.info('not starting new lifecycle batch', { + reason: err, + conductorScanId: scanId, + fullScanElapsedMs: elapsedMs, + }); if (cb) { cb(err); } @@ -453,31 +476,52 @@ class LifecycleConductor { this.activeIndexingJobsRetrieved = false; this.activeIndexingJobs = []; this._batchInProgress = false; - const unknownCanonicalIds = this._accountIdCache.getMisses(); if (err) { - log.error('lifecycle batch failed', { error: err, unknownCanonicalIds }); + log.error('lifecycle batch failed', { + error: err, + unknownCanonicalIds, + conductorScanId: scanId, + fullScanElapsedMs: elapsedMs, + totalBucketsListed, + nBucketsQueued, + }); + if (scanStarted) { + this._currentScanId = null; + } if (cb) { cb(err); } return; } - log.info('finished pushing lifecycle batch', { nBucketsQueued, unknownCanonicalIds }); - LifecycleMetrics.onProcessBuckets(log); + LifecycleMetrics.onConductorScanComplete( + log, + totalBucketsListed, + elapsedMs / 1000, + ); + + log.info('finished pushing lifecycle batch', { + nBucketsQueued, + unknownCanonicalIds, + conductorScanId: scanId, + fullScanElapsedMs: elapsedMs, + totalBucketsListed, + }); + this._currentScanId = null; if (cb) { cb(null, nBucketsQueued); } }); } - listBuckets(queue, log, cb) { + listBuckets(queue, scanId, log, cb) { if (this._bucketSource === 'zookeeper') { - return this.listZookeeperBuckets(queue, log, cb); + return this.listZookeeperBuckets(queue, scanId, log, cb); } if (this._bucketSource === 'mongodb') { - return this.listMongodbBuckets(queue, log, cb); + return this.listMongodbBuckets(queue, scanId, log, cb); } return this.restoreBucketCheckpoint((err, marker) => { @@ -485,11 +529,11 @@ class LifecycleConductor { return cb(err); } - return this.listBucketdBuckets(queue, marker || null, log, cb); + return this.listBucketdBuckets(queue, marker || null, scanId, log, cb); }); } - listZookeeperBuckets(queue, log, cb) { + listZookeeperBuckets(queue, scanId, log, cb) { const zkBucketsPath = this.getBucketsZkPath(); this._zkClient.getChildren( zkBucketsPath, @@ -498,7 +542,7 @@ class LifecycleConductor { if (err) { log.error( 'error getting list of buckets from zookeeper', - { zkPath: zkBucketsPath, error: err.message }); + { zkPath: zkBucketsPath, error: err.message, conductorScanId: scanId }); return cb(err); } @@ -509,7 +553,7 @@ class LifecycleConductor { if (!canonicalId || !bucketUID || !bucketName) { log.error( 'malformed zookeeper bucket entry, skipping', - { zkPath: zkBucketsPath, bucket }); + { zkPath: zkBucketsPath, bucket, conductorScanId: scanId }); return false; } @@ -572,11 +616,11 @@ class LifecycleConductor { }); } - listBucketdBuckets(queue, initMarker, log, cb) { + listBucketdBuckets(queue, initMarker, scanId, log, cb) { let isTruncated = true; let marker = initMarker; let nEnqueued = 0; - const start = new Date(); + const start = Date.now(); const retryWrapper = new BackbeatTask(this.lcConfig.conductor.retry); this.lastSentId = null; @@ -589,8 +633,9 @@ class LifecycleConductor { nEnqueuedToDownstream: nEnqueued, inFlight: queue.length(), maxInFlight: this._maxInFlightBatchSize, - bucketListingPushRateHz: Math.round(nEnqueued * 1000 / (new Date() - start)), + bucketListingPushRateHz: Math.round(nEnqueued * 1000 / (Date.now() - start)), breakerState, + conductorScanId: scanId, }; if (queue.length() > this._maxInFlightBatchSize || @@ -672,9 +717,9 @@ class LifecycleConductor { ); } - listMongodbBuckets(queue, log, cb) { + listMongodbBuckets(queue, scanId, log, cb) { let nEnqueued = 0; - const start = new Date(); + const start = Date.now(); const lastEntryPath = this.getBucketProgressZkPath(); let lastSentId = null; @@ -744,8 +789,9 @@ class LifecycleConductor { nEnqueuedToDownstream: nEnqueued, inFlight: queue.length(), maxInFlight: this._maxInFlightBatchSize, - enqueueRateHz: Math.round(nEnqueued * 1000 / (new Date() - start)), + enqueueRateHz: Math.round(nEnqueued * 1000 / (Date.now() - start)), breakerState, + conductorScanId: scanId, }; if (queue.length() > this._maxInFlightBatchSize || From a1661ea921e4c2b4f35e195a4fa1c56c8a4aebcc Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Wed, 29 Apr 2026 18:32:44 +0200 Subject: [PATCH 02/10] Propagate conductor scan ids through lifecycle tasks. Issue: BB-740 --- .../tasks/LifecycleDeleteObjectTask.js | 6 ++++ extensions/lifecycle/tasks/LifecycleTask.js | 31 ++++++++++++++++++- extensions/lifecycle/tasks/LifecycleTaskV2.js | 22 +++++++++++-- .../tasks/LifecycleUpdateExpirationTask.js | 6 ++++ .../tasks/LifecycleUpdateTransitionTask.js | 6 ++++ 5 files changed, 67 insertions(+), 4 deletions(-) diff --git a/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js b/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js index 7139d8724..c0c288266 100644 --- a/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js +++ b/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js @@ -304,10 +304,16 @@ class LifecycleDeleteObjectTask extends BackbeatTask { processActionEntry(entry, done) { const startTime = Date.now(); const log = this.logger.newRequestLogger(); + const conductorScanId = entry.getContextAttribute( + 'conductorScanId'); + if (conductorScanId) { + log.addDefaultFields({ conductorScanId }); + } entry.addLoggedAttributes({ bucketName: 'target.bucket', objectKey: 'target.key', versionId: 'target.version', + conductorScanId: 'contextInfo.conductorScanId', }); return async.series([ diff --git a/extensions/lifecycle/tasks/LifecycleTask.js b/extensions/lifecycle/tasks/LifecycleTask.js index 591ce63e8..37a48d0a0 100644 --- a/extensions/lifecycle/tasks/LifecycleTask.js +++ b/extensions/lifecycle/tasks/LifecycleTask.js @@ -236,7 +236,10 @@ class LifecycleTask extends BackbeatTask { } const entry = Object.assign({}, bucketData, { - contextInfo: { reqId: log.getSerializedUids() }, + contextInfo: { + reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo?.conductorScanId, + }, details: { marker }, }); this._sendBucketEntry(entry, err => { @@ -411,6 +414,8 @@ class LifecycleTask extends BackbeatTask { const entry = Object.assign({}, bucketData, { contextInfo: { reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }, details: { keyMarker: data.NextKeyMarker, @@ -497,6 +502,8 @@ class LifecycleTask extends BackbeatTask { const entry = Object.assign({}, bucketData, { contextInfo: { reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }, details: { keyMarker: data.NextKeyMarker, @@ -1040,6 +1047,8 @@ class LifecycleTask extends BackbeatTask { origin: 'lifecycle', ruleType: 'expiration', reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) @@ -1068,6 +1077,8 @@ class LifecycleTask extends BackbeatTask { origin: 'lifecycle', ruleType: 'expiration', reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) @@ -1174,6 +1185,7 @@ class LifecycleTask extends BackbeatTask { origin: 'lifecycle', ruleType: 'transition', reqId: log.getSerializedUids(), + conductorScanId: params.conductorScanId, }); entry.setAttribute('source', { bucket: params.bucket, @@ -1374,6 +1386,8 @@ class LifecycleTask extends BackbeatTask { site: rules[ncvt].StorageClass, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( { Days: rules[ncvt][ncd] }, staleDate), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }, log, cb); return; } @@ -1450,6 +1464,8 @@ class LifecycleTask extends BackbeatTask { origin: 'lifecycle', ruleType: 'expiration', reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) @@ -1519,6 +1535,8 @@ class LifecycleTask extends BackbeatTask { origin: 'lifecycle', ruleType: 'expiration', reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) @@ -1606,6 +1624,8 @@ class LifecycleTask extends BackbeatTask { site: rules.Transition.StorageClass, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( rules.Transition, object.LastModified), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }, log, done); } return done(); @@ -1709,6 +1729,8 @@ class LifecycleTask extends BackbeatTask { encodedVersionId: undefined, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( rules.Transition, version.LastModified), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }, log, done); } @@ -1758,6 +1780,8 @@ class LifecycleTask extends BackbeatTask { origin: 'lifecycle', ruleType: 'expiration', reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) @@ -1827,6 +1851,11 @@ class LifecycleTask extends BackbeatTask { processBucketEntry(bucketLCRules, bucketData, s3target, backbeatMetadataProxy, nbRetries, done) { const log = this.log.newRequestLogger(); + const conductorScanId = bucketData.contextInfo + && bucketData.contextInfo.conductorScanId; + if (conductorScanId) { + log.addDefaultFields({ conductorScanId }); + } this.s3target = s3target; this.backbeatMetadataProxy = backbeatMetadataProxy; if (!this.backbeatMetadataProxy) { diff --git a/extensions/lifecycle/tasks/LifecycleTaskV2.js b/extensions/lifecycle/tasks/LifecycleTaskV2.js index f7ca26652..4a4d8b0ac 100644 --- a/extensions/lifecycle/tasks/LifecycleTaskV2.js +++ b/extensions/lifecycle/tasks/LifecycleTaskV2.js @@ -52,7 +52,11 @@ class LifecycleTaskV2 extends LifecycleTask { } = l; const entry = Object.assign({}, bucketData, { - contextInfo: { requestId: log.getSerializedUids() }, + contextInfo: { + requestId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, + }, details: { beforeDate, prefix, listType, storageClass }, }); @@ -115,7 +119,11 @@ class LifecycleTaskV2 extends LifecycleTask { // re-queue truncated listing only once. if (isTruncated && nbRetries === 0) { const entry = Object.assign({}, bucketData, { - contextInfo: { requestId: log.getSerializedUids() }, + contextInfo: { + requestId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, + }, details: { beforeDate: params.BeforeDate, prefix: params.Prefix, @@ -199,7 +207,11 @@ class LifecycleTaskV2 extends LifecycleTask { // re-queue truncated listing only once. if (isTruncated && nbRetries === 0) { const entry = Object.assign({}, bucketData, { - contextInfo: { requestId: log.getSerializedUids() }, + contextInfo: { + requestId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, + }, details: { beforeDate: params.BeforeDate, prefix: params.Prefix, @@ -350,6 +362,8 @@ class LifecycleTaskV2 extends LifecycleTask { site: rules.Transition.StorageClass, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( rules.Transition, obj.LastModified), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }, log, cb); } @@ -426,6 +440,8 @@ class LifecycleTaskV2 extends LifecycleTask { origin: 'lifecycle', ruleType: 'expiration', reqId: log.getSerializedUids(), + conductorScanId: bucketData.contextInfo + && bucketData.contextInfo.conductorScanId, }) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) diff --git a/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js b/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js index a94928ab9..38895a59c 100644 --- a/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js +++ b/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js @@ -150,10 +150,16 @@ class LifecycleUpdateExpirationTask extends BackbeatTask { processActionEntry(entry, done) { const startTime = Date.now(); const log = this.logger.newRequestLogger(); + const conductorScanId = entry.getContextAttribute( + 'conductorScanId'); + if (conductorScanId) { + log.addDefaultFields({ conductorScanId }); + } entry.addLoggedAttributes({ bucketName: 'target.bucket', objectKey: 'target.key', versionId: 'target.version', + conductorScanId: 'contextInfo.conductorScanId', }); async.waterfall([ diff --git a/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js b/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js index 55820bad8..74424fe08 100644 --- a/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js +++ b/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js @@ -260,10 +260,16 @@ class LifecycleUpdateTransitionTask extends BackbeatTask { */ processActionEntry(entry, done) { const log = this.logger.newRequestLogger(); + const conductorScanId = entry.getContextAttribute( + 'conductorScanId'); + if (conductorScanId) { + log.addDefaultFields({ conductorScanId }); + } entry.addLoggedAttributes({ bucketName: 'target.bucket', objectKey: 'target.key', versionId: 'target.version', + conductorScanId: 'contextInfo.conductorScanId', eTag: 'target.eTag', lastModified: 'target.lastModified', }); From 592d9119a88f4a5c599f7d4725e7c51eeda75ca3 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Wed, 29 Apr 2026 18:32:45 +0200 Subject: [PATCH 03/10] Update lifecycle scan dashboards. Issue: BB-740 --- monitoring/lifecycle/dashboard.json | 346 +++++++++++++++++++++++++++- monitoring/lifecycle/dashboard.py | 96 ++++++++ 2 files changed, 436 insertions(+), 6 deletions(-) diff --git a/monitoring/lifecycle/dashboard.json b/monitoring/lifecycle/dashboard.json index a714de95e..998cb5c69 100644 --- a/monitoring/lifecycle/dashboard.json +++ b/monitoring/lifecycle/dashboard.json @@ -4345,6 +4345,340 @@ "transparent": false, "type": "timeseries" }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Duration of the most recently completed conductor scan, as measured by the conductor itself at scan completion.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": null, + "mappings": [], + "noValue": "-", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "orange", + "index": 1, + "line": true, + "op": "gt", + "value": 60.0, + "yaxis": "left" + }, + { + "color": "red", + "index": 2, + "line": true, + "op": "gt", + "value": 300.0, + "yaxis": "left" + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 111 + }, + "hideTimeOverride": false, + "id": 50, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "s3_lifecycle_conductor_last_batch_duration_seconds{job=\"${job_lifecycle_producer}\", namespace=\"${namespace}\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Last Conductor Batch Duration", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of buckets listed during the latest conductor scan.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-blue", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 111 + }, + "hideTimeOverride": false, + "id": 51, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": null, + "expr": "s3_lifecycle_latest_batch_bucket_count{job=\"${job_lifecycle_producer}\", namespace=\"${namespace}\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Buckets Listed", + "transformations": [], + "transparent": false, + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Rate of bucket-tasks topic messages successfully processed by conductor scan id and bucket processor pod. Multiple scan ids active at the same time indicate overlapping conductor scans, which should not happen in normal operation. Each message corresponds to a single listing slice (initial or continuation), not a unique bucket.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 115 + }, + "hideTimeOverride": false, + "id": 52, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by (pod, conductor_scan_id)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}} / {{conductor_scan_id}}", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Bucket Processor Messages Processed by Scan", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Message processing rate grouped by conductor_scan_id across all bucket-processor pods. Should normally show one active scan id; multiple active scan ids indicate overlapping conductor scans.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 2, + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": {}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 115 + }, + "hideTimeOverride": false, + "id": 53, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by (conductor_scan_id)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{conductor_scan_id}}", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Bucket Processor Active Conductor Scans", + "transformations": [], + "transparent": false, + "type": "timeseries" + }, { "datasource": "${DS_PROMETHEUS}", "editable": true, @@ -4442,10 +4776,10 @@ "h": 7, "w": 8, "x": 0, - "y": 111 + "y": 122 }, "hideTimeOverride": false, - "id": 50, + "id": 54, "links": [], "maxDataPoints": 100, "options": { @@ -4555,10 +4889,10 @@ "h": 7, "w": 8, "x": 8, - "y": 111 + "y": 122 }, "hideTimeOverride": false, - "id": 51, + "id": 55, "links": [], "maxDataPoints": 100, "options": { @@ -4640,10 +4974,10 @@ "h": 7, "w": 8, "x": 16, - "y": 111 + "y": 122 }, "hideTimeOverride": false, - "id": 52, + "id": 56, "links": [], "maxDataPoints": 100, "options": { diff --git a/monitoring/lifecycle/dashboard.py b/monitoring/lifecycle/dashboard.py index bf11220ac..249087371 100644 --- a/monitoring/lifecycle/dashboard.py +++ b/monitoring/lifecycle/dashboard.py @@ -52,6 +52,16 @@ class Metrics: job='${job_lifecycle_producer}', namespace='${namespace}', ) + LATEST_BATCH_END_TIME = metrics.Metric( + 's3_lifecycle_latest_batch_end_time', + job='${job_lifecycle_producer}', namespace='${namespace}', + ) + + CONDUCTOR_LAST_BATCH_DURATION = metrics.Metric( + 's3_lifecycle_conductor_last_batch_duration_seconds', + job='${job_lifecycle_producer}', namespace='${namespace}', + ) + BUCKET_LISTING_SUCCESS, BUCKET_LISTING_ERROR, BUCKET_LISTING_THROTTLING = [ metrics.CounterMetric( name, job='${job_lifecycle_producer}', namespace='${namespace}', @@ -73,6 +83,16 @@ class Metrics: 'status', job='${job_lifecycle_producer}', namespace='${namespace}', ) + LATEST_BATCH_BUCKET_COUNT = metrics.Metric( + 's3_lifecycle_latest_batch_bucket_count', + job='${job_lifecycle_producer}', namespace='${namespace}', + ) + + BUCKET_PROCESSOR_SCAN_MESSAGES_PROCESSED = metrics.CounterMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total', + 'conductor_scan_id', job='${job_lifecycle_bucket_processor}', namespace='${namespace}', + ) + S3_OPS = metrics.CounterMetric( 's3_lifecycle_s3_operations_total', 'origin', 'op', 'status', job=['$jobs'], namespace='${namespace}', @@ -730,6 +750,80 @@ def color_override(name, color): ] ) +lifecycle_full_scan_elapsed = Stat( + title="Last Conductor Batch Duration", + description="Duration of the most recently completed conductor scan, " + "as measured by the conductor itself at scan completion.", + dataSource="${DS_PROMETHEUS}", + reduceCalc="last", + format=UNITS.SECONDS, + noValue='-', + targets=[ + Target( + expr=Metrics.CONDUCTOR_LAST_BATCH_DURATION(), + ), + ], + thresholds=[ + Threshold('green', 0, 0.0), + Threshold('orange', 1, 60.0), + Threshold('red', 2, 300.0), + ], +) + +lifecycle_scan_count_buckets = Stat( + title="Buckets Listed", + description="Number of buckets listed during the latest conductor scan.", + dataSource="${DS_PROMETHEUS}", + reduceCalc="last", + decimals=0, + noValue='0', + targets=[ + Target( + expr=Metrics.LATEST_BATCH_BUCKET_COUNT(), + ), + ], + thresholds=[ + Threshold('semi-dark-blue', 0, 0.0), + ], +) + +bucket_processor_messages_processed = TimeSeries( + title="Bucket Processor Messages Processed by Scan", + description="Rate of bucket-tasks topic messages successfully processed " + "by conductor scan id and bucket processor pod. Multiple scan ids " + "active at the same time indicate overlapping conductor scans, which " + "should not happen in normal operation. Each message corresponds to " + "a single listing slice (initial or continuation), not a unique " + "bucket.", + dataSource="${DS_PROMETHEUS}", + lineInterpolation="smooth", + decimals=0, + targets=[ + Target( + expr='sum(rate(' + Metrics.BUCKET_PROCESSOR_SCAN_MESSAGES_PROCESSED() + ')) ' + 'by (pod, conductor_scan_id)', + legendFormat='{{pod}} / {{conductor_scan_id}}', + ), + ], +) + +bucket_processor_active_scan_ids = TimeSeries( + title="Bucket Processor Active Conductor Scans", + description="Message processing rate grouped by conductor_scan_id across " + "all bucket-processor pods. Should normally show one active scan id; " + "multiple active scan ids indicate overlapping conductor scans.", + dataSource="${DS_PROMETHEUS}", + lineInterpolation="smooth", + decimals=0, + targets=[ + Target( + expr='sum(rate(' + Metrics.BUCKET_PROCESSOR_SCAN_MESSAGES_PROCESSED() + ')) ' + 'by (conductor_scan_id)', + legendFormat='{{conductor_scan_id}}', + ), + ], +) + active_indexing_jobs = TimeSeries( title="Active Indexing jobs", dataSource="${DS_PROMETHEUS}", @@ -897,6 +991,8 @@ def color_override(name, color): layout.row([s3_delete_object_ops, s3_delete_mpu_ops], height=8), RowPanel(title="Lifecycle Conductor"), layout.row([lifecycle_scans, trigger_latency], height=7), + layout.row([lifecycle_full_scan_elapsed, lifecycle_scan_count_buckets], height=4), + layout.row([bucket_processor_messages_processed, bucket_processor_active_scan_ids], height=7), layout.row([lifecycle_scan_rate, active_indexing_jobs, legacy_tasks], height=7), ]), ) From d064eec0a9de3f9e84ac48318520d21305cd3697 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Wed, 29 Apr 2026 18:32:45 +0200 Subject: [PATCH 04/10] Clarify lifecycle scan alert wording. Issue: BB-740 --- monitoring/lifecycle/alerts.test.yaml | 111 +++++++++++++++++--------- monitoring/lifecycle/alerts.yaml | 64 ++++++++++++++- 2 files changed, 135 insertions(+), 40 deletions(-) diff --git a/monitoring/lifecycle/alerts.test.yaml b/monitoring/lifecycle/alerts.test.yaml index d8cbe6988..48e841c21 100644 --- a/monitoring/lifecycle/alerts.test.yaml +++ b/monitoring/lifecycle/alerts.test.yaml @@ -328,8 +328,8 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 4m exp_alerts: @@ -337,14 +337,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 5m exp_alerts: @@ -352,14 +352,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 6m exp_alerts: @@ -367,8 +367,8 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 7m exp_alerts: @@ -376,8 +376,8 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 8m exp_alerts: @@ -385,14 +385,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 9m exp_alerts: @@ -400,14 +400,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 10m exp_alerts: @@ -415,14 +415,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 11m exp_alerts: @@ -430,14 +430,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 12m exp_alerts: @@ -445,14 +445,14 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - exp_labels: severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 3m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 3m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 13m exp_alerts: @@ -460,12 +460,51 @@ tests: severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer - description: "Last lifecycle scan was performed more than 2m 0s ago." - summary: "Lifecycle scan not executed in time" + description: "Last lifecycle scan was scheduled more than 2m 0s ago." + summary: "Lifecycle scan not scheduled in time" - alertname: LifecycleLateScan eval_time: 14m exp_alerts: [] + - name: LifecycleStuckScan + interval: 1m + input_series: + - series: s3_lifecycle_latest_batch_start_time{namespace="zenko",job="artesca-data-backbeat-lifecycle-producer-headless",pod="foo"} + values: 60000+0x5 360000 + - series: s3_lifecycle_latest_batch_end_time{namespace="zenko",job="artesca-data-backbeat-lifecycle-producer-headless",pod="foo"} + values: 0+0x5 360000 + alert_rule_test: + - alertname: LifecycleStuckScan + eval_time: 2m + exp_alerts: [] + - alertname: LifecycleStuckScan + eval_time: 4m + exp_alerts: + - exp_labels: + severity: warning + exp_annotations: + zenko_service: backbeat-lifecycle-producer + description: "Last lifecycle scan has not completed more than 2m 0s after it was scheduled." + summary: "Lifecycle scan did not complete in time" + - alertname: LifecycleStuckScan + eval_time: 5m + exp_alerts: + - exp_labels: + severity: warning + exp_annotations: + zenko_service: backbeat-lifecycle-producer + description: "Last lifecycle scan has not completed more than 2m 0s after it was scheduled." + summary: "Lifecycle scan did not complete in time" + - exp_labels: + severity: critical + exp_annotations: + zenko_service: backbeat-lifecycle-producer + description: "Last lifecycle scan has not completed more than 3m 0s after it was scheduled." + summary: "Lifecycle scan did not complete in time" + - alertname: LifecycleStuckScan + eval_time: 6m + exp_alerts: [] + - name: LifecycleLatency interval: 10m input_series: diff --git a/monitoring/lifecycle/alerts.yaml b/monitoring/lifecycle/alerts.yaml index 99f969ff3..114df6747 100644 --- a/monitoring/lifecycle/alerts.yaml +++ b/monitoring/lifecycle/alerts.yaml @@ -57,9 +57,9 @@ groups: Annotations: zenko_service: backbeat-lifecycle-producer description: >- - Last lifecycle scan was performed more than + Last lifecycle scan was scheduled more than {{ ${lifecycle_latency_warning_threshold} | humanizeDuration }} ago. - summary: "Lifecycle scan not executed in time" + summary: "Lifecycle scan not scheduled in time" - alert: LifecycleLateScan Expr: | @@ -78,9 +78,65 @@ groups: Annotations: zenko_service: backbeat-lifecycle-producer description: >- - Last lifecycle scan was performed more than + Last lifecycle scan was scheduled more than {{ ${lifecycle_latency_critical_threshold} | humanizeDuration }} ago. - summary: "Lifecycle scan not executed in time" + summary: "Lifecycle scan not scheduled in time" + + - alert: LifecycleStuckScan + Expr: | + ( + max(s3_lifecycle_latest_batch_start_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + }) + > + max(s3_lifecycle_latest_batch_end_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + } or vector(0)) + ) + and + ( + time() - ( + max(s3_lifecycle_latest_batch_start_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + }) / 1000 + ) + ) / ${lifecycle_latency_warning_threshold} > 1 + Labels: + severity: warning + Annotations: + zenko_service: backbeat-lifecycle-producer + description: >- + Last lifecycle scan has not completed more than + {{ ${lifecycle_latency_warning_threshold} | humanizeDuration }} after it was scheduled. + summary: "Lifecycle scan did not complete in time" + + - alert: LifecycleStuckScan + Expr: | + ( + max(s3_lifecycle_latest_batch_start_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + }) + > + max(s3_lifecycle_latest_batch_end_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + } or vector(0)) + ) + and + ( + time() - ( + max(s3_lifecycle_latest_batch_start_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + }) / 1000 + ) + ) / ${lifecycle_latency_critical_threshold} > 1 + Labels: + severity: critical + Annotations: + zenko_service: backbeat-lifecycle-producer + description: >- + Last lifecycle scan has not completed more than + {{ ${lifecycle_latency_critical_threshold} | humanizeDuration }} after it was scheduled. + summary: "Lifecycle scan did not complete in time" - name: LifecycleBucketProcessor rules: From 5561284bc55ec233d5ac98511cd87e753d351a9e Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Wed, 29 Apr 2026 18:32:45 +0200 Subject: [PATCH 05/10] Test lifecycle conductor scan monitoring. Issue: BB-740 --- .../lifecycle/LifecycleConductor.spec.js | 12 +- .../unit/lifecycle/LifecycleConductor.spec.js | 137 ++++++++++++++-- tests/unit/lifecycle/LifecycleMetrics.spec.js | 155 ++++++++++++++++++ tests/utils/BackbeatTestConsumer.js | 8 + 4 files changed, 291 insertions(+), 21 deletions(-) diff --git a/tests/functional/lifecycle/LifecycleConductor.spec.js b/tests/functional/lifecycle/LifecycleConductor.spec.js index 9943ab4c2..68b525254 100644 --- a/tests/functional/lifecycle/LifecycleConductor.spec.js +++ b/tests/functional/lifecycle/LifecycleConductor.spec.js @@ -41,7 +41,7 @@ const expected2Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id' }, + contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, target: { bucket: 'bucket1', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -49,7 +49,7 @@ const expected2Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id' }, + contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, target: { bucket: 'bucket1-2', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -60,7 +60,7 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id' }, + contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, target: { bucket: 'bucket1', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -68,7 +68,7 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id' }, + contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, target: { bucket: 'bucket1-2', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -76,7 +76,7 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id' }, + contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, target: { bucket: 'bucket3', owner: 'owner3', taskVersion: version }, details: {}, }, @@ -84,7 +84,7 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id' }, + contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, target: { bucket: 'bucket4', owner: 'owner4', taskVersion: version }, details: {}, }, diff --git a/tests/unit/lifecycle/LifecycleConductor.spec.js b/tests/unit/lifecycle/LifecycleConductor.spec.js index 598d0b4df..f75ebaf9a 100644 --- a/tests/unit/lifecycle/LifecycleConductor.spec.js +++ b/tests/unit/lifecycle/LifecycleConductor.spec.js @@ -3,10 +3,12 @@ const assert = require('assert'); const sinon = require('sinon'); const fakeLogger = require('../../utils/fakeLogger'); +const { errors } = require('arsenal'); const { splitter } = require('arsenal').constants; const LifecycleConductor = require( '../../../extensions/lifecycle/conductor/LifecycleConductor'); +const { LifecycleMetrics } = require('../../../extensions/lifecycle/LifecycleMetrics'); const { lifecycleTaskVersions, indexesForFeature @@ -190,6 +192,103 @@ describe('Lifecycle Conductor', () => { conductor.processBuckets(done); }); + it('should publish full scan metrics at end of scan', done => { + conductor._mongodbClient = { + getIndexingJobs: (_, cb) => cb(null, ['job1']), + getCollection: () => ({ + find: () => ({ + project: () => ({ + hasNext: () => Promise.resolve(false), + }), + }), + }), + }; + conductor._zkClient = { + getData: (_, cb) => cb(null, null, null), + setData: (path, data, version, cb) => cb(null, { version: 1 }), + }; + + sinon.stub(conductor, '_controlBacklog').callsFake(cb => cb(null)); + const metricStub = sinon.stub(LifecycleMetrics, 'onConductorScanComplete'); + + conductor.processBuckets(err => { + assert.ifError(err); + assert(metricStub.calledOnce); + const [, bucketCount, durationSeconds] = metricStub.firstCall.args; + assert.strictEqual(bucketCount, 0); + assert.strictEqual(typeof durationSeconds, 'number'); + assert(Number.isFinite(durationSeconds) && durationSeconds >= 0, + `durationSeconds should be a finite non-negative number, got ${durationSeconds}`); + done(); + }); + }); + + it('should keep the in-flight scan id when backlog control throttles a new scan', done => { + const inFlightScanId = 'in-flight-scan-id'; + conductor._currentScanId = inFlightScanId; + conductor._batchInProgress = true; + + sinon.stub(conductor, '_controlBacklog') + .callsFake(cb => cb(errors.Throttling.customizeDescription('Batch in progress'))); + const onProcessBucketsStub = sinon.stub(LifecycleMetrics, 'onProcessBuckets'); + + conductor.processBuckets(err => { + assert(err && err.Throttling); + assert.strictEqual(conductor._currentScanId, inFlightScanId); + assert(onProcessBucketsStub.notCalled); + done(); + }); + }); + + it('should generate a conductorScanId', done => { + conductor._mongodbClient = { + getIndexingJobs: (_, cb) => cb(null, []), + getCollection: () => ({ + find: () => ({ + project: () => ({ + hasNext: () => Promise.resolve(false), + }), + }), + }), + }; + conductor._zkClient = { + getData: (_, cb) => cb(null, null, null), + setData: (path, data, version, cb) => cb(null, { version: 1 }), + }; + conductor._producer = { send: (msg, cb) => cb(null, {}) }; + const addDefaultFieldsStub = sinon.stub(); + const testLog = conductor.logger.newRequestLogger(); + testLog.addDefaultFields = addDefaultFieldsStub; + sinon.stub(conductor.logger, 'newRequestLogger').returns(testLog); + + sinon.stub(conductor, '_controlBacklog').callsFake(cb => cb(null)); + let capturedScanId = null; + sinon.stub(conductor, 'listBuckets') + .callsFake((queue, scanId, log, cb) => { + // Verify scanId is a valid UUID + const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; + assert(uuidRegex.test(scanId), + `conductorScanId should be a valid UUID v4, got: ${scanId}`); + capturedScanId = scanId; + assert(addDefaultFieldsStub.calledOnce); + assert.strictEqual(addDefaultFieldsStub.firstCall.args[0].conductorScanId, scanId); + cb(null, 0); + }); + const metricStub = sinon.stub(LifecycleMetrics, 'onConductorScanComplete'); + + conductor.processBuckets(err => { + assert.ifError(err); + assert(metricStub.calledOnce); + const logPassedToMetric = metricStub.firstCall.args[0]; + assert.strictEqual(logPassedToMetric, testLog); + assert(capturedScanId); + const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; + assert(uuidRegex.test(capturedScanId), + `captured conductorScanId should be a valid UUID v4, got: ${capturedScanId}`); + done(); + }); + }); + // tests that `activeIndexingJobRetrieved` is not reset until the e it('should not reset `activeIndexingJobsRetrieved` while async operations are in progress', done => { const order = []; @@ -202,7 +301,7 @@ describe('Lifecycle Conductor', () => { sinon.stub(conductor._mongodbClient, 'getIndexingJobs') .callsFake((_, cb) => cb(null, ['job1', 'job2'])); sinon.stub(conductor, 'listBuckets') - .callsFake((mQueue, log, cb) => { + .callsFake((mQueue, scanId, log, cb) => { mQueue.push({ bucketName: 'testbucket', canonicalId: 'testId', @@ -244,6 +343,14 @@ describe('Lifecycle Conductor', () => { }); describe('_indexesGetOrCreate', () => { + it('should include conductor scan id in task context', () => { + conductor._currentScanId = 'scan-id-test'; + const taskMessage = conductor._taskToMessage( + getTask(true), lifecycleTaskVersions.v2, log); + const parsed = JSON.parse(taskMessage.message); + assert.strictEqual(parsed.contextInfo.conductorScanId, 'scan-id-test'); + }); + it('should return v2 for bucketd bucket sources', done => { conductor._bucketSource = 'bucketd'; conductor._indexesGetOrCreate(getTask(undefined), log, (err, taskVersion) => { @@ -435,7 +542,7 @@ describe('Lifecycle Conductor', () => { const lcConductor = makeLifecycleConductorWithFilters({ bucketSource: 'zookeeper', }); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 2); assert.strictEqual(queue.length(), 2); const expectedQueue = [ @@ -458,7 +565,7 @@ describe('Lifecycle Conductor', () => { const lcConductor = makeLifecycleConductorWithFilters({ bucketSource: 'bucketd', }, markers); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 2); assert.strictEqual(queue.length(), 2); @@ -489,7 +596,7 @@ describe('Lifecycle Conductor', () => { 'invalid:bucketuid789', 'invalid', ]); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 2); assert.strictEqual(queue.length(), 2); const expectedQueue = [ @@ -512,7 +619,7 @@ describe('Lifecycle Conductor', () => { bucketsDenied: [bucket1], bucketSource: 'zookeeper', }); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 1); assert.strictEqual(queue.length(), 1); const expectedQueue = [ @@ -532,7 +639,7 @@ describe('Lifecycle Conductor', () => { bucketsDenied: [bucket1], bucketSource: 'bucketd', }, markers); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 1); assert.strictEqual(queue.length(), 1); const expectedQueue = [ @@ -553,7 +660,7 @@ describe('Lifecycle Conductor', () => { accountsDenied: [`${accountName1}:${account1}`], bucketSource: 'zookeeper', }); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 1); assert.strictEqual(queue.length(), 1); const expectedQueue = [ @@ -573,7 +680,7 @@ describe('Lifecycle Conductor', () => { accountsDenied: [`${accountName1}:${account1}`], bucketSource: 'bucketd', }, markers); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 1); assert.strictEqual(queue.length(), 1); const expectedQueue = [ @@ -595,7 +702,7 @@ describe('Lifecycle Conductor', () => { bucketsDenied: [bucket2], bucketSource: 'zookeeper', }); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 0); assert.strictEqual(queue.length(), 0); const expectedQueue = []; @@ -611,7 +718,7 @@ describe('Lifecycle Conductor', () => { bucketsDenied: [bucket2], bucketSource: 'bucketd', }, markers); - lcConductor.listBuckets(queue, fakeLogger, (err, length) => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, (err, length) => { assert.strictEqual(length, 0); assert.strictEqual(queue.length(), 0); const expectedQueue = []; @@ -636,7 +743,7 @@ describe('Lifecycle Conductor', () => { getCollection: () => ({ find: findStub }) }; sinon.stub(lcConductor._zkClient, 'getData').yields(null, null, null); - lcConductor.listBuckets(queue, fakeLogger, err => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, err => { assert.ifError(err); assert.deepEqual(findStub.getCall(0).args[0], {}); done(); @@ -658,7 +765,7 @@ describe('Lifecycle Conductor', () => { getCollection: () => ({ find: findStub }) }; sinon.stub(lcConductor._zkClient, 'getData').yields(null, null, null); - lcConductor.listBuckets(queue, fakeLogger, err => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, err => { assert.ifError(err); assert.deepEqual(findStub.getCall(0).args[0], { 'value.owner': { @@ -686,7 +793,7 @@ describe('Lifecycle Conductor', () => { getCollection: () => ({ find: findStub }) }; sinon.stub(lcConductor._zkClient, 'getData').yields(null, null, null); - lcConductor.listBuckets(queue, fakeLogger, err => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, err => { assert.ifError(err); assert.deepEqual(findStub.getCall(0).args[0], { _id: { @@ -711,7 +818,7 @@ describe('Lifecycle Conductor', () => { getCollection: () => ({ find: findStub }) }; sinon.stub(lcConductor._zkClient, 'getData').yields(null, Buffer.from('bucket1'), null); - lcConductor.listBuckets(queue, fakeLogger, err => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, err => { assert.ifError(err); assert.deepEqual(findStub.getCall(0).args[0], { _id: { @@ -738,7 +845,7 @@ describe('Lifecycle Conductor', () => { getCollection: () => ({ find: findStub }) }; sinon.stub(lcConductor._zkClient, 'getData').yields(null, Buffer.from('bucket1'), null); - lcConductor.listBuckets(queue, fakeLogger, err => { + lcConductor.listBuckets(queue, 'test-scan-id', fakeLogger, err => { assert.ifError(err); assert.deepEqual(findStub.getCall(0).args[0], { '_id': { diff --git a/tests/unit/lifecycle/LifecycleMetrics.spec.js b/tests/unit/lifecycle/LifecycleMetrics.spec.js index 2acdba249..910709d8b 100644 --- a/tests/unit/lifecycle/LifecycleMetrics.spec.js +++ b/tests/unit/lifecycle/LifecycleMetrics.spec.js @@ -79,6 +79,110 @@ describe('LifecycleMetrics', () => { })); }); + it('should catch errors in onConductorScanComplete', () => { + const metric = ZenkoMetrics.getMetric('s3_lifecycle_latest_batch_end_time'); + sinon.stub(metric, 'set').throws(new Error('Metric error')); + + LifecycleMetrics.onConductorScanComplete(log, 10); + + assert(log.error.calledOnce); + assert(log.error.calledWithMatch('failed to update prometheus metrics', { + method: 'LifecycleMetrics.onConductorScanComplete', + bucketCount: 10, + })); + }); + + it('should increment bucket processor messages processed counter by scan id', () => { + const messagesMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'); + const messagesInc = sinon.stub(messagesMetric, 'inc'); + + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); + assert(messagesInc.calledOnce); + assert(messagesInc.calledWithMatch({ + origin: 'bucket_processor', + ['conductor_scan_id']: 'scan-A', + })); + + assert(log.error.notCalled); + }); + + it('should catch errors in onBucketProcessorScanMessageProcessed', () => { + const messagesMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'); + sinon.stub(messagesMetric, 'inc').throws(new Error('Metric error')); + + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); + + assert(log.error.calledOnce); + assert(log.error.calledWithMatch('failed to update prometheus metrics', { + method: 'LifecycleMetrics.onBucketProcessorScanMessageProcessed', + conductorScanId: 'scan-A', + })); + }); + + it('should be a no-op when scan id is missing', () => { + const messagesMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'); + const incStub = sinon.stub(messagesMetric, 'inc'); + + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, undefined); + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, ''); + + assert(incStub.notCalled); + assert(log.error.notCalled); + }); + + it('should remove stale bucket processor scan series', () => { + const messagesMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'); + const removeStub = sinon.stub(messagesMetric, 'remove'); + sinon.stub(messagesMetric, 'inc'); + const now = 1700000000000; + sinon.stub(Date, 'now') + .onFirstCall().returns(now) + .onSecondCall().returns(now + 24 * 60 * 60 * 1000 + 1); + + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-B'); + + assert(removeStub.calledOnce); + assert(removeStub.calledWithMatch({ + origin: 'bucket_processor', + ['conductor_scan_id']: 'scan-A', + })); + assert(log.error.notCalled); + }); + + it('should continue stale series cleanup when one removal fails', () => { + const messagesMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'); + const removeStub = sinon.stub(messagesMetric, 'remove') + .onFirstCall().throws(new Error('Metric remove error')); + const incStub = sinon.stub(messagesMetric, 'inc'); + const now = 1700000000000; + sinon.stub(Date, 'now') + .onFirstCall().returns(now) + .onSecondCall().returns(now + 1) + .onThirdCall().returns(now + 24 * 60 * 60 * 1000 + 2); + + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-B'); + LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-C'); + + assert(incStub.calledThrice); + assert(incStub.thirdCall.calledWithMatch({ + origin: 'bucket_processor', + ['conductor_scan_id']: 'scan-C', + })); + assert(removeStub.callCount >= 2); + assert(log.error.calledOnce); + assert(log.error.calledWithMatch('failed to remove stale bucket processor scan metric')); + assert(log.error.neverCalledWithMatch('failed to update prometheus metrics', { + method: 'LifecycleMetrics.onBucketProcessorScanMessageProcessed', + })); + }); + it('should catch errors in onLifecycleTriggered', () => { LifecycleMetrics.onLifecycleTriggered(log, 'conductor', 'expiration', 'us-east-1', NaN); @@ -169,5 +273,56 @@ describe('LifecycleMetrics', () => { process: 'conductor', })); }); + + it('should set latest batch bucket count metric', () => { + const endMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_end_time'); + const countMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_bucket_count'); + const endSet = sinon.stub(endMetric, 'set'); + const countSet = sinon.stub(countMetric, 'set'); + + LifecycleMetrics.onConductorScanComplete(log, 7); + + assert(endSet.calledOnce); + assert(endSet.calledWithMatch( + { origin: 'conductor' })); + assert.strictEqual(countSet.callCount, 1); + assert(countSet.getCall(0).calledWithMatch( + { origin: 'conductor' }, 7)); + assert(log.error.notCalled); + }); + + it('should set conductor last batch duration when provided', () => { + sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_end_time'), 'set'); + sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_bucket_count'), 'set'); + const durationSet = sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_conductor_last_batch_duration_seconds'), 'set'); + + LifecycleMetrics.onConductorScanComplete(log, 5, 12.5); + + assert(durationSet.calledOnce); + assert(durationSet.calledWithMatch( + { origin: 'conductor' }, 12.5)); + assert(log.error.notCalled); + }); + + it('should not set conductor last batch duration when omitted', () => { + sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_end_time'), 'set'); + sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_bucket_count'), 'set'); + const durationSet = sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_conductor_last_batch_duration_seconds'), 'set'); + + LifecycleMetrics.onConductorScanComplete(log, 5); + LifecycleMetrics.onConductorScanComplete(log, 5, NaN); + LifecycleMetrics.onConductorScanComplete(log, 5, -1); + + assert(durationSet.notCalled); + assert(log.error.notCalled); + }); }); }); diff --git a/tests/utils/BackbeatTestConsumer.js b/tests/utils/BackbeatTestConsumer.js index 00ed404dd..a227f8365 100644 --- a/tests/utils/BackbeatTestConsumer.js +++ b/tests/utils/BackbeatTestConsumer.js @@ -32,6 +32,14 @@ class BackbeatTestConsumer extends BackbeatConsumer { // present assert(parsedMsg.contextInfo?.reqId, 'expected contextInfo.reqId field'); parsedMsg.contextInfo.reqId = expectedMsg.value.contextInfo?.reqId; + // conductorScanId is also generated per scan: check it exists, + // then normalize for comparison + if (expectedMsg.value.contextInfo?.conductorScanId === 'test-scan-id') { + assert(parsedMsg.contextInfo?.conductorScanId, + 'expected contextInfo.conductorScanId field'); + parsedMsg.contextInfo.conductorScanId = + expectedMsg.value.contextInfo.conductorScanId; + } } assert.deepStrictEqual( parsedMsg, expectedMsg.value, From 43d3f902c29332f226a30e113158f340ccbecab8 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Thu, 21 May 2026 16:35:08 +0200 Subject: [PATCH 06/10] Propagate lifecycle scan timestamps. Carry conductor scan start timestamps through lifecycle bucket messages, task contexts, and action entries for consistent downstream observability. Issue: BB-740 --- .../lifecycle/conductor/LifecycleConductor.js | 15 +- .../tasks/LifecycleDeleteObjectTask.js | 10 +- extensions/lifecycle/tasks/LifecycleTask.js | 130 +++++++----------- extensions/lifecycle/tasks/LifecycleTaskV2.js | 68 ++++----- .../tasks/LifecycleUpdateExpirationTask.js | 10 +- .../tasks/LifecycleUpdateTransitionTask.js | 10 +- .../lifecycle/LifecycleConductor.spec.js | 36 ++++- .../unit/lifecycle/LifecycleConductor.spec.js | 2 + tests/unit/lifecycle/LifecycleTask.spec.js | 96 +++++++++++++ tests/utils/BackbeatTestConsumer.js | 69 ++++++++-- tests/utils/fakeLogger.js | 1 + 11 files changed, 290 insertions(+), 157 deletions(-) diff --git a/extensions/lifecycle/conductor/LifecycleConductor.js b/extensions/lifecycle/conductor/LifecycleConductor.js index 2922d835c..d87aa07f7 100644 --- a/extensions/lifecycle/conductor/LifecycleConductor.js +++ b/extensions/lifecycle/conductor/LifecycleConductor.js @@ -107,6 +107,7 @@ class LifecycleConductor { this._initialized = false; this._batchInProgress = false; this._currentScanId = null; + this._currentScanStartTimestamp = null; // this cache only needs to be the size of one listing. // worst case scenario is 1 account per bucket: @@ -317,6 +318,7 @@ class LifecycleConductor { contextInfo: { reqId: log.getSerializedUids(), conductorScanId: this._currentScanId, + conductorScanStartTimestamp: this._currentScanStartTimestamp, }, target: { bucket: task.bucketName, @@ -439,12 +441,14 @@ class LifecycleConductor { next => { scanStarted = true; this._currentScanId = scanId; + this._currentScanStartTimestamp = start; log.addDefaultFields({ conductorScanId: scanId }); - LifecycleMetrics.onProcessBuckets(log); + LifecycleMetrics.onProcessBuckets(log, start); this._batchInProgress = true; log.info('starting new lifecycle batch', { bucketSource: this._bucketSource, conductorScanId: scanId, + conductorScanStartTimestamp: start, }); this.listBuckets(messageSendQueue, scanId, log, (err, listedBucketsCount) => { LifecycleMetrics.onBucketListing(log, err); @@ -462,6 +466,13 @@ class LifecycleConductor { const elapsedMs = Date.now() - start; const unknownCanonicalIds = this._accountIdCache.getMisses(); if (err && err.Throttling) { + if (scanStarted) { + this.activeIndexingJobsRetrieved = false; + this.activeIndexingJobs = []; + this._batchInProgress = false; + this._currentScanId = null; + this._currentScanStartTimestamp = null; + } log.info('not starting new lifecycle batch', { reason: err, conductorScanId: scanId, @@ -488,6 +499,7 @@ class LifecycleConductor { }); if (scanStarted) { this._currentScanId = null; + this._currentScanStartTimestamp = null; } if (cb) { cb(err); @@ -509,6 +521,7 @@ class LifecycleConductor { totalBucketsListed, }); this._currentScanId = null; + this._currentScanStartTimestamp = null; if (cb) { cb(null, nBucketsQueued); } diff --git a/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js b/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js index c0c288266..a8d793be4 100644 --- a/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js +++ b/extensions/lifecycle/tasks/LifecycleDeleteObjectTask.js @@ -304,16 +304,16 @@ class LifecycleDeleteObjectTask extends BackbeatTask { processActionEntry(entry, done) { const startTime = Date.now(); const log = this.logger.newRequestLogger(); - const conductorScanId = entry.getContextAttribute( - 'conductorScanId'); - if (conductorScanId) { - log.addDefaultFields({ conductorScanId }); - } + const conductorScanId = entry.getContextAttribute('conductorScanId'); + const conductorScanStartTimestamp = entry.getContextAttribute( + 'conductorScanStartTimestamp'); + log.addDefaultFields({ conductorScanId, conductorScanStartTimestamp }); entry.addLoggedAttributes({ bucketName: 'target.bucket', objectKey: 'target.key', versionId: 'target.version', conductorScanId: 'contextInfo.conductorScanId', + conductorScanStartTimestamp: 'contextInfo.conductorScanStartTimestamp', }); return async.series([ diff --git a/extensions/lifecycle/tasks/LifecycleTask.js b/extensions/lifecycle/tasks/LifecycleTask.js index 37a48d0a0..0ee99d200 100644 --- a/extensions/lifecycle/tasks/LifecycleTask.js +++ b/extensions/lifecycle/tasks/LifecycleTask.js @@ -128,6 +128,37 @@ class LifecycleTask extends BackbeatTask { }); } + _getScanContext(bucketData) { + return { + conductorScanId: bucketData?.contextInfo?.conductorScanId, + conductorScanStartTimestamp: bucketData?.contextInfo?.conductorScanStartTimestamp, + }; + } + + _getBucketEntryContext(bucketData, log) { + return { + reqId: log.getSerializedUids(), + ...this._getScanContext(bucketData), + }; + } + + _getActionContext(bucketData, log, ruleType) { + return { + origin: 'lifecycle', + ruleType, + reqId: log.getSerializedUids(), + ...this._getScanContext(bucketData), + }; + } + + _getContinuationEntry(bucketData, log, details) { + return { + ...bucketData, + contextInfo: this._getBucketEntryContext(bucketData, log), + details, + }; + } + /** * This function forces syncing the latest data mover topic * offsets to the 'lifecycle' metrics snapshot. It is called when @@ -235,13 +266,8 @@ class LifecycleTask extends BackbeatTask { marker = contents[contents.length - 1].Key; } - const entry = Object.assign({}, bucketData, { - contextInfo: { - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo?.conductorScanId, - }, - details: { marker }, - }); + const entry = this._getContinuationEntry( + bucketData, log, { marker }); this._sendBucketEntry(entry, err => { if (!err) { log.debug( @@ -411,17 +437,10 @@ class LifecycleTask extends BackbeatTask { if (data.IsTruncated && allVersions.length > 0 && nbRetries === 0) { // Uses last version whether Version or DeleteMarker const last = allVersions[allVersions.length - 1]; - const entry = Object.assign({}, bucketData, { - contextInfo: { - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }, - details: { - keyMarker: data.NextKeyMarker, - versionIdMarker: data.NextVersionIdMarker, - prevDate: last.LastModified, - }, + const entry = this._getContinuationEntry(bucketData, log, { + keyMarker: data.NextKeyMarker, + versionIdMarker: data.NextVersionIdMarker, + prevDate: last.LastModified, }); this._sendBucketEntry(entry, err => { if (!err) { @@ -499,16 +518,9 @@ class LifecycleTask extends BackbeatTask { if (data.IsTruncated && nbRetries === 0) { // re-queue to kafka with `NextUploadIdMarker` & // `NextKeyMarker` only once. - const entry = Object.assign({}, bucketData, { - contextInfo: { - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }, - details: { - keyMarker: data.NextKeyMarker, - uploadIdMarker: data.NextUploadIdMarker, - }, + const entry = this._getContinuationEntry(bucketData, log, { + keyMarker: data.NextKeyMarker, + uploadIdMarker: data.NextUploadIdMarker, }); return this._sendBucketEntry(entry, err => { if (!err) { @@ -1043,13 +1055,7 @@ class LifecycleTask extends BackbeatTask { rules.Expiration.Date < currentDate) { // expiration date passed for this object const entry = ActionQueueEntry.create('deleteObject') - .addContext({ - origin: 'lifecycle', - ruleType: 'expiration', - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }) + .addContext(this._getActionContext(bucketData, log, 'expiration')) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) .setAttribute('target.accountId', bucketData.target.accountId) @@ -1073,13 +1079,7 @@ class LifecycleTask extends BackbeatTask { if (rules.Expiration.Days !== undefined && daysSinceInitiated >= rules.Expiration.Days) { const entry = ActionQueueEntry.create('deleteObject') - .addContext({ - origin: 'lifecycle', - ruleType: 'expiration', - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }) + .addContext(this._getActionContext(bucketData, log, 'expiration')) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) .setAttribute('target.accountId', bucketData.target.accountId) @@ -1181,12 +1181,7 @@ class LifecycleTask extends BackbeatTask { transitionTime: new Date(params.transitionTime).toISOString(), attempt, }); - entry.addContext({ - origin: 'lifecycle', - ruleType: 'transition', - reqId: log.getSerializedUids(), - conductorScanId: params.conductorScanId, - }); + entry.addContext(params.actionContext); entry.setAttribute('source', { bucket: params.bucket, objectKey: params.objectKey, @@ -1386,8 +1381,7 @@ class LifecycleTask extends BackbeatTask { site: rules[ncvt].StorageClass, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( { Days: rules[ncvt][ncd] }, staleDate), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, + actionContext: this._getActionContext(bucketData, log, 'transition'), }, log, cb); return; } @@ -1460,13 +1454,7 @@ class LifecycleTask extends BackbeatTask { // if a valid Expiration rule exists, apply and permanently delete this DM if (matchingNoncurrentKeys.length === 0 && applicableExpRule) { const entry = ActionQueueEntry.create('deleteObject') - .addContext({ - origin: 'lifecycle', - ruleType: 'expiration', - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }) + .addContext(this._getActionContext(bucketData, log, 'expiration')) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) .setAttribute('target.key', deleteMarker.Key) @@ -1531,13 +1519,7 @@ class LifecycleTask extends BackbeatTask { const storageClass = this._isDeleteMarker(verToExpire) ? LIFECYCLE_MARKER_METRICS_LOCATION : verToExpire.StorageClass; const entry = ActionQueueEntry.create('deleteObject') - .addContext({ - origin: 'lifecycle', - ruleType: 'expiration', - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }) + .addContext(this._getActionContext(bucketData, log, 'expiration')) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) .setAttribute('target.accountId', bucketData.target.accountId) @@ -1624,8 +1606,7 @@ class LifecycleTask extends BackbeatTask { site: rules.Transition.StorageClass, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( rules.Transition, object.LastModified), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, + actionContext: this._getActionContext(bucketData, log, 'transition'), }, log, done); } return done(); @@ -1729,8 +1710,7 @@ class LifecycleTask extends BackbeatTask { encodedVersionId: undefined, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( rules.Transition, version.LastModified), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, + actionContext: this._getActionContext(bucketData, log, 'transition'), }, log, done); } @@ -1776,13 +1756,7 @@ class LifecycleTask extends BackbeatTask { uploadId: upload.UploadId, }); const entry = ActionQueueEntry.create('deleteMPU') - .addContext({ - origin: 'lifecycle', - ruleType: 'expiration', - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }) + .addContext(this._getActionContext(bucketData, log, 'expiration')) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) .setAttribute('target.accountId', bucketData.target.accountId) @@ -1851,11 +1825,7 @@ class LifecycleTask extends BackbeatTask { processBucketEntry(bucketLCRules, bucketData, s3target, backbeatMetadataProxy, nbRetries, done) { const log = this.log.newRequestLogger(); - const conductorScanId = bucketData.contextInfo - && bucketData.contextInfo.conductorScanId; - if (conductorScanId) { - log.addDefaultFields({ conductorScanId }); - } + log.addDefaultFields(this._getScanContext(bucketData)); this.s3target = s3target; this.backbeatMetadataProxy = backbeatMetadataProxy; if (!this.backbeatMetadataProxy) { diff --git a/extensions/lifecycle/tasks/LifecycleTaskV2.js b/extensions/lifecycle/tasks/LifecycleTaskV2.js index 4a4d8b0ac..7eea5e9a6 100644 --- a/extensions/lifecycle/tasks/LifecycleTaskV2.js +++ b/extensions/lifecycle/tasks/LifecycleTaskV2.js @@ -15,6 +15,13 @@ class LifecycleTaskV2 extends LifecycleTask { * NOTE: Among other methods, LifecycleTaskV2 inherits the processBucketEntry method from LifecycleTask. */ + _getBucketEntryContext(bucketData, log) { + return { + requestId: log.getSerializedUids(), + ...this._getScanContext(bucketData), + }; + } + /** * Skips kafka entry * @param {object} bucketData - bucket data from bucketTasksTopic @@ -51,13 +58,11 @@ class LifecycleTaskV2 extends LifecycleTask { storageClass, } = l; - const entry = Object.assign({}, bucketData, { - contextInfo: { - requestId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }, - details: { beforeDate, prefix, listType, storageClass }, + const entry = this._getContinuationEntry(bucketData, log, { + beforeDate, + prefix, + listType, + storageClass, }); this._sendBucketEntry(entry, err => { @@ -118,19 +123,12 @@ class LifecycleTaskV2 extends LifecycleTask { // re-queue truncated listing only once. if (isTruncated && nbRetries === 0) { - const entry = Object.assign({}, bucketData, { - contextInfo: { - requestId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }, - details: { - beforeDate: params.BeforeDate, - prefix: params.Prefix, - storageClass: params.ExcludedDataStoreName, - listType, - ...markerInfo - }, + const entry = this._getContinuationEntry(bucketData, log, { + beforeDate: params.BeforeDate, + prefix: params.Prefix, + storageClass: params.ExcludedDataStoreName, + listType, + ...markerInfo, }); this._sendBucketEntry(entry, err => { @@ -206,19 +204,12 @@ class LifecycleTaskV2 extends LifecycleTask { // re-queue truncated listing only once. if (isTruncated && nbRetries === 0) { - const entry = Object.assign({}, bucketData, { - contextInfo: { - requestId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }, - details: { - beforeDate: params.BeforeDate, - prefix: params.Prefix, - storageClass: params.ExcludedDataStoreName, - listType, - ...markerInfo, - }, + const entry = this._getContinuationEntry(bucketData, log, { + beforeDate: params.BeforeDate, + prefix: params.Prefix, + storageClass: params.ExcludedDataStoreName, + listType, + ...markerInfo, }); this._sendBucketEntry(entry, err => { @@ -362,8 +353,7 @@ class LifecycleTaskV2 extends LifecycleTask { site: rules.Transition.StorageClass, transitionTime: this._lifecycleDateTime.getTransitionTimestamp( rules.Transition, obj.LastModified), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, + actionContext: this._getActionContext(bucketData, log, 'transition'), }, log, cb); } @@ -436,13 +426,7 @@ class LifecycleTaskV2 extends LifecycleTask { if (applicableExpRule) { const entry = ActionQueueEntry.create('deleteObject') - .addContext({ - origin: 'lifecycle', - ruleType: 'expiration', - reqId: log.getSerializedUids(), - conductorScanId: bucketData.contextInfo - && bucketData.contextInfo.conductorScanId, - }) + .addContext(this._getActionContext(bucketData, log, 'expiration')) .setAttribute('target.owner', bucketData.target.owner) .setAttribute('target.bucket', bucketData.target.bucket) .setAttribute('target.key', deleteMarker.Key) diff --git a/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js b/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js index 38895a59c..c816b8f6c 100644 --- a/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js +++ b/extensions/lifecycle/tasks/LifecycleUpdateExpirationTask.js @@ -150,16 +150,16 @@ class LifecycleUpdateExpirationTask extends BackbeatTask { processActionEntry(entry, done) { const startTime = Date.now(); const log = this.logger.newRequestLogger(); - const conductorScanId = entry.getContextAttribute( - 'conductorScanId'); - if (conductorScanId) { - log.addDefaultFields({ conductorScanId }); - } + const conductorScanId = entry.getContextAttribute('conductorScanId'); + const conductorScanStartTimestamp = entry.getContextAttribute( + 'conductorScanStartTimestamp'); + log.addDefaultFields({ conductorScanId, conductorScanStartTimestamp }); entry.addLoggedAttributes({ bucketName: 'target.bucket', objectKey: 'target.key', versionId: 'target.version', conductorScanId: 'contextInfo.conductorScanId', + conductorScanStartTimestamp: 'contextInfo.conductorScanStartTimestamp', }); async.waterfall([ diff --git a/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js b/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js index 74424fe08..ebf7bd56c 100644 --- a/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js +++ b/extensions/lifecycle/tasks/LifecycleUpdateTransitionTask.js @@ -260,16 +260,16 @@ class LifecycleUpdateTransitionTask extends BackbeatTask { */ processActionEntry(entry, done) { const log = this.logger.newRequestLogger(); - const conductorScanId = entry.getContextAttribute( - 'conductorScanId'); - if (conductorScanId) { - log.addDefaultFields({ conductorScanId }); - } + const conductorScanId = entry.getContextAttribute('conductorScanId'); + const conductorScanStartTimestamp = entry.getContextAttribute( + 'conductorScanStartTimestamp'); + log.addDefaultFields({ conductorScanId, conductorScanStartTimestamp }); entry.addLoggedAttributes({ bucketName: 'target.bucket', objectKey: 'target.key', versionId: 'target.version', conductorScanId: 'contextInfo.conductorScanId', + conductorScanStartTimestamp: 'contextInfo.conductorScanStartTimestamp', eTag: 'target.eTag', lastModified: 'target.lastModified', }); diff --git a/tests/functional/lifecycle/LifecycleConductor.spec.js b/tests/functional/lifecycle/LifecycleConductor.spec.js index 68b525254..fc0b22bb2 100644 --- a/tests/functional/lifecycle/LifecycleConductor.spec.js +++ b/tests/functional/lifecycle/LifecycleConductor.spec.js @@ -41,7 +41,11 @@ const expected2Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, + contextInfo: { + reqId: 'test-request-id', + conductorScanId: 'test-scan-id', + conductorScanStartTimestamp: 0, + }, target: { bucket: 'bucket1', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -49,7 +53,11 @@ const expected2Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, + contextInfo: { + reqId: 'test-request-id', + conductorScanId: 'test-scan-id', + conductorScanStartTimestamp: 0, + }, target: { bucket: 'bucket1-2', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -60,7 +68,11 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, + contextInfo: { + reqId: 'test-request-id', + conductorScanId: 'test-scan-id', + conductorScanStartTimestamp: 0, + }, target: { bucket: 'bucket1', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -68,7 +80,11 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, + contextInfo: { + reqId: 'test-request-id', + conductorScanId: 'test-scan-id', + conductorScanStartTimestamp: 0, + }, target: { bucket: 'bucket1-2', owner: 'owner1', taskVersion: version }, details: {}, }, @@ -76,7 +92,11 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, + contextInfo: { + reqId: 'test-request-id', + conductorScanId: 'test-scan-id', + conductorScanStartTimestamp: 0, + }, target: { bucket: 'bucket3', owner: 'owner3', taskVersion: version }, details: {}, }, @@ -84,7 +104,11 @@ const expected4Messages = (version='v2') => ([ { value: { action: 'processObjects', - contextInfo: { reqId: 'test-request-id', conductorScanId: 'test-scan-id' }, + contextInfo: { + reqId: 'test-request-id', + conductorScanId: 'test-scan-id', + conductorScanStartTimestamp: 0, + }, target: { bucket: 'bucket4', owner: 'owner4', taskVersion: version }, details: {}, }, diff --git a/tests/unit/lifecycle/LifecycleConductor.spec.js b/tests/unit/lifecycle/LifecycleConductor.spec.js index f75ebaf9a..47660161b 100644 --- a/tests/unit/lifecycle/LifecycleConductor.spec.js +++ b/tests/unit/lifecycle/LifecycleConductor.spec.js @@ -345,10 +345,12 @@ describe('Lifecycle Conductor', () => { describe('_indexesGetOrCreate', () => { it('should include conductor scan id in task context', () => { conductor._currentScanId = 'scan-id-test'; + conductor._currentScanStartTimestamp = 1700000000000; const taskMessage = conductor._taskToMessage( getTask(true), lifecycleTaskVersions.v2, log); const parsed = JSON.parse(taskMessage.message); assert.strictEqual(parsed.contextInfo.conductorScanId, 'scan-id-test'); + assert.strictEqual(parsed.contextInfo.conductorScanStartTimestamp, 1700000000000); }); it('should return v2 for bucketd bucket sources', done => { diff --git a/tests/unit/lifecycle/LifecycleTask.spec.js b/tests/unit/lifecycle/LifecycleTask.spec.js index 3c8b93d78..10f1b4f2c 100644 --- a/tests/unit/lifecycle/LifecycleTask.spec.js +++ b/tests/unit/lifecycle/LifecycleTask.spec.js @@ -8,6 +8,8 @@ const { ValidLifecycleRules } = require('arsenal').models; const LifecycleTask = require( '../../../extensions/lifecycle/tasks/LifecycleTask'); +const LifecycleTaskV2 = require( + '../../../extensions/lifecycle/tasks/LifecycleTaskV2'); const fakeLogger = require('../../utils/fakeLogger'); const { timeOptions } = require('../../functional/lifecycle/configObjects'); @@ -654,6 +656,95 @@ describe('lifecycle task helper methods', () => { }); }); + describe('LifecycleTaskV2 scan context propagation', () => { + let lifecycleTaskV2; + + const bucketData = { + target: { + owner: 'test-user', + accountId: 'test-account', + bucket: 'test-bucket', + }, + contextInfo: { + conductorScanId: 'scan-A', + conductorScanStartTimestamp: 1700000000000, + }, + }; + + beforeEach(() => { + lifecycleTaskV2 = new LifecycleTaskV2(lp); + }); + + afterEach(() => { + sinon.restore(); + }); + + it('should propagate scan context to current-version transition actions', done => { + sinon.stub(lifecycleTaskV2, '_checkAndApplyExpirationRule').returns(false); + sinon.stub(lifecycleTaskV2, '_isDeleteMarker').returns(false); + sinon.stub(lifecycleTaskV2, '_applyTransitionRule') + .callsFake((params, log, cb) => { + assert.strictEqual(params.actionContext.conductorScanId, 'scan-A'); + assert.strictEqual( + params.actionContext.conductorScanStartTimestamp, 1700000000000); + assert.strictEqual(params.actionContext.ruleType, 'transition'); + cb(); + }); + + lifecycleTaskV2._compareCurrent(bucketData, { + Key: 'test-key', + VersionId: 'test-version', + ETag: '"test-etag"', + LastModified: '2023-01-01T00:00:00.000Z', + }, { + Transition: { + Days: 1, + StorageClass: 'test-site', + }, + }, fakeLogger, done); + }); + + it('should propagate scan context to expired delete-marker actions', () => { + class LifecycleTaskV2Mock extends LifecycleTaskV2 { + _sendObjectAction(entry, cb) { + this.latestEntry = entry; + return cb(); + } + } + lifecycleTaskV2 = new LifecycleTaskV2Mock(lp); + + lifecycleTaskV2._checkAndApplyEODMRule(bucketData, { + Key: 'test-key', + VersionId: 'test-version', + LastModified: new Date(Date.now() - (2 * DAY)).toISOString(), + }, { + Expiration: { Days: 1 }, + }, fakeLogger); + + const context = lifecycleTaskV2.latestEntry.getContext(); + assert.strictEqual(context.conductorScanId, 'scan-A'); + assert.strictEqual(context.conductorScanStartTimestamp, 1700000000000); + }); + + it('should preserve V2 continuation requestId context', () => { + const log = { + ...fakeLogger, + getSerializedUids: () => 'test-request-id', + }; + const entry = lifecycleTaskV2._getContinuationEntry(bucketData, log, { + beforeDate: '2023-01-01T00:00:00.000Z', + prefix: '', + listType: 'current', + }); + + assert.strictEqual(entry.contextInfo.requestId, 'test-request-id'); + assert.strictEqual(entry.contextInfo.reqId, undefined); + assert.strictEqual(entry.contextInfo.conductorScanId, 'scan-A'); + assert.strictEqual( + entry.contextInfo.conductorScanStartTimestamp, 1700000000000); + }); + }); + describe('_rulesHaveTag', () => { it('should return true if rule has a prefix and tag', () => { const rules = [ @@ -2230,6 +2321,11 @@ describe('lifecycle task helper methods', () => { site: 'test-site', accountId: 'test-account-id', transitionTime: Date.now(), + actionContext: { + origin: 'lifecycle', + ruleType: 'transition', + reqId: 'test-request-id', + }, }; before(() => { diff --git a/tests/utils/BackbeatTestConsumer.js b/tests/utils/BackbeatTestConsumer.js index a227f8365..e328bfa09 100644 --- a/tests/utils/BackbeatTestConsumer.js +++ b/tests/utils/BackbeatTestConsumer.js @@ -12,9 +12,12 @@ class BackbeatTestConsumer extends BackbeatConsumer { // hook queue processor function this._queueProcessor = this._processMessage.bind(this); this._expectVars = null; + this._expectedConductorScanId = null; + this._expectedConductorScanStartTimestamp = null; } _processMessage(message, done) { + const consumer = this; function _matchMessage(expectedMsg) { if (expectedMsg.key !== undefined) { assert.deepStrictEqual( @@ -23,28 +26,66 @@ class BackbeatTestConsumer extends BackbeatConsumer { `expected ${expectedMsg.key}`); } if (expectedMsg.value !== undefined) { - const parsedMsg = typeof expectedMsg.value === 'object' ? - JSON.parse(message.value) : - message.value.toString(); - if (typeof expectedMsg.value === 'object' && - expectedMsg.value.contextInfo?.reqId === 'test-request-id') { + const isExpectedObject = typeof expectedMsg.value === 'object'; + const parsedMsg = isExpectedObject ? + JSON.parse(message.value) : + message.value.toString(); + const expectedValue = isExpectedObject ? + JSON.parse(JSON.stringify(expectedMsg.value)) : + expectedMsg.value; + let nextExpectedConductorScanId; + let nextExpectedConductorScanStartTimestamp; + if (isExpectedObject && + expectedValue.contextInfo?.reqId === 'test-request-id') { // RequestId is generated randomly, we can't compare it: just check that it is // present assert(parsedMsg.contextInfo?.reqId, 'expected contextInfo.reqId field'); - parsedMsg.contextInfo.reqId = expectedMsg.value.contextInfo?.reqId; - // conductorScanId is also generated per scan: check it exists, - // then normalize for comparison - if (expectedMsg.value.contextInfo?.conductorScanId === 'test-scan-id') { + expectedValue.contextInfo.reqId = parsedMsg.contextInfo.reqId; + // conductorScanId is generated per scan. Assert the real + // value exists and remains stable across expected messages. + if (expectedValue.contextInfo?.conductorScanId === 'test-scan-id') { assert(parsedMsg.contextInfo?.conductorScanId, 'expected contextInfo.conductorScanId field'); - parsedMsg.contextInfo.conductorScanId = - expectedMsg.value.contextInfo.conductorScanId; + nextExpectedConductorScanId = parsedMsg.contextInfo.conductorScanId; + if (consumer._expectedConductorScanId !== null) { + assert.strictEqual( + parsedMsg.contextInfo.conductorScanId, + consumer._expectedConductorScanId, + 'expected stable contextInfo.conductorScanId field'); + } + expectedValue.contextInfo.conductorScanId = + parsedMsg.contextInfo.conductorScanId; + } + if (expectedValue.contextInfo?.conductorScanStartTimestamp === 0) { + assert.strictEqual(typeof parsedMsg.contextInfo?.conductorScanStartTimestamp, 'number', + 'expected contextInfo.conductorScanStartTimestamp field'); + assert(parsedMsg.contextInfo.conductorScanStartTimestamp > 0, + 'expected positive contextInfo.conductorScanStartTimestamp field'); + nextExpectedConductorScanStartTimestamp = + parsedMsg.contextInfo.conductorScanStartTimestamp; + if (consumer._expectedConductorScanStartTimestamp !== null) { + assert.strictEqual( + parsedMsg.contextInfo.conductorScanStartTimestamp, + consumer._expectedConductorScanStartTimestamp, + 'expected stable contextInfo.conductorScanStartTimestamp field'); + } + expectedValue.contextInfo.conductorScanStartTimestamp = + parsedMsg.contextInfo.conductorScanStartTimestamp; } } assert.deepStrictEqual( - parsedMsg, expectedMsg.value, + parsedMsg, expectedValue, `unexpected message value ${parsedMsg}, ` + - `expected ${expectedMsg.value}`); + `expected ${expectedValue}`); + if (consumer._expectedConductorScanId === null && + nextExpectedConductorScanId !== undefined) { + consumer._expectedConductorScanId = nextExpectedConductorScanId; + } + if (consumer._expectedConductorScanStartTimestamp === null && + nextExpectedConductorScanStartTimestamp !== undefined) { + consumer._expectedConductorScanStartTimestamp = + nextExpectedConductorScanStartTimestamp; + } } } @@ -87,6 +128,8 @@ class BackbeatTestConsumer extends BackbeatConsumer { } _expectMessages(messages, timeout, ordered, cb) { + this._expectedConductorScanId = null; + this._expectedConductorScanStartTimestamp = null; const cbOnce = jsutil.once(err => { this._expectVars = null; return cb(err); diff --git a/tests/utils/fakeLogger.js b/tests/utils/fakeLogger.js index 2e241d2a9..7fd57e2d6 100644 --- a/tests/utils/fakeLogger.js +++ b/tests/utils/fakeLogger.js @@ -5,6 +5,7 @@ const fakeLogger = { debug: console.debug, // eslint-disable-line no-console warn: console.warn, // eslint-disable-line no-console getSerializedUids: () => {}, + addDefaultFields: () => {}, end: () => fakeLogger, newRequestLogger: () => fakeLogger, }; From 8a19b7143a60ce5dcf3badaa1092f17552fe34e3 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Thu, 21 May 2026 16:35:08 +0200 Subject: [PATCH 07/10] Track lifecycle bucket task pickups. Record bucket-task message pickup by conductor scan id before scheduler dispatch, observe message age, and make per-scan metric retention configurable. Issue: BB-740 --- conf/config.json | 1 + .../lifecycle/LifecycleConfigValidator.js | 4 + extensions/lifecycle/LifecycleMetrics.js | 170 ++++++++++++------ .../LifecycleBucketProcessor.js | 54 +++--- .../LifecycleBucketProcessor.spec.js | 39 ++++ tests/unit/lifecycle/LifecycleMetrics.spec.js | 117 ++++++++---- 6 files changed, 276 insertions(+), 109 deletions(-) diff --git a/conf/config.json b/conf/config.json index 46c590d60..a7618aa01 100644 --- a/conf/config.json +++ b/conf/config.json @@ -246,6 +246,7 @@ } }, "concurrency": 10, + "scanMetricRetentionMs": 86400000, "probeServer": { "bindAddress": "0.0.0.0", "port": 8553 diff --git a/extensions/lifecycle/LifecycleConfigValidator.js b/extensions/lifecycle/LifecycleConfigValidator.js index b6487862e..cfd034cdd 100644 --- a/extensions/lifecycle/LifecycleConfigValidator.js +++ b/extensions/lifecycle/LifecycleConfigValidator.js @@ -11,6 +11,8 @@ const { const { backbeatConsumer: { MAX_QUEUED_DEFAULT } } = require('../../lib/constants'); const { ValidLifecycleRules: supportedLifecycleRules } = require('arsenal').models; +const DEFAULT_SCAN_METRIC_RETENTION_MS = 24 * 60 * 60 * 1000; + const joiSchema = joi.object({ zookeeperPath: joi.string().required(), bucketTasksTopic: joi.string().required(), @@ -52,6 +54,8 @@ const joiSchema = joi.object({ // the processing, no need to add more here to avoid // overloading the system concurrency: joi.number().greater(0).default(1), + scanMetricRetentionMs: joi.number().integer().positive() + .default(DEFAULT_SCAN_METRIC_RETENTION_MS), probeServer: probeServerJoi.default(), circuitBreaker: joi.object().optional(), }, diff --git a/extensions/lifecycle/LifecycleMetrics.js b/extensions/lifecycle/LifecycleMetrics.js index ac793b816..b820bfbc3 100644 --- a/extensions/lifecycle/LifecycleMetrics.js +++ b/extensions/lifecycle/LifecycleMetrics.js @@ -1,6 +1,6 @@ const { ZenkoMetrics } = require('arsenal').metrics; -const LIFECYCLE_LABEL_ORIGIN = 'origin'; +const LIFECYCLE_LABEL_ORIGIN = 'origin'; const LIFECYCLE_LABEL_OP = 'op'; const LIFECYCLE_LABEL_STATUS = 'status'; const LIFECYCLE_LABEL_LOCATION = 'location'; @@ -10,9 +10,12 @@ const LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID = 'conductor_scan_id'; const LIFECYCLE_MARKER_METRICS_LOCATION = '-delete-marker-'; // Keep per-scan series long enough for scraping and debugging recent overlap, -// but remove them from prom-client eventually to avoid unbounded process memory. -const BUCKET_PROCESSOR_SCAN_METRIC_RETENTION_MS = 24 * 60 * 60 * 1000; +// but remove them from prom-client after a configurable retention interval. +// Prometheus retains scraped scan-id series until TSDB retention expires. +const DEFAULT_SCAN_METRIC_RETENTION_MS = 24 * 60 * 60 * 1000; +const CONDUCTOR_ORIGIN = 'conductor'; const BUCKET_PROCESSOR_ORIGIN = 'bucket_processor'; +let scanMetricRetentionMs = DEFAULT_SCAN_METRIC_RETENTION_MS; // Conductor scheduling heartbeat: timestamp (ms since epoch) of the // instant the conductor most recently *started* a scan. Use this to @@ -99,38 +102,92 @@ const conductorLatestBatchBucketCount = ZenkoMetrics.createGauge({ const bucketProcessorScanMessagesProcessed = ZenkoMetrics.createCounter({ name: 's3_lifecycle_bucket_processor_scan_messages_processed_total', - help: 'Total number of bucket-tasks topic messages successfully processed ' + - 'by this bucket processor, grouped by conductor scan id. Each message ' + + help: 'Total number of bucket-tasks topic messages picked up by this ' + + 'bucket processor, grouped by conductor scan id. Each message ' + 'corresponds to a single listing slice (initial or continuation), not ' + 'a unique bucket: a bucket with multiple listings (truncated v1, or ' + 'current/noncurrent/orphan splits in v2) increments this counter once ' + - 'per slice. Multiple conductor_scan_id label values appearing at the ' + - 'same time indicate overlapping scans. Scan-id series are retained ' + - 'locally for 24 hours, so cardinality scales with scan frequency and ' + - 'bucket processor pod count.', + 'per slice. Multiple conductor_scan_id label values over the same ' + + 'query window indicate that bucket processors recently handled work ' + + 'from different scans. Normal operation is expected to expose one ' + + 'scan id at a time; scan-id series are removed locally after the ' + + 'configured bucket processor retention interval without update to ' + + 'avoid unbounded process memory growth. ' + + 'Prometheus retains scraped scan-id series until TSDB retention.', labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID], }); -const bucketProcessorScanMetricLastUpdated = new Map(); - -function removeStaleBucketProcessorScanMetrics(log, now) { - bucketProcessorScanMetricLastUpdated.forEach((lastUpdated, conductorScanId) => { - if (now - lastUpdated > BUCKET_PROCESSOR_SCAN_METRIC_RETENTION_MS) { - try { - bucketProcessorScanMessagesProcessed.remove({ - [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN, - [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId, - }); - bucketProcessorScanMetricLastUpdated.delete(conductorScanId); - } catch (err) { - if (log) { - log.error('failed to remove stale bucket processor scan metric', { - error: err.toString(), - conductorScanId, - }); - } - } - } - }); + +const bucketProcessorScanMessageAgeSeconds = ZenkoMetrics.createHistogram({ + name: 's3_lifecycle_bucket_processor_scan_message_age_seconds', + help: 'Age in seconds of bucket-tasks topic messages when they finish ' + + 'processing in the bucket processor, measured from the conductor scan ' + + 'start timestamp propagated in the message context.', + labelNames: [LIFECYCLE_LABEL_ORIGIN], + buckets: [60, 300, 600, 1800, 3600, 7200, 14400, 28800, 43200, 86400], +}); + +const scanMetricTimers = new Map(); + +function removeBucketProcessorScanMetrics(conductorScanId) { + try { + bucketProcessorScanMessagesProcessed.remove({ + [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN, + [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId, + }); + } catch { + // Best-effort cleanup: metrics are observational only. + } +} + +function setScanMetricTimeout(conductorScanId) { + const previousTimer = scanMetricTimers.get(conductorScanId); + if (previousTimer) { + clearTimeout(previousTimer); + } + + const cleanupTimer = setTimeout(() => { + removeBucketProcessorScanMetrics(conductorScanId); + scanMetricTimers.delete(conductorScanId); + }, scanMetricRetentionMs); + if (typeof cleanupTimer.unref === 'function') { + cleanupTimer.unref(); + } + scanMetricTimers.set(conductorScanId, cleanupTimer); +} + +function observeBucketProcessorScanMessageAge(conductorScanStartTimestamp) { + // Messages produced before this field existed can still be consumed during + // rolling upgrades, so skip invalid timestamps instead of logging noise. + if (typeof conductorScanStartTimestamp !== 'number' || + !Number.isFinite(conductorScanStartTimestamp) || + conductorScanStartTimestamp <= 0) { + return; + } + + const ageSeconds = (Date.now() - conductorScanStartTimestamp) / 1000; + if (ageSeconds >= 0) { + bucketProcessorScanMessageAgeSeconds.observe({ + [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN, + }, ageSeconds); + } +} + +function clearScanMetricTimers() { + scanMetricTimers.forEach(timer => clearTimeout(timer)); + scanMetricTimers.clear(); +} + +function resetLifecycleScanMetricCleanupTimers() { + clearScanMetricTimers(); + scanMetricRetentionMs = DEFAULT_SCAN_METRIC_RETENTION_MS; +} + +function configureLifecycleScanMetricRetention(retentionMs) { + if (typeof retentionMs === 'number' && + Number.isFinite(retentionMs) && + retentionMs > 0) { + scanMetricRetentionMs = retentionMs; + } } const lifecycleS3Operations = ZenkoMetrics.createCounter({ @@ -205,12 +262,17 @@ class LifecycleMetrics { * onConductorScanComplete's durationSeconds for that. * * @param {Object} log - logger + * @param {number} scanStartTimestamp - scan start timestamp in ms */ - static onProcessBuckets(log) { + static onProcessBuckets(log, scanStartTimestamp = Date.now()) { try { - conductorLatestBatchStartTime.set({ origin: 'conductor' }, Date.now()); + conductorLatestBatchStartTime.set( + { [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN }, + scanStartTimestamp); } catch (err) { - LifecycleMetrics.handleError(log, err, 'LifecycleMetrics.onProcessBuckets'); + LifecycleMetrics.handleError(log, err, 'LifecycleMetrics.onProcessBuckets', { + scanStartTimestamp, + }); } } @@ -276,17 +338,18 @@ class LifecycleMetrics { */ static onConductorScanComplete(log, bucketCount, durationSeconds) { try { - conductorLatestBatchEndTime.set( - { [LIFECYCLE_LABEL_ORIGIN]: 'conductor' }, - Date.now()); + const endTimestamp = Date.now(); + conductorLatestBatchEndTime.set({ + [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN, + }, endTimestamp); conductorLatestBatchBucketCount.set({ - [LIFECYCLE_LABEL_ORIGIN]: 'conductor', + [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN, }, bucketCount); if (typeof durationSeconds === 'number' && Number.isFinite(durationSeconds) && durationSeconds >= 0) { conductorLastBatchDurationSeconds.set({ - [LIFECYCLE_LABEL_ORIGIN]: 'conductor', + [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN, }, durationSeconds); } } catch (err) { @@ -300,38 +363,39 @@ class LifecycleMetrics { } /** - * Increment the count of bucket-tasks topic messages successfully - * processed by this bucket processor for a specific conductor scan. - * Called after the bucket task completes successfully (not when it is - * dispatched to the scheduler), once per Kafka message regardless of how - * many objects it covers. + * Increment the count of bucket-tasks topic messages picked up by this + * bucket processor for a specific conductor scan. Called before the task + * is dispatched to the scheduler, once per Kafka message regardless of how + * many objects it covers or whether processing eventually succeeds. * * Note: this counts messages (initial + continuation/listing slices), * not unique buckets. Keep one time series per conductor_scan_id so that - * overlapping scans remain visible. Old scan series are removed - * opportunistically after BUCKET_PROCESSOR_SCAN_METRIC_RETENTION_MS to - * avoid unbounded prom-client memory growth. + * overlapping scans remain visible. Old scan series are removed by a + * timer after the configured scanMetricRetentionMs interval without + * update to avoid unbounded prom-client memory growth. * * @param {Object} log - logger * @param {string} conductorScanId - conductor scan id from contextInfo + * @param {number} [conductorScanStartTimestamp] - conductor scan start + * timestamp from contextInfo */ - static onBucketProcessorScanMessageProcessed(log, conductorScanId) { + static onBucketProcessorScanMessageReceived( + log, conductorScanId, conductorScanStartTimestamp) { if (!conductorScanId) { return; } try { - const now = Date.now(); - bucketProcessorScanMetricLastUpdated.set(conductorScanId, now); bucketProcessorScanMessagesProcessed.inc({ [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN, [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId, }); - removeStaleBucketProcessorScanMetrics(log, now); + observeBucketProcessorScanMessageAge(conductorScanStartTimestamp); + setScanMetricTimeout(conductorScanId); } catch (err) { LifecycleMetrics.handleError( log, err, - 'LifecycleMetrics.onBucketProcessorScanMessageProcessed', - { conductorScanId } + 'LifecycleMetrics.onBucketProcessorScanMessageReceived', + { conductorScanId, conductorScanStartTimestamp } ); } } @@ -413,4 +477,6 @@ class LifecycleMetrics { module.exports = { LifecycleMetrics, LIFECYCLE_MARKER_METRICS_LOCATION, + configureLifecycleScanMetricRetention, + resetLifecycleScanMetricCleanupTimers, }; diff --git a/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js b/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js index c1e2e7191..1d3412b4d 100644 --- a/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js +++ b/extensions/lifecycle/bucketProcessor/LifecycleBucketProcessor.js @@ -26,7 +26,10 @@ const { extractBucketProcessorCircuitBreakerConfigs, } = require('../CircuitBreakerGroup'); const { lifecycleTaskVersions } = require('../../../lib/constants'); -const { LifecycleMetrics } = require('../LifecycleMetrics'); +const { + configureLifecycleScanMetricRetention, + LifecycleMetrics, +} = require('../LifecycleMetrics'); const locations = require('../../../conf/locationConfig.json'); const PROCESS_OBJECTS_ACTION = 'processObjects'; @@ -67,6 +70,8 @@ class LifecycleBucketProcessor { * @param {Object} repConfig.source - replication source * @param {Number} [lcConfig.bucketProcessor.concurrency] - number * of max allowed concurrent operations + * @param {Number} [lcConfig.bucketProcessor.scanMetricRetentionMs] - + * per-scan metric retention in milliseconds * @param {Object} s3Config - s3 config * @param {String} s3Config.host - host ip * @param {String} s3Config.port - port @@ -106,6 +111,8 @@ class LifecycleBucketProcessor { }, this._log); this.retryWrapper = new BackbeatTask(lcConfig.bucketProcessor.retry); + configureLifecycleScanMetricRetention( + lcConfig.bucketProcessor.scanMetricRetentionMs); // helper object to facilitate the tracking of the the latest x // noncurrent versions of an object when the field @@ -278,25 +285,32 @@ class LifecycleBucketProcessor { return process.nextTick(() => cb(errors.InternalError)); } const { bucket, owner, accountId, taskVersion } = result.target; - const conductorScanId = result.contextInfo && result.contextInfo.conductorScanId; + const conductorScanId = result.contextInfo?.conductorScanId; + const conductorScanStartTimestamp = result.contextInfo?.conductorScanStartTimestamp; + const scanLogInfo = { + conductorScanId, + conductorScanStartTimestamp, + }; + const log = this._log.newRequestLogger(); + log.addDefaultFields(scanLogInfo); if (!bucket || !owner || (!accountId && this._authConfig.type === authTypeAssumeRole)) { - this._log.error('kafka bucket entry missing required fields', { + log.error('kafka bucket entry missing required fields', { method: 'LifecycleBucketProcessor._processBucketEntry', bucket, owner, accountId, - conductorScanId, }); return process.nextTick(() => cb(errors.InternalError)); } - this._log.debug('processing bucket entry', { + log.debug('processing bucket entry', { method: 'LifecycleBucketProcessor._processBucketEntry', bucket, owner, accountId, - conductorScanId, }); + LifecycleMetrics.onBucketProcessorScanMessageReceived( + log, conductorScanId, conductorScanStartTimestamp); const s3Client = this.clientManager.getS3Client(accountId); if (!s3Client) { @@ -312,19 +326,23 @@ class LifecycleBucketProcessor { } const params = { Bucket: bucket }; - return this._getBucketLifecycleConfiguration(s3Client, params, (err, config) => { + return this._getBucketLifecycleConfiguration(s3Client, params, log, (err, config) => { if (err) { if (err.name === 'NoSuchLifecycleConfiguration') { - this._log.debug('skipping non-lifecycled bucket', { bucket }); + log.debug('skipping non-lifecycled bucket', { + bucket, + }); return cb(); } if (err.name === 'NoSuchBucket') { - this._log.error('skipping non-existent bucket', { bucket }); + log.error('skipping non-existent bucket', { + bucket, + }); return cb(); } - this._log.error('error getting bucket lifecycle config', { + log.error('error getting bucket lifecycle config', { method: 'LifecycleBucketProcessor._processBucketEntry', bucket, owner, @@ -344,13 +362,12 @@ class LifecycleBucketProcessor { task = new LifecycleTaskV2(this); } - this._log.info('scheduling new task for bucket lifecycle', { + log.info('scheduling new task for bucket lifecycle', { method: 'LifecycleBucketProcessor._processBucketEntry', bucket, owner, details: result.details, taskName: task.constructor.name, - conductorScanId, }); return this._internalTaskScheduler.push({ task, @@ -358,13 +375,7 @@ class LifecycleBucketProcessor { value: result, s3target: s3Client, backbeatMetadataProxy, - }, err => { - if (!err) { - LifecycleMetrics.onBucketProcessorScanMessageProcessed( - this._log, conductorScanId); - } - return cb(err); - }); + }, cb); }); } @@ -372,10 +383,11 @@ class LifecycleBucketProcessor { * Call AWS.S3.GetBucketLifecycleConfiguration in a retry wrapper. * @param {S3Client} s3Client - the s3 client * @param {object} params - the parameters to pass to getBucketLifecycleConfiguration + * @param {Logger} log - The request logger to use * @param {Function} cb - The callback to call * @return {undefined} */ - _getBucketLifecycleConfiguration(s3Client, params, cb) { + _getBucketLifecycleConfiguration(s3Client, params, log, cb) { return this.retryWrapper.retry({ actionDesc: 'get bucket lifecycle', logFields: { params }, @@ -386,7 +398,7 @@ class LifecycleBucketProcessor { .catch(done); }, shouldRetryFunc: err => err.retryable, - log: this._log, + log, }, cb); } diff --git a/tests/unit/lifecycle/LifecycleBucketProcessor.spec.js b/tests/unit/lifecycle/LifecycleBucketProcessor.spec.js index ef7c88beb..f892695a1 100644 --- a/tests/unit/lifecycle/LifecycleBucketProcessor.spec.js +++ b/tests/unit/lifecycle/LifecycleBucketProcessor.spec.js @@ -28,6 +28,7 @@ const { } = require('../../utils/kafkaEntries'); const LifecycleTask = require('../../../extensions/lifecycle/tasks/LifecycleTask'); const LifecycleTaskV2 = require('../../../extensions/lifecycle/tasks/LifecycleTaskV2'); +const { LifecycleMetrics } = require('../../../extensions/lifecycle/LifecycleMetrics'); describe('Lifecycle Bucket Processor', () => { @@ -262,6 +263,44 @@ describe('Lifecycle Bucket Processor', () => { }); }); }); + + it('should pass conductor scan start timestamp to lifecycle metrics', done => { + const kafkaMessage = { + ...bucketProcessorV2Entry, + value: JSON.stringify({ + ...JSON.parse(bucketProcessorV2Entry.value), + contextInfo: { + conductorScanId: 'scan-A', + conductorScanStartTimestamp: 1700000000000, + }, + }), + }; + const metricStub = sinon.stub( + LifecycleMetrics, 'onBucketProcessorScanMessageReceived'); + lbp.clientManager = { + getS3Client: () => ({}), + getBackbeatMetadataProxy: () => ({}), + }; + sinon.stub(lbp, '_getBucketLifecycleConfiguration').yields(null, { + Rules: [{ + Status: 'Enabled', + Transitions: [{ + Days: 10, + StorageClass: 'azure' + }], + ID: 'dac36d89-0005-4c78-8e00-7e9ace06a9c4' + }] + }); + lbp._internalTaskScheduler = async.queue((ctx, cb) => cb(), 1); + + lbp._processBucketEntry(kafkaMessage, err => { + assert.ifError(err); + assert(metricStub.calledOnce); + assert.strictEqual(metricStub.firstCall.args[1], 'scan-A'); + assert.strictEqual(metricStub.firstCall.args[2], 1700000000000); + done(); + }); + }); }); describe('start', () => { diff --git a/tests/unit/lifecycle/LifecycleMetrics.spec.js b/tests/unit/lifecycle/LifecycleMetrics.spec.js index 910709d8b..d73b942a7 100644 --- a/tests/unit/lifecycle/LifecycleMetrics.spec.js +++ b/tests/unit/lifecycle/LifecycleMetrics.spec.js @@ -1,6 +1,10 @@ const assert = require('assert'); const sinon = require('sinon'); -const { LifecycleMetrics } = require('../../../extensions/lifecycle/LifecycleMetrics'); +const { + configureLifecycleScanMetricRetention, + LifecycleMetrics, + resetLifecycleScanMetricCleanupTimers, +} = require('../../../extensions/lifecycle/LifecycleMetrics'); const { ZenkoMetrics } = require('arsenal').metrics; describe('LifecycleMetrics', () => { @@ -13,6 +17,7 @@ describe('LifecycleMetrics', () => { }); afterEach(() => { + resetLifecycleScanMetricCleanupTimers(); sinon.restore(); }); @@ -97,7 +102,7 @@ describe('LifecycleMetrics', () => { 's3_lifecycle_bucket_processor_scan_messages_processed_total'); const messagesInc = sinon.stub(messagesMetric, 'inc'); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, 'scan-A'); assert(messagesInc.calledOnce); assert(messagesInc.calledWithMatch({ origin: 'bucket_processor', @@ -107,16 +112,33 @@ describe('LifecycleMetrics', () => { assert(log.error.notCalled); }); - it('should catch errors in onBucketProcessorScanMessageProcessed', () => { + it('should observe bucket processor scan message age from conductor start timestamp', () => { + const messageAgeMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_message_age_seconds'); + const observeStub = sinon.stub(messageAgeMetric, 'observe'); + sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'), 'inc'); + sinon.useFakeTimers(1700000010000); + + LifecycleMetrics.onBucketProcessorScanMessageReceived( + log, 'scan-A', 1700000000000); + + assert(observeStub.calledOnce); + assert(observeStub.calledWithMatch( + { origin: 'bucket_processor' }, 10)); + assert(log.error.notCalled); + }); + + it('should catch errors in onBucketProcessorScanMessageReceived', () => { const messagesMetric = ZenkoMetrics.getMetric( 's3_lifecycle_bucket_processor_scan_messages_processed_total'); sinon.stub(messagesMetric, 'inc').throws(new Error('Metric error')); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, 'scan-A'); assert(log.error.calledOnce); assert(log.error.calledWithMatch('failed to update prometheus metrics', { - method: 'LifecycleMetrics.onBucketProcessorScanMessageProcessed', + method: 'LifecycleMetrics.onBucketProcessorScanMessageReceived', conductorScanId: 'scan-A', })); }); @@ -126,25 +148,22 @@ describe('LifecycleMetrics', () => { 's3_lifecycle_bucket_processor_scan_messages_processed_total'); const incStub = sinon.stub(messagesMetric, 'inc'); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, undefined); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, ''); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, undefined); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, ''); assert(incStub.notCalled); assert(log.error.notCalled); }); - it('should remove stale bucket processor scan series', () => { + it('should remove stale bucket processor scan series after retention', () => { const messagesMetric = ZenkoMetrics.getMetric( 's3_lifecycle_bucket_processor_scan_messages_processed_total'); const removeStub = sinon.stub(messagesMetric, 'remove'); sinon.stub(messagesMetric, 'inc'); - const now = 1700000000000; - sinon.stub(Date, 'now') - .onFirstCall().returns(now) - .onSecondCall().returns(now + 24 * 60 * 60 * 1000 + 1); + const clock = sinon.useFakeTimers(1700000000000); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-B'); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, 'scan-A'); + clock.tick(24 * 60 * 60 * 1000 + 1); assert(removeStub.calledOnce); assert(removeStub.calledWithMatch({ @@ -154,33 +173,48 @@ describe('LifecycleMetrics', () => { assert(log.error.notCalled); }); - it('should continue stale series cleanup when one removal fails', () => { + it('should reset bucket processor scan cleanup timer on update', () => { const messagesMetric = ZenkoMetrics.getMetric( 's3_lifecycle_bucket_processor_scan_messages_processed_total'); - const removeStub = sinon.stub(messagesMetric, 'remove') - .onFirstCall().throws(new Error('Metric remove error')); - const incStub = sinon.stub(messagesMetric, 'inc'); - const now = 1700000000000; - sinon.stub(Date, 'now') - .onFirstCall().returns(now) - .onSecondCall().returns(now + 1) - .onThirdCall().returns(now + 24 * 60 * 60 * 1000 + 2); - - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-A'); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-B'); - LifecycleMetrics.onBucketProcessorScanMessageProcessed(log, 'scan-C'); - - assert(incStub.calledThrice); - assert(incStub.thirdCall.calledWithMatch({ + const removeStub = sinon.stub(messagesMetric, 'remove'); + sinon.stub(messagesMetric, 'inc'); + const clock = sinon.useFakeTimers(1700000000000); + + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, 'scan-A'); + clock.tick(12 * 60 * 60 * 1000); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, 'scan-A'); + clock.tick(12 * 60 * 60 * 1000 + 1); + + assert(removeStub.notCalled); + clock.tick(12 * 60 * 60 * 1000); + + assert(removeStub.calledOnce); + assert(removeStub.calledWithMatch({ origin: 'bucket_processor', - ['conductor_scan_id']: 'scan-C', + ['conductor_scan_id']: 'scan-A', })); - assert(removeStub.callCount >= 2); - assert(log.error.calledOnce); - assert(log.error.calledWithMatch('failed to remove stale bucket processor scan metric')); - assert(log.error.neverCalledWithMatch('failed to update prometheus metrics', { - method: 'LifecycleMetrics.onBucketProcessorScanMessageProcessed', + assert(log.error.notCalled); + }); + + it('should use configured bucket processor scan cleanup retention', () => { + const messagesMetric = ZenkoMetrics.getMetric( + 's3_lifecycle_bucket_processor_scan_messages_processed_total'); + const removeStub = sinon.stub(messagesMetric, 'remove'); + sinon.stub(messagesMetric, 'inc'); + const clock = sinon.useFakeTimers(1700000000000); + + configureLifecycleScanMetricRetention(1000); + LifecycleMetrics.onBucketProcessorScanMessageReceived(log, 'scan-A'); + clock.tick(999); + assert(removeStub.notCalled); + clock.tick(1); + + assert(removeStub.calledOnce); + assert(removeStub.calledWithMatch({ + origin: 'bucket_processor', + ['conductor_scan_id']: 'scan-A', })); + assert(log.error.notCalled); }); it('should catch errors in onLifecycleTriggered', () => { @@ -293,6 +327,17 @@ describe('LifecycleMetrics', () => { assert(log.error.notCalled); }); + it('should set latest batch start time', () => { + const latestStartSet = sinon.stub(ZenkoMetrics.getMetric( + 's3_lifecycle_latest_batch_start_time'), 'set'); + + LifecycleMetrics.onProcessBuckets(log, 1700000000000); + + assert(latestStartSet.calledWithMatch( + { origin: 'conductor' }, 1700000000000)); + assert(log.error.notCalled); + }); + it('should set conductor last batch duration when provided', () => { sinon.stub(ZenkoMetrics.getMetric( 's3_lifecycle_latest_batch_end_time'), 'set'); From d7000eb51312457e5d47f050ca47bde023adfa12 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Thu, 21 May 2026 16:35:08 +0200 Subject: [PATCH 08/10] Refine lifecycle scan alerts. Separate stalled-scan thresholds from lifecycle latency and alert only when bucket processors handle multiple scan ids in parallel. Issue: BB-740 --- .github/workflows/alerts.yaml | 2 + monitoring/lifecycle/alerts.test.yaml | 43 ++++++++++++++-- monitoring/lifecycle/alerts.yaml | 73 ++++++++++++++++++++------- 3 files changed, 95 insertions(+), 23 deletions(-) diff --git a/.github/workflows/alerts.yaml b/.github/workflows/alerts.yaml index f4e17f10d..4ebbb7445 100644 --- a/.github/workflows/alerts.yaml +++ b/.github/workflows/alerts.yaml @@ -32,6 +32,8 @@ jobs: lifecycle_transition_replicas=3 lifecycle_latency_warning_threshold=120 lifecycle_latency_critical_threshold=180 + lifecycle_conductor_scan_warning_threshold=120 + lifecycle_conductor_scan_critical_threshold=180 github_token: ${{ secrets.GIT_ACCESS_TOKEN }} - name: Render and test replication diff --git a/monitoring/lifecycle/alerts.test.yaml b/monitoring/lifecycle/alerts.test.yaml index 48e841c21..06247e5a6 100644 --- a/monitoring/lifecycle/alerts.test.yaml +++ b/monitoring/lifecycle/alerts.test.yaml @@ -466,7 +466,7 @@ tests: eval_time: 14m exp_alerts: [] - - name: LifecycleStuckScan + - name: LifecycleStalledScan interval: 1m input_series: - series: s3_lifecycle_latest_batch_start_time{namespace="zenko",job="artesca-data-backbeat-lifecycle-producer-headless",pod="foo"} @@ -474,37 +474,70 @@ tests: - series: s3_lifecycle_latest_batch_end_time{namespace="zenko",job="artesca-data-backbeat-lifecycle-producer-headless",pod="foo"} values: 0+0x5 360000 alert_rule_test: - - alertname: LifecycleStuckScan + - alertname: LifecycleStalledScan eval_time: 2m exp_alerts: [] - - alertname: LifecycleStuckScan + - alertname: LifecycleStalledScan eval_time: 4m exp_alerts: - exp_labels: + namespace: zenko + job: artesca-data-backbeat-lifecycle-producer-headless + pod: foo severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer description: "Last lifecycle scan has not completed more than 2m 0s after it was scheduled." summary: "Lifecycle scan did not complete in time" - - alertname: LifecycleStuckScan + - alertname: LifecycleStalledScan eval_time: 5m exp_alerts: - exp_labels: + namespace: zenko + job: artesca-data-backbeat-lifecycle-producer-headless + pod: foo severity: warning exp_annotations: zenko_service: backbeat-lifecycle-producer description: "Last lifecycle scan has not completed more than 2m 0s after it was scheduled." summary: "Lifecycle scan did not complete in time" - exp_labels: + namespace: zenko + job: artesca-data-backbeat-lifecycle-producer-headless + pod: foo severity: critical exp_annotations: zenko_service: backbeat-lifecycle-producer description: "Last lifecycle scan has not completed more than 3m 0s after it was scheduled." summary: "Lifecycle scan did not complete in time" - - alertname: LifecycleStuckScan + - alertname: LifecycleStalledScan eval_time: 6m exp_alerts: [] + - name: LifecycleBucketProcessorMultipleParallelScans + interval: 1m + input_series: + - series: s3_lifecycle_bucket_processor_scan_messages_processed_total{namespace="zenko",job="artesca-data-backbeat-lifecycle-bucket-processor-headless",conductor_scan_id="scan-a"} + values: 0+10x20 + - series: s3_lifecycle_bucket_processor_scan_messages_processed_total{namespace="zenko",job="artesca-data-backbeat-lifecycle-bucket-processor-headless",conductor_scan_id="scan-b"} + values: _ _ _ 0+10x17 + alert_rule_test: + - alertname: LifecycleBucketProcessorMultipleParallelScans + eval_time: 2m + exp_alerts: [] + - alertname: LifecycleBucketProcessorMultipleParallelScans + eval_time: 10m + exp_alerts: [] + - alertname: LifecycleBucketProcessorMultipleParallelScans + eval_time: 15m + exp_alerts: + - exp_labels: + severity: warning + exp_annotations: + zenko_service: backbeat-lifecycle-bucket-processor + description: "Bucket processors have processed bucket-task messages from multiple lifecycle conductor scans in parallel for more than 10 minutes." + summary: "Lifecycle bucket processors are processing multiple scans" + - name: LifecycleLatency interval: 10m input_series: diff --git a/monitoring/lifecycle/alerts.yaml b/monitoring/lifecycle/alerts.yaml index 114df6747..8371b8f31 100644 --- a/monitoring/lifecycle/alerts.yaml +++ b/monitoring/lifecycle/alerts.yaml @@ -25,6 +25,12 @@ x-inputs: - name: lifecycle_latency_critical_threshold type: config value: 36*60*60 # 36h, in seconds +- name: lifecycle_conductor_scan_warning_threshold + type: config + value: 10*60 # 10m, in seconds +- name: lifecycle_conductor_scan_critical_threshold + type: config + value: 30*60 # 30m, in seconds groups: - name: LifecycleProducer @@ -82,60 +88,72 @@ groups: {{ ${lifecycle_latency_critical_threshold} | humanizeDuration }} ago. summary: "Lifecycle scan not scheduled in time" - - alert: LifecycleStuckScan + - alert: LifecycleStalledScan Expr: | ( - max(s3_lifecycle_latest_batch_start_time{ + max by(namespace, job, pod) (s3_lifecycle_latest_batch_start_time{ namespace="${namespace}", job="${job_lifecycle_producer}" }) > - max(s3_lifecycle_latest_batch_end_time{ - namespace="${namespace}", job="${job_lifecycle_producer}" - } or vector(0)) + max by(namespace, job, pod) ( + s3_lifecycle_latest_batch_end_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + } + or + s3_lifecycle_latest_batch_start_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + } * 0 + ) ) - and + and on(namespace, job, pod) ( time() - ( - max(s3_lifecycle_latest_batch_start_time{ + max by(namespace, job, pod) (s3_lifecycle_latest_batch_start_time{ namespace="${namespace}", job="${job_lifecycle_producer}" }) / 1000 ) - ) / ${lifecycle_latency_warning_threshold} > 1 + ) / ${lifecycle_conductor_scan_warning_threshold} > 1 Labels: severity: warning Annotations: zenko_service: backbeat-lifecycle-producer description: >- Last lifecycle scan has not completed more than - {{ ${lifecycle_latency_warning_threshold} | humanizeDuration }} after it was scheduled. + {{ ${lifecycle_conductor_scan_warning_threshold} | humanizeDuration }} after it was scheduled. summary: "Lifecycle scan did not complete in time" - - alert: LifecycleStuckScan + - alert: LifecycleStalledScan Expr: | ( - max(s3_lifecycle_latest_batch_start_time{ + max by(namespace, job, pod) (s3_lifecycle_latest_batch_start_time{ namespace="${namespace}", job="${job_lifecycle_producer}" }) > - max(s3_lifecycle_latest_batch_end_time{ - namespace="${namespace}", job="${job_lifecycle_producer}" - } or vector(0)) + max by(namespace, job, pod) ( + s3_lifecycle_latest_batch_end_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + } + or + s3_lifecycle_latest_batch_start_time{ + namespace="${namespace}", job="${job_lifecycle_producer}" + } * 0 + ) ) - and + and on(namespace, job, pod) ( time() - ( - max(s3_lifecycle_latest_batch_start_time{ + max by(namespace, job, pod) (s3_lifecycle_latest_batch_start_time{ namespace="${namespace}", job="${job_lifecycle_producer}" }) / 1000 ) - ) / ${lifecycle_latency_critical_threshold} > 1 + ) / ${lifecycle_conductor_scan_critical_threshold} > 1 Labels: severity: critical Annotations: zenko_service: backbeat-lifecycle-producer description: >- Last lifecycle scan has not completed more than - {{ ${lifecycle_latency_critical_threshold} | humanizeDuration }} after it was scheduled. + {{ ${lifecycle_conductor_scan_critical_threshold} | humanizeDuration }} after it was scheduled. summary: "Lifecycle scan did not complete in time" - name: LifecycleBucketProcessor @@ -185,6 +203,25 @@ groups: description: "More than 5% of S3 requests by bucket processors resulting in errors" summary: "Very high rate of S3 request errors" + - alert: LifecycleBucketProcessorMultipleParallelScans + Expr: | + count( + sum by(conductor_scan_id) ( + rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{ + namespace="${namespace}", job="${job_lifecycle_bucket_processor}" + }[1m]) + ) > 0 + ) > 1 + For: "10m" + Labels: + severity: warning + Annotations: + zenko_service: backbeat-lifecycle-bucket-processor + description: >- + Bucket processors have processed bucket-task messages from multiple + lifecycle conductor scans in parallel for more than 10 minutes. + summary: "Lifecycle bucket processors are processing multiple scans" + - name: LifecycleObjectProcessor rules: From 4520f5abf1c28367adba210a3df87adce0c115d2 Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Thu, 21 May 2026 16:35:08 +0200 Subject: [PATCH 09/10] Refine lifecycle scan dashboard panels. Clarify bucket processor pickup panels, split pod and scan-id views, and render scan message age as a heatmap. Issue: BB-740 --- monitoring/lifecycle/dashboard.json | 112 ++++++++++++++++++++++++---- monitoring/lifecycle/dashboard.py | 51 +++++++++---- 2 files changed, 135 insertions(+), 28 deletions(-) diff --git a/monitoring/lifecycle/dashboard.json b/monitoring/lifecycle/dashboard.json index 998cb5c69..f158d848a 100644 --- a/monitoring/lifecycle/dashboard.json +++ b/monitoring/lifecycle/dashboard.json @@ -4509,7 +4509,7 @@ }, { "datasource": "${DS_PROMETHEUS}", - "description": "Rate of bucket-tasks topic messages successfully processed by conductor scan id and bucket processor pod. Multiple scan ids active at the same time indicate overlapping conductor scans, which should not happen in normal operation. Each message corresponds to a single listing slice (initial or continuation), not a unique bucket.", + "description": "Rate of bucket-tasks topic messages picked up by bucket processor pod. Each message corresponds to a single listing slice (initial or continuation), not a unique bucket.", "editable": true, "error": false, "fieldConfig": { @@ -4575,27 +4575,27 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by (pod, conductor_scan_id)", + "expr": "sum(rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by (pod) > 0", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{pod}} / {{conductor_scan_id}}", + "legendFormat": "{{pod}}", "metric": "", "refId": "", "step": 10, "target": "" } ], - "title": "Bucket Processor Messages Processed by Scan", + "title": "Bucket Tasks Picked Up by Pod", "transformations": [], "transparent": false, "type": "timeseries" }, { "datasource": "${DS_PROMETHEUS}", - "description": "Message processing rate grouped by conductor_scan_id across all bucket-processor pods. Should normally show one active scan id; multiple active scan ids indicate overlapping conductor scans.", + "description": "Message pickup rate grouped by conductor_scan_id across all bucket-processor pods. This panel is mostly useful to check that a single conductor scan is usually visible.", "editable": true, "error": false, "fieldConfig": { @@ -4651,7 +4651,7 @@ "options": { "legend": { "calcs": [], - "displayMode": "list", + "displayMode": "hidden", "placement": "bottom" }, "tooltip": { @@ -4661,7 +4661,7 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by (conductor_scan_id)", + "expr": "sum(rate(s3_lifecycle_bucket_processor_scan_messages_processed_total{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by (conductor_scan_id) > 0", "format": "time_series", "hide": false, "instant": false, @@ -4674,11 +4674,95 @@ "target": "" } ], - "title": "Bucket Processor Active Conductor Scans", + "title": "Bucket Processor Recent Conductor Scans", "transformations": [], "transparent": false, "type": "timeseries" }, + { + "cards": { + "cardPadding": 1, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "max": null, + "min": null, + "mode": "opacity" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_PROMETHEUS}", + "description": "Age of bucket-task messages when bucket processors finish processing them, measured from the conductor scan start timestamp.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + } + } + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 122 + }, + "heatmap": {}, + "hideTimeOverride": false, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 54, + "legend": { + "show": false + }, + "links": [], + "maxDataPoints": 25, + "reverseYBuckets": false, + "targets": [ + { + "datasource": null, + "expr": "sum(increase(s3_lifecycle_bucket_processor_scan_message_age_seconds_bucket{job=\"${job_lifecycle_bucket_processor}\", namespace=\"${namespace}\"}[$__rate_interval])) by(le)", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{le}}", + "metric": "", + "refId": "", + "step": 10, + "target": "" + } + ], + "title": "Bucket Processor Scan Message Age", + "tooltip": { + "show": true, + "showHistogram": true + }, + "transformations": [], + "transparent": false, + "type": "heatmap", + "xAxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yAxis": { + "decimals": 0, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + }, { "datasource": "${DS_PROMETHEUS}", "editable": true, @@ -4776,10 +4860,10 @@ "h": 7, "w": 8, "x": 0, - "y": 122 + "y": 129 }, "hideTimeOverride": false, - "id": 54, + "id": 55, "links": [], "maxDataPoints": 100, "options": { @@ -4889,10 +4973,10 @@ "h": 7, "w": 8, "x": 8, - "y": 122 + "y": 129 }, "hideTimeOverride": false, - "id": 55, + "id": 56, "links": [], "maxDataPoints": 100, "options": { @@ -4974,10 +5058,10 @@ "h": 7, "w": 8, "x": 16, - "y": 122 + "y": 129 }, "hideTimeOverride": false, - "id": 56, + "id": 57, "links": [], "maxDataPoints": 100, "options": { diff --git a/monitoring/lifecycle/dashboard.py b/monitoring/lifecycle/dashboard.py index 249087371..482bf4848 100644 --- a/monitoring/lifecycle/dashboard.py +++ b/monitoring/lifecycle/dashboard.py @@ -93,6 +93,11 @@ class Metrics: 'conductor_scan_id', job='${job_lifecycle_bucket_processor}', namespace='${namespace}', ) + BUCKET_PROCESSOR_SCAN_MESSAGE_AGE = metrics.BucketMetric( + 's3_lifecycle_bucket_processor_scan_message_age_seconds', + job='${job_lifecycle_bucket_processor}', namespace='${namespace}', + ) + S3_OPS = metrics.CounterMetric( 's3_lifecycle_s3_operations_total', 'origin', 'op', 'status', job=['$jobs'], namespace='${namespace}', @@ -788,42 +793,59 @@ def color_override(name, color): ) bucket_processor_messages_processed = TimeSeries( - title="Bucket Processor Messages Processed by Scan", - description="Rate of bucket-tasks topic messages successfully processed " - "by conductor scan id and bucket processor pod. Multiple scan ids " - "active at the same time indicate overlapping conductor scans, which " - "should not happen in normal operation. Each message corresponds to " - "a single listing slice (initial or continuation), not a unique " - "bucket.", + title="Bucket Tasks Picked Up by Pod", + description="Rate of bucket-tasks topic messages picked up by bucket " + "processor pod. Each message corresponds to a single " + "listing slice (initial or continuation), not a unique bucket.", dataSource="${DS_PROMETHEUS}", lineInterpolation="smooth", decimals=0, targets=[ Target( expr='sum(rate(' + Metrics.BUCKET_PROCESSOR_SCAN_MESSAGES_PROCESSED() + ')) ' - 'by (pod, conductor_scan_id)', - legendFormat='{{pod}} / {{conductor_scan_id}}', + 'by (pod) > 0', + legendFormat='{{pod}}', ), ], ) bucket_processor_active_scan_ids = TimeSeries( - title="Bucket Processor Active Conductor Scans", - description="Message processing rate grouped by conductor_scan_id across " - "all bucket-processor pods. Should normally show one active scan id; " - "multiple active scan ids indicate overlapping conductor scans.", + title="Bucket Processor Recent Conductor Scans", + description="Message pickup rate grouped by conductor_scan_id across all " + "bucket-processor pods. This panel is mostly useful to check that a " + "single conductor scan is usually visible.", dataSource="${DS_PROMETHEUS}", lineInterpolation="smooth", decimals=0, + legendDisplayMode="hidden", targets=[ Target( expr='sum(rate(' + Metrics.BUCKET_PROCESSOR_SCAN_MESSAGES_PROCESSED() + ')) ' - 'by (conductor_scan_id)', + 'by (conductor_scan_id) > 0', legendFormat='{{conductor_scan_id}}', ), ], ) +bucket_processor_scan_message_age = Heatmap( + title="Bucket Processor Scan Message Age", + description="Age of bucket-task messages when bucket processors finish " + "processing them, measured from the conductor scan start timestamp.", + dataSource="${DS_PROMETHEUS}", + dataFormat='tsbuckets', + hideZeroBuckets=True, + maxDataPoints=25, + tooltip=Tooltip(show=True, showHistogram=True), + yAxis=YAxis(format=UNITS.SECONDS, decimals=0), + cards={'cardPadding': 1, 'cardRound': 2}, + color=HeatmapColor(mode='opacity'), + targets=[Target( + expr='sum(increase(' + Metrics.BUCKET_PROCESSOR_SCAN_MESSAGE_AGE.bucket() + ')) by(le)', + format="heatmap", + legendFormat="{{le}}", + )], +) + active_indexing_jobs = TimeSeries( title="Active Indexing jobs", dataSource="${DS_PROMETHEUS}", @@ -993,6 +1015,7 @@ def color_override(name, color): layout.row([lifecycle_scans, trigger_latency], height=7), layout.row([lifecycle_full_scan_elapsed, lifecycle_scan_count_buckets], height=4), layout.row([bucket_processor_messages_processed, bucket_processor_active_scan_ids], height=7), + layout.row([bucket_processor_scan_message_age], height=7), layout.row([lifecycle_scan_rate, active_indexing_jobs, legacy_tasks], height=7), ]), ) From 5d846654779243440e3d44263ce0e09eafee3ddc Mon Sep 17 00:00:00 2001 From: Maha Benzekri Date: Thu, 21 May 2026 17:13:21 +0200 Subject: [PATCH 10/10] Close lifecycle scans on errors. Issue: BB-740 --- extensions/lifecycle/LifecycleMetrics.js | 20 ++++++------- .../lifecycle/conductor/LifecycleConductor.js | 4 +++ monitoring/lifecycle/dashboard.json | 2 +- monitoring/lifecycle/dashboard.py | 4 +-- .../unit/lifecycle/LifecycleConductor.spec.js | 28 +++++++++++++++++++ 5 files changed, 45 insertions(+), 13 deletions(-) diff --git a/extensions/lifecycle/LifecycleMetrics.js b/extensions/lifecycle/LifecycleMetrics.js index b820bfbc3..4ee4949eb 100644 --- a/extensions/lifecycle/LifecycleMetrics.js +++ b/extensions/lifecycle/LifecycleMetrics.js @@ -35,26 +35,26 @@ const conductorLatestBatchStartTime = ZenkoMetrics.createGauge({ labelNames: [LIFECYCLE_LABEL_ORIGIN], }); -// Conductor scan-completion timestamp (ms since epoch) of the last -// successfully completed scan. Useful as a "scan completed at all" -// signal; combine with conductor_last_batch_duration_seconds to know -// "the most recent scan finished N seconds ago and took M seconds". +// Conductor scan-end timestamp (ms since epoch) of the last scan that reached +// the listing phase. Useful as a "scan ended at all" signal; combine with +// conductor_last_batch_duration_seconds to know "the most recent scan ended N +// seconds ago and took M seconds". const conductorLatestBatchEndTime = ZenkoMetrics.createGauge({ name: 's3_lifecycle_latest_batch_end_time', - help: 'Timestamp (ms since epoch) of the most recent successful ' + - 'lifecycle conductor scan completion.', + help: 'Timestamp (ms since epoch) of the most recent lifecycle ' + + 'conductor scan end after the scan reached bucket listing.', labelNames: [LIFECYCLE_LABEL_ORIGIN], }); -// Duration of the latest conductor scan, computed by the conductor itself -// at scan completion. Exposed as a gauge so dashboards can render the most -// recent batch duration directly, without computing end - start in PromQL +// Duration of the latest conductor scan that reached bucket listing, computed +// by the conductor itself at scan end. Exposed as a gauge so dashboards can +// render the most recent batch duration directly, without computing end - start in PromQL // (which would yield negative values mid-scan, when end is from the // previous batch and start has just been refreshed). const conductorLastBatchDurationSeconds = ZenkoMetrics.createGauge({ name: 's3_lifecycle_conductor_last_batch_duration_seconds', help: 'Duration in seconds of the latest lifecycle conductor batch, ' + - 'as measured by the conductor at scan completion.', + 'as measured by the conductor at scan end.', labelNames: [LIFECYCLE_LABEL_ORIGIN], }); diff --git a/extensions/lifecycle/conductor/LifecycleConductor.js b/extensions/lifecycle/conductor/LifecycleConductor.js index d87aa07f7..ea919539c 100644 --- a/extensions/lifecycle/conductor/LifecycleConductor.js +++ b/extensions/lifecycle/conductor/LifecycleConductor.js @@ -472,6 +472,8 @@ class LifecycleConductor { this._batchInProgress = false; this._currentScanId = null; this._currentScanStartTimestamp = null; + LifecycleMetrics.onConductorScanComplete( + log, totalBucketsListed || 0, elapsedMs / 1000); } log.info('not starting new lifecycle batch', { reason: err, @@ -500,6 +502,8 @@ class LifecycleConductor { if (scanStarted) { this._currentScanId = null; this._currentScanStartTimestamp = null; + LifecycleMetrics.onConductorScanComplete( + log, totalBucketsListed || 0, elapsedMs / 1000); } if (cb) { cb(err); diff --git a/monitoring/lifecycle/dashboard.json b/monitoring/lifecycle/dashboard.json index f158d848a..cd69ee3d1 100644 --- a/monitoring/lifecycle/dashboard.json +++ b/monitoring/lifecycle/dashboard.json @@ -4347,7 +4347,7 @@ }, { "datasource": "${DS_PROMETHEUS}", - "description": "Duration of the most recently completed conductor scan, as measured by the conductor itself at scan completion.", + "description": "Duration of the most recent conductor scan that reached bucket listing, as measured by the conductor itself at scan end.", "editable": true, "error": false, "fieldConfig": { diff --git a/monitoring/lifecycle/dashboard.py b/monitoring/lifecycle/dashboard.py index 482bf4848..07cc1e81c 100644 --- a/monitoring/lifecycle/dashboard.py +++ b/monitoring/lifecycle/dashboard.py @@ -757,8 +757,8 @@ def color_override(name, color): lifecycle_full_scan_elapsed = Stat( title="Last Conductor Batch Duration", - description="Duration of the most recently completed conductor scan, " - "as measured by the conductor itself at scan completion.", + description="Duration of the most recent conductor scan that reached " + "bucket listing, as measured by the conductor itself at scan end.", dataSource="${DS_PROMETHEUS}", reduceCalc="last", format=UNITS.SECONDS, diff --git a/tests/unit/lifecycle/LifecycleConductor.spec.js b/tests/unit/lifecycle/LifecycleConductor.spec.js index 47660161b..1bd113f1f 100644 --- a/tests/unit/lifecycle/LifecycleConductor.spec.js +++ b/tests/unit/lifecycle/LifecycleConductor.spec.js @@ -161,9 +161,14 @@ describe('Lifecycle Conductor', () => { sinon.stub(conductor, '_controlBacklog') .callsFake(cb => cb(null)); + const metricStub = sinon.stub(LifecycleMetrics, 'onConductorScanComplete'); conductor.processBuckets(err => { assert.strictEqual(err.message, 'error'); + assert(metricStub.calledOnce); + const [, bucketCount, durationSeconds] = metricStub.firstCall.args; + assert.strictEqual(bucketCount, 0); + assert.strictEqual(typeof durationSeconds, 'number'); done(); }); }); @@ -240,6 +245,29 @@ describe('Lifecycle Conductor', () => { }); }); + it('should close scan metrics when throttling occurs after scan start', done => { + conductor._mongodbClient = { + getIndexingJobs: (_, cb) => cb(null, []), + }; + + sinon.stub(conductor, '_controlBacklog').callsFake(cb => cb(null)); + sinon.stub(conductor, 'listBuckets') + .callsFake((queue, scanId, log, cb) => cb(errors.Throttling)); + const metricStub = sinon.stub(LifecycleMetrics, 'onConductorScanComplete'); + + conductor.processBuckets(err => { + assert(err && err.Throttling); + assert(metricStub.calledOnce); + const [, bucketCount, durationSeconds] = metricStub.firstCall.args; + assert.strictEqual(bucketCount, 0); + assert.strictEqual(typeof durationSeconds, 'number'); + assert.strictEqual(conductor._batchInProgress, false); + assert.strictEqual(conductor._currentScanId, null); + assert.strictEqual(conductor._currentScanStartTimestamp, null); + done(); + }); + }); + it('should generate a conductorScanId', done => { conductor._mongodbClient = { getIndexingJobs: (_, cb) => cb(null, []),