scality · benzekrimaha · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/.github/workflows/alerts.yaml b/.github/workflows/alerts.yaml
@@ -32,6 +32,8 @@ jobs:
             lifecycle_transition_replicas=3
             lifecycle_latency_warning_threshold=120
             lifecycle_latency_critical_threshold=180
+            lifecycle_conductor_scan_warning_threshold=120
+            lifecycle_conductor_scan_critical_threshold=180
           github_token: ${{ secrets.GIT_ACCESS_TOKEN }}
 
       - name: Render and test replication 

diff --git a/conf/config.json b/conf/config.json
@@ -246,6 +246,7 @@
                     }
                 },
                 "concurrency": 10,
+                "scanMetricRetentionMs": 86400000,
                 "probeServer": {
                     "bindAddress": "0.0.0.0",
                     "port": 8553

diff --git a/extensions/lifecycle/LifecycleConfigValidator.js b/extensions/lifecycle/LifecycleConfigValidator.js
@@ -11,6 +11,8 @@ const {
 const { backbeatConsumer: { MAX_QUEUED_DEFAULT } } = require('../../lib/constants');
 const { ValidLifecycleRules: supportedLifecycleRules } = require('arsenal').models;
 
+// Default retention (24 hours, in milliseconds) applied to
+// scanMetricRetentionMs when the configuration does not set it.
+// NOTE(review): the same constant is also declared in LifecycleMetrics.js;
+// keep both values in sync or share a single definition.
+const DEFAULT_SCAN_METRIC_RETENTION_MS = 24 * 60 * 60 * 1000;
+
 const joiSchema = joi.object({
     zookeeperPath: joi.string().required(),
     bucketTasksTopic: joi.string().required(),
@@ -52,6 +54,8 @@ const joiSchema = joi.object({
         // the processing, no need to add more here to avoid
         // overloading the system
         concurrency: joi.number().greater(0).default(1),
+        scanMetricRetentionMs: joi.number().integer().positive()
+            .default(DEFAULT_SCAN_METRIC_RETENTION_MS),
         probeServer: probeServerJoi.default(),
         circuitBreaker: joi.object().optional(),
     },

diff --git a/extensions/lifecycle/LifecycleMetrics.js b/extensions/lifecycle/LifecycleMetrics.js
@@ -1,16 +1,60 @@
 const { ZenkoMetrics } = require('arsenal').metrics;
 
-const LIFECYCLE_LABEL_ORIGIN =  'origin';
+const LIFECYCLE_LABEL_ORIGIN = 'origin';
 const LIFECYCLE_LABEL_OP = 'op';
 const LIFECYCLE_LABEL_STATUS = 'status';
 const LIFECYCLE_LABEL_LOCATION = 'location';
 const LIFECYCLE_LABEL_TYPE = 'type';
+const LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID = 'conductor_scan_id';
 
 const LIFECYCLE_MARKER_METRICS_LOCATION = '-delete-marker-';
 
+// Keep per-scan series long enough for scraping and debugging recent overlap,
+// but remove them from prom-client after a configurable retention interval.
+// Prometheus retains scraped scan-id series until TSDB retention expires.
+// Default retention: 24 hours, expressed in milliseconds.
+const DEFAULT_SCAN_METRIC_RETENTION_MS = 24 * 60 * 60 * 1000;
+// Values of the `origin` label used by the conductor and bucket processor
+// metrics in this module.
+const CONDUCTOR_ORIGIN = 'conductor';
+const BUCKET_PROCESSOR_ORIGIN = 'bucket_processor';
+// Effective retention, mutable at runtime: overridden by
+// configureLifecycleScanMetricRetention() and restored to the default by
+// resetLifecycleScanMetricCleanupTimers().
+let scanMetricRetentionMs = DEFAULT_SCAN_METRIC_RETENTION_MS;
+
+// Conductor scheduling heartbeat: timestamp (ms since epoch) of the
+// instant the conductor most recently *started* a scan; written by
+// LifecycleMetrics.onProcessBuckets. Use this to detect "the conductor
+// is no longer scheduling scans" via the LifecycleLateScan alert; do NOT
+// subtract it from latest_batch_end_time to derive the scan duration:
+// while a scan is in progress, end_time is from the previous run and
+// start_time has just been refreshed, so the difference is negative. Use
+// s3_lifecycle_conductor_last_batch_duration_seconds instead.
 const conductorLatestBatchStartTime = ZenkoMetrics.createGauge({
     name: 's3_lifecycle_latest_batch_start_time',
-    help: 'Timestamp of latest lifecycle batch start time',
+    help: 'Conductor scheduling heartbeat: ms-since-epoch timestamp of ' +
+        'the most recent scan start. Use to detect that the conductor is ' +
+        'still scheduling scans (LifecycleLateScan alert). Do NOT use to ' +
+        'derive scan duration; use ' +
+        's3_lifecycle_conductor_last_batch_duration_seconds for that.',
+    labelNames: [LIFECYCLE_LABEL_ORIGIN],
+});
+
+// Conductor scan-completion timestamp (ms since epoch) of the last
+// successfully completed scan. Useful as a "scan completed at all"
+// signal; combine with conductor_last_batch_duration_seconds to know
+// "the most recent scan finished N seconds ago and took M seconds".
+const conductorLatestBatchEndTime = ZenkoMetrics.createGauge({
+    name: 's3_lifecycle_latest_batch_end_time',
+    help: 'Timestamp (ms since epoch) of the most recent successful ' +
+        'lifecycle conductor scan completion.',
+    labelNames: [LIFECYCLE_LABEL_ORIGIN],
+});
+
+// Duration of the latest conductor scan, computed by the conductor itself
+// at scan completion. Exposed as a gauge so dashboards can render the most
+// recent batch duration directly, without computing end - start in PromQL
+// (which would yield negative values mid-scan, when end is from the
+// previous batch and start has just been refreshed). Only updated by
+// LifecycleMetrics.onConductorScanComplete when it is given a finite,
+// non-negative duration; otherwise the previous value is kept.
+const conductorLastBatchDurationSeconds = ZenkoMetrics.createGauge({
+    name: 's3_lifecycle_conductor_last_batch_duration_seconds',
+    help: 'Duration in seconds of the latest lifecycle conductor batch, ' +
+        'as measured by the conductor at scan completion.',
     labelNames: [LIFECYCLE_LABEL_ORIGIN],
 });
 
@@ -50,6 +94,102 @@ const lifecycleLegacyTask = ZenkoMetrics.createCounter({
     labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_STATUS],
 });
 
+const conductorLatestBatchBucketCount = ZenkoMetrics.createGauge({
+    name: 's3_lifecycle_latest_batch_bucket_count',
+    help: 'Number of buckets listed in the latest lifecycle conductor batch',
+    labelNames: [LIFECYCLE_LABEL_ORIGIN],
+});
+
+const bucketProcessorScanMessagesProcessed = ZenkoMetrics.createCounter({
+    name: 's3_lifecycle_bucket_processor_scan_messages_processed_total',
+    help: 'Total number of bucket-tasks topic messages picked up by this ' +
+        'bucket processor, grouped by conductor scan id. Each message ' +
+        'corresponds to a single listing slice (initial or continuation), not ' +
+        'a unique bucket: a bucket with multiple listings (truncated v1, or ' +
+        'current/noncurrent/orphan splits in v2) increments this counter once ' +
+        'per slice. Multiple conductor_scan_id label values over the same ' +
+        'query window indicate that bucket processors recently handled work ' +
+        'from different scans. Normal operation is expected to expose one ' +
+        'scan id at a time; scan-id series are removed locally after the ' +
+        'configured bucket processor retention interval without update to ' +
+        'avoid unbounded process memory growth. ' +
+        'Prometheus retains scraped scan-id series until TSDB retention.',
+    labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID],
-    labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID],
+scan-id series are removed locally some time hours after their latest update, to avoid having too high cardinality.
-    labelNames: [LIFECYCLE_LABEL_ORIGIN, LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID],
+scan-id series are removed locally some time hours after their latest update, to avoid having too high cardinality.
+});
+
+const bucketProcessorScanMessageAgeSeconds = ZenkoMetrics.createHistogram({
+    name: 's3_lifecycle_bucket_processor_scan_message_age_seconds',
+    help: 'Age in seconds of bucket-tasks topic messages when they finish ' +
+        'processing in the bucket processor, measured from the conductor scan ' +
+        'start timestamp propagated in the message context.',
+    labelNames: [LIFECYCLE_LABEL_ORIGIN],
+    buckets: [60, 300, 600, 1800, 3600, 7200, 14400, 28800, 43200, 86400],
+});
+
+// Pending per-scan cleanup timers, keyed by conductor scan id. Entries are
+// added by setScanMetricTimeout() and removed when the timer fires or when
+// clearScanMetricTimers() runs.
+const scanMetricTimers = new Map();
+
+function removeBucketProcessorScanMetrics(conductorScanId) {
+    try {
+        bucketProcessorScanMessagesProcessed.remove({
+            [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN,
+            [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId,
+        });
+    } catch {
+        // Best-effort cleanup: metrics are observational only.
+    }
+}
+
+function setScanMetricTimeout(conductorScanId) {
+    const previousTimer = scanMetricTimers.get(conductorScanId);
+    if (previousTimer) {
+        clearTimeout(previousTimer);
+    }
+
+    const cleanupTimer = setTimeout(() => {
+        removeBucketProcessorScanMetrics(conductorScanId);
+        scanMetricTimers.delete(conductorScanId);
+    }, scanMetricRetentionMs);
+    if (typeof cleanupTimer.unref === 'function') {
+        cleanupTimer.unref();
+    }
+    scanMetricTimers.set(conductorScanId, cleanupTimer);
+}
+
+function observeBucketProcessorScanMessageAge(conductorScanStartTimestamp) {
+    // Messages produced before this field existed can still be consumed during
+    // rolling upgrades, so skip invalid timestamps instead of logging noise.
+    if (typeof conductorScanStartTimestamp !== 'number' ||
+        !Number.isFinite(conductorScanStartTimestamp) ||
+        conductorScanStartTimestamp <= 0) {
+        return;
+    }
+
+    const ageSeconds = (Date.now() - conductorScanStartTimestamp) / 1000;
+    if (ageSeconds >= 0) {
+        bucketProcessorScanMessageAgeSeconds.observe({
+            [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN,
+        }, ageSeconds);
+    }
+}
+
+function clearScanMetricTimers() {
+    scanMetricTimers.forEach(timer => clearTimeout(timer));
+    scanMetricTimers.clear();
+}
+
+function resetLifecycleScanMetricCleanupTimers() {
+    clearScanMetricTimers();
+    scanMetricRetentionMs = DEFAULT_SCAN_METRIC_RETENTION_MS;
+}
+
+function configureLifecycleScanMetricRetention(retentionMs) {
+    if (typeof retentionMs === 'number' &&
+        Number.isFinite(retentionMs) &&
+        retentionMs > 0) {
+        scanMetricRetentionMs = retentionMs;
+    }
+}
+
 const lifecycleS3Operations = ZenkoMetrics.createCounter({
     name: 's3_lifecycle_s3_operations_total',
     help: 'Total number of S3 operations by the lifecycle processes',
@@ -113,11 +253,26 @@ class LifecycleMetrics {
         }
     }
 
-    static onProcessBuckets(log) {
+    /**
+     * Update the conductor scheduling heartbeat. Called at the start of
+     * every conductor scan; consumed by the LifecycleLateScan alert to
+     * detect that the conductor has stopped scheduling. Does NOT mark a
+     * scan as in progress and is NOT meant to be subtracted from
+     * latest_batch_end_time to derive a duration: use
+     * onConductorScanComplete's durationSeconds for that.
+     *
+     * Any error thrown while setting the gauge is handed to
+     * LifecycleMetrics.handleError together with the timestamp.
+     *
+     * @param {Object} log - logger
+     * @param {number} scanStartTimestamp - scan start timestamp in ms;
+     *   defaults to Date.now() when omitted
+     * @returns {undefined}
+     */
+    static onProcessBuckets(log, scanStartTimestamp = Date.now()) {
         try {
-            conductorLatestBatchStartTime.set({ origin: 'conductor' }, Date.now());
+            conductorLatestBatchStartTime.set(
+                { [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN },
+                scanStartTimestamp);
         } catch (err) {
-            LifecycleMetrics.handleError(log, err, 'LifecycleMetrics.onProcessBuckets');
+            LifecycleMetrics.handleError(log, err, 'LifecycleMetrics.onProcessBuckets', {
+                scanStartTimestamp,
+            });
         }
     }
 
@@ -172,6 +327,79 @@ class LifecycleMetrics {
         }
     }
 
+    /**
+     * Record metrics at the end of a full conductor scan.
+     * @param {Object} log - logger
+     * @param {number} bucketCount - total buckets listed
+     * @param {number} [durationSeconds] - duration of the scan in seconds,
+     *   as measured by the conductor. When provided and finite, sets the
+     *   s3_lifecycle_conductor_last_batch_duration_seconds gauge. Optional
+     *   for forward-compatibility with callers that do not measure it.
+     */
+    static onConductorScanComplete(log, bucketCount, durationSeconds) {
+        try {
+            const endTimestamp = Date.now();
+            conductorLatestBatchEndTime.set({
+                [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN,
+            }, endTimestamp);
+            conductorLatestBatchBucketCount.set({
+                [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN,
+            }, bucketCount);
+            if (typeof durationSeconds === 'number' &&
+                Number.isFinite(durationSeconds) &&
+                durationSeconds >= 0) {
+                conductorLastBatchDurationSeconds.set({
+                    [LIFECYCLE_LABEL_ORIGIN]: CONDUCTOR_ORIGIN,
+                }, durationSeconds);
+            }
+        } catch (err) {
+            LifecycleMetrics.handleError(
+                log, err, 'LifecycleMetrics.onConductorScanComplete', {
+                    bucketCount,
+                    durationSeconds,
+                }
+            );
+        }
+    }
+
+    /**
+     * Increment the count of bucket-tasks topic messages picked up by this
+     * bucket processor for a specific conductor scan. Called before the task
+     * is dispatched to the scheduler, once per Kafka message regardless of how
+     * many objects it covers or whether processing eventually succeeds.
+     *
+     * Note: this counts messages (initial + continuation/listing slices),
+     * not unique buckets. Keep one time series per conductor_scan_id so that
+     * overlapping scans remain visible. Old scan series are removed by a
+     * timer after the configured scanMetricRetentionMs interval without
+     * update to avoid unbounded prom-client memory growth.
+     *
+     * @param {Object} log - logger
+     * @param {string} conductorScanId - conductor scan id from contextInfo
+     * @param {number} [conductorScanStartTimestamp] - conductor scan start
+     *   timestamp from contextInfo
+     */
+    static onBucketProcessorScanMessageReceived(
+        log, conductorScanId, conductorScanStartTimestamp) {
+        if (!conductorScanId) {
+            return;
+        }
+        try {
+            bucketProcessorScanMessagesProcessed.inc({
+                [LIFECYCLE_LABEL_ORIGIN]: BUCKET_PROCESSOR_ORIGIN,
+                [LIFECYCLE_LABEL_CONDUCTOR_SCAN_ID]: conductorScanId,
+            });
+            observeBucketProcessorScanMessageAge(conductorScanStartTimestamp);
+            setScanMetricTimeout(conductorScanId);
+        } catch (err) {
+            LifecycleMetrics.handleError(
+                log, err,
+                'LifecycleMetrics.onBucketProcessorScanMessageReceived',
+                { conductorScanId, conductorScanStartTimestamp }
+            );
+        }
+    }
+
     static onLifecycleTriggered(log, process, type, location, latencyMs) {
         try {
             lifecycleTriggerLatency.observe({
@@ -249,4 +477,6 @@ class LifecycleMetrics {
 module.exports = {
     LifecycleMetrics,
     LIFECYCLE_MARKER_METRICS_LOCATION,
+    // Retention control for per-scan series: set the retention interval, or
+    // cancel pending cleanup timers and restore the default interval.
+    configureLifecycleScanMetricRetention,
+    resetLifecycleScanMetricCleanupTimers,
 };