From 2e72ce3b4a3dcd9067da4de34d432e830c18979f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 29 May 2026 16:28:40 +1200 Subject: [PATCH 1/3] migration: add incidents.escalated_at --- crates/commons-types/src/issue.rs | 8 ++------ crates/database/src/issues.rs | 2 ++ crates/database/src/schema.rs | 1 + crates/private-server/src/fns/healthchecks.rs | 6 +----- crates/private-server/src/fns/statuses.rs | 14 ++++++++------ .../down.sql | 2 ++ .../up.sql | 6 ++++++ 7 files changed, 22 insertions(+), 17 deletions(-) create mode 100644 migrations/2026-05-29-100000-0000_incidents_escalated_at/down.sql create mode 100644 migrations/2026-05-29-100000-0000_incidents_escalated_at/up.sql diff --git a/crates/commons-types/src/issue.rs b/crates/commons-types/src/issue.rs index 746919aa..cd98cd2b 100644 --- a/crates/commons-types/src/issue.rs +++ b/crates/commons-types/src/issue.rs @@ -63,9 +63,7 @@ impl Severity { } #[derive(Debug, Clone, Copy, thiserror::Error)] -#[error( - "invalid severity; expected one of: critical, error, warning, info, debug" -)] +#[error("invalid severity; expected one of: critical, error, warning, info, debug")] pub struct SeverityFromStringError; impl std::str::FromStr for Severity { @@ -75,9 +73,7 @@ impl std::str::FromStr for Severity { match s.to_ascii_lowercase().as_ref() { // Retired severities still parse — emergency/alert collapse into // critical and notice collapses into info, matching the migration. - "emergency" | "emerg" | "panic" | "alert" | "critical" | "crit" => { - Ok(Self::Critical) - } + "emergency" | "emerg" | "panic" | "alert" | "critical" | "crit" => Ok(Self::Critical), "error" | "err" => Ok(Self::Error), "warning" | "warn" => Ok(Self::Warning), "notice" | "info" | "informational" => Ok(Self::Info), diff --git a/crates/database/src/issues.rs b/crates/database/src/issues.rs index d5164895..adcc16a2 100644 --- a/crates/database/src/issues.rs +++ b/crates/database/src/issues.rs @@ -96,6 +96,8 @@ pub struct Incident { pub resolved_at: Option, pub resolved_by: Option, pub resolved_reason: Option, + #[diesel(deserialize_as = jiff_diesel::NullableTimestamp, serialize_as = jiff_diesel::NullableTimestamp)] + pub escalated_at: Option, } #[derive(Clone, Debug, Serialize, Deserialize, Queryable, Selectable, Associations)] diff --git a/crates/database/src/schema.rs b/crates/database/src/schema.rs index 18f14dff..7b625ea5 100644 --- a/crates/database/src/schema.rs +++ b/crates/database/src/schema.rs @@ -147,6 +147,7 @@ diesel::table! { resolved_by -> Nullable, resolved_reason -> Nullable, server_group_id -> Uuid, + escalated_at -> Nullable, } } diff --git a/crates/private-server/src/fns/healthchecks.rs b/crates/private-server/src/fns/healthchecks.rs index 72e47641..7d9a725f 100644 --- a/crates/private-server/src/fns/healthchecks.rs +++ b/crates/private-server/src/fns/healthchecks.rs @@ -260,11 +260,7 @@ pub async fn sample( // Top-level extras — the column is always an object after our // ingestion path strips reserved keys. - let status_extra = status - .extra - .as_object() - .cloned() - .unwrap_or_default(); + let status_extra = status.extra.as_object().cloned().unwrap_or_default(); // Pull the failing-check entry out of the health array (any entry // matching by name; we don't require unhealthy here so we still diff --git a/crates/private-server/src/fns/statuses.rs b/crates/private-server/src/fns/statuses.rs index 6aa87a1d..cc096321 100644 --- a/crates/private-server/src/fns/statuses.rs +++ b/crates/private-server/src/fns/statuses.rs @@ -240,8 +240,7 @@ pub struct StatusSnapshotData { /// the explicit severity when one is known so operators see /// all five levels, not just the legacy "warning/error" pair /// derived from `healthy`. - pub check_severities: - std::collections::HashMap, + pub check_severities: std::collections::HashMap, } #[derive(Deserialize, ToSchema)] @@ -314,8 +313,7 @@ pub async fn snapshot( // Compute the per-unhealthy-check severity the rules engine would // file at given this push. Healthy checks are omitted; the UI // renders them with its 'passing' affordance regardless. - let check_severities = - compute_check_severities(&mut conn, args.server_id, &status).await?; + let check_severities = compute_check_severities(&mut conn, args.server_id, &status).await?; Ok(Json(Some(StatusSnapshotData { id: status.id, @@ -357,8 +355,12 @@ async fn compute_check_severities( let mut failing: Vec<(String, serde_json::Map)> = Vec::new(); for raw in arr { let Some(obj) = raw.as_object() else { continue }; - let Some(check_name) = obj.get("check").and_then(|v| v.as_str()) else { continue }; - let Some(healthy) = obj.get("healthy").and_then(|v| v.as_bool()) else { continue }; + let Some(check_name) = obj.get("check").and_then(|v| v.as_str()) else { + continue; + }; + let Some(healthy) = obj.get("healthy").and_then(|v| v.as_bool()) else { + continue; + }; if healthy { continue; } diff --git a/migrations/2026-05-29-100000-0000_incidents_escalated_at/down.sql b/migrations/2026-05-29-100000-0000_incidents_escalated_at/down.sql new file mode 100644 index 00000000..24a40d33 --- /dev/null +++ b/migrations/2026-05-29-100000-0000_incidents_escalated_at/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE incidents + DROP COLUMN escalated_at; diff --git a/migrations/2026-05-29-100000-0000_incidents_escalated_at/up.sql b/migrations/2026-05-29-100000-0000_incidents_escalated_at/up.sql new file mode 100644 index 00000000..3bf9de50 --- /dev/null +++ b/migrations/2026-05-29-100000-0000_incidents_escalated_at/up.sql @@ -0,0 +1,6 @@ +-- Marks the first time an incident escalated from a lower severity to +-- Critical. NULL ⇒ never escalated. Used to gate the one-shot "open" +-- Slack message that fires on the first Critical contributor joining +-- an already-shipped incident. +ALTER TABLE incidents + ADD COLUMN escalated_at TIMESTAMPTZ; From 2034063c92426a83282d8777df6f009d148fbbbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 29 May 2026 16:31:07 +1200 Subject: [PATCH 2/3] incidents: fire a fresh Slack 'open' when a Critical first escalates an incident --- crates/database/src/issues.rs | 47 +++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/crates/database/src/issues.rs b/crates/database/src/issues.rs index adcc16a2..d07dc351 100644 --- a/crates/database/src/issues.rs +++ b/crates/database/src/issues.rs @@ -455,11 +455,35 @@ async fn re_evaluate_incident_membership( if newly_opened { enqueue_slack_open(conn, incident_id, server_group_id, issue).await?; } else if issue.severity == Severity::Critical { - // Critical joining an existing incident whose open is - // still inside its holding window: pull the delivery - // forward to now. enqueue_slack_open already does this - // for newly-opened incidents. - accelerate_pending_open(conn, incident_id).await?; + // Two sub-cases when a Critical joins an existing incident: + // - The original open is still pending in the outbox → + // accelerate so the "incident opened" message lands + // immediately. No second message: the open hasn't been + // seen yet, so a fresh open would be redundant noise. + // - The original open has already shipped → enqueue a + // fresh open at Critical severity as the escalation + // signal. Gated on incidents.escalated_at IS NULL so + // repeated Critical joins (or a Critical leaving and + // rejoining) don't re-fire the message. + let accelerated = accelerate_pending_open(conn, incident_id).await?; + if !accelerated { + let escalated: Option = diesel::update( + incidents::table + .filter(incidents::id.eq(incident_id)) + .filter(incidents::escalated_at.is_null()), + ) + .set( + incidents::escalated_at + .eq(jiff_diesel::Timestamp::from(transition_time)), + ) + .returning(Incident::as_select()) + .get_result(conn) + .await + .optional()?; + if escalated.is_some() { + enqueue_slack_open(conn, incident_id, server_group_id, issue).await?; + } + } } } (true, _, true) => { @@ -855,10 +879,17 @@ async fn enqueue_slack_open( /// Affects only rows that are still in their delay window /// (`delivered_at IS NULL`, `gave_up_at IS NULL`, `deliver_after > now`). /// Already-shipped or cancelled rows are left alone. -async fn accelerate_pending_open(conn: &mut AsyncPgConnection, incident_id: Uuid) -> Result<()> { +/// +/// Returns true if at least one row was accelerated. The caller uses +/// this to distinguish "open is still pending" (no extra Slack message +/// needed) from "open has already shipped" (fire an escalation open). +async fn accelerate_pending_open( + conn: &mut AsyncPgConnection, + incident_id: Uuid, +) -> Result { use crate::schema::slack_outbox::dsl; let now = Timestamp::now(); - diesel::update( + let updated = diesel::update( dsl::slack_outbox .filter(dsl::incident_id.eq(incident_id)) .filter(dsl::kind.eq(crate::slack_outbox::KIND_INCIDENT_OPEN)) @@ -870,7 +901,7 @@ async fn accelerate_pending_open(conn: &mut AsyncPgConnection, incident_id: Uuid .execute(conn) .await .map_err(AppError::from)?; - Ok(()) + Ok(updated > 0) } async fn enqueue_slack_resolve( From e6bbbc35643a802425fe506ac536c26ad2b7885d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 29 May 2026 16:34:44 +1200 Subject: [PATCH 3/3] tests: cover the Critical-escalation Slack 'open' path --- .../tests/incident_severity_semantics.rs | 123 ++++++++++++++++-- 1 file changed, 111 insertions(+), 12 deletions(-) diff --git a/crates/database/tests/incident_severity_semantics.rs b/crates/database/tests/incident_severity_semantics.rs index 2b3cb895..1a85d7e4 100644 --- a/crates/database/tests/incident_severity_semantics.rs +++ b/crates/database/tests/incident_severity_semantics.rs @@ -85,15 +85,12 @@ struct OpenInfo { /// Seconds until (positive) or since (negative) `deliver_after`. #[diesel(sql_type = sql_types::Double)] delay_secs: f64, - #[diesel(sql_type = sql_types::Bool)] - is_delivered: bool, } async fn pending_open(conn: &mut diesel_async::AsyncPgConnection, incident_id: Uuid) -> OpenInfo { sql_query( "SELECT id, \ - EXTRACT(EPOCH FROM (deliver_after - NOW()))::float8 AS delay_secs, \ - delivered_at IS NOT NULL AS is_delivered \ + EXTRACT(EPOCH FROM (deliver_after - NOW()))::float8 AS delay_secs \ FROM slack_outbox WHERE incident_id = $1 AND kind = $2 LIMIT 1", ) .bind::(incident_id) @@ -305,7 +302,12 @@ async fn critical_joining_existing_open_accelerates_pending_delivery() { } #[tokio::test(flavor = "multi_thread")] -async fn critical_join_does_not_disturb_already_delivered_open() { +async fn critical_join_after_delivered_open_fires_escalation_open() { + // Once the original "incident opened" Slack message has shipped, a + // Critical contributor joining is treated as an escalation: a fresh + // `incident_open` row is enqueued at Critical-severity timing so + // operators hear the change. Incident.escalated_at is stamped so we + // don't re-fire on every subsequent Critical join. commons_tests::db::TestDb::run(async |mut conn, _| { let server_id = insert_grouped_server(&mut conn, "http://crit-after-delivered.invalid/").await; @@ -329,9 +331,12 @@ async fn critical_join_does_not_disturb_already_delivered_open() { SlackOutbox::mark_delivered(&mut conn, pending.id, "ok") .await .expect("mark delivered"); + assert!( + incident.escalated_at.is_none(), + "precondition: not yet escalated" + ); - // A Critical issue now joins. There's no pending open to accelerate, - // so the call is a no-op and crucially doesn't re-enqueue Slack noise. + // A Critical issue joins → escalation fires. save_event( &mut conn, server_id, @@ -342,8 +347,101 @@ async fn critical_join_does_not_disturb_already_delivered_open() { ) .await; - // The (now-delivered) outbox row still has its delivered_at set; no - // duplicate incident_open rows have been created. + // Two open rows now: the original (delivered) and the escalation + // (pending, with delay ~0 because Critical bypasses the holding + // window). + #[derive(QueryableByName)] + struct OpenRow { + #[diesel(sql_type = sql_types::Double)] + delay_secs: f64, + #[diesel(sql_type = sql_types::Bool)] + is_delivered: bool, + } + let rows: Vec = sql_query( + "SELECT EXTRACT(EPOCH FROM (deliver_after - NOW()))::float8 AS delay_secs, \ + delivered_at IS NOT NULL AS is_delivered \ + FROM slack_outbox WHERE incident_id = $1 AND kind = $2 \ + ORDER BY created_at", + ) + .bind::(incident.id) + .bind::(KIND_INCIDENT_OPEN) + .get_results(&mut conn) + .await + .expect("rows"); + assert_eq!(rows.len(), 2, "two open rows: original + escalation"); + assert!(rows[0].is_delivered, "original is delivered"); + assert!(!rows[1].is_delivered, "escalation is freshly enqueued"); + assert!( + rows[1].delay_secs.abs() <= 5.0, + "escalation deliver_after is ~now (Critical bypasses delay); got {}s", + rows[1].delay_secs, + ); + + // incident.escalated_at is now set. + let refreshed = Incident::list_for_server(&mut conn, server_id, false, 10) + .await + .expect("list") + .into_iter() + .next() + .expect("incident"); + assert!( + refreshed.escalated_at.is_some(), + "escalated_at recorded after first Critical escalation" + ); + }) + .await +} + +#[tokio::test(flavor = "multi_thread")] +async fn repeated_critical_joins_do_not_re_fire_escalation() { + // escalated_at is the latch: the first Critical-after-delivered-open + // fires a fresh "incident opened" message, and every subsequent + // Critical contributor is silent. Otherwise a flapping check could + // repeatedly re-page operators long after the incident is well-known. + commons_tests::db::TestDb::run(async |mut conn, _| { + let server_id = + insert_grouped_server(&mut conn, "http://crit-no-double-escalate.invalid/").await; + save_event( + &mut conn, + server_id, + "warmup", + Severity::Error, + true, + "boom", + ) + .await; + let incident = Incident::list_for_server(&mut conn, server_id, false, 10) + .await + .expect("list") + .into_iter() + .next() + .expect("incident"); + let pending = pending_open(&mut conn, incident.id).await; + SlackOutbox::mark_delivered(&mut conn, pending.id, "ok") + .await + .expect("mark delivered"); + + // First Critical → escalation fires. + save_event( + &mut conn, + server_id, + "crit-1", + Severity::Critical, + true, + "red alert", + ) + .await; + // Second Critical → no new outbox row. + save_event( + &mut conn, + server_id, + "crit-2", + Severity::Critical, + true, + "also red", + ) + .await; + #[derive(QueryableByName)] struct Count { #[diesel(sql_type = sql_types::BigInt)] @@ -358,9 +456,10 @@ async fn critical_join_does_not_disturb_already_delivered_open() { .get_result(&mut conn) .await .expect("count"); - assert_eq!(count.n, 1); - let row = pending_open(&mut conn, incident.id).await; - assert!(row.is_delivered); + assert_eq!( + count.n, 2, + "original + one escalation; second Critical is silent" + ); }) .await }