Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions crates/commons-types/src/issue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,7 @@ impl Severity {
}

#[derive(Debug, Clone, Copy, thiserror::Error)]
#[error(
"invalid severity; expected one of: critical, error, warning, info, debug"
)]
#[error("invalid severity; expected one of: critical, error, warning, info, debug")]
pub struct SeverityFromStringError;

impl std::str::FromStr for Severity {
Expand All @@ -75,9 +73,7 @@ impl std::str::FromStr for Severity {
match s.to_ascii_lowercase().as_ref() {
// Retired severities still parse — emergency/alert collapse into
// critical and notice collapses into info, matching the migration.
"emergency" | "emerg" | "panic" | "alert" | "critical" | "crit" => {
Ok(Self::Critical)
}
"emergency" | "emerg" | "panic" | "alert" | "critical" | "crit" => Ok(Self::Critical),
"error" | "err" => Ok(Self::Error),
"warning" | "warn" => Ok(Self::Warning),
"notice" | "info" | "informational" => Ok(Self::Info),
Expand Down
49 changes: 41 additions & 8 deletions crates/database/src/issues.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ pub struct Incident {
pub resolved_at: Option<Timestamp>,
pub resolved_by: Option<String>,
pub resolved_reason: Option<String>,
#[diesel(deserialize_as = jiff_diesel::NullableTimestamp, serialize_as = jiff_diesel::NullableTimestamp)]
pub escalated_at: Option<Timestamp>,
}

#[derive(Clone, Debug, Serialize, Deserialize, Queryable, Selectable, Associations)]
Expand Down Expand Up @@ -453,11 +455,35 @@ async fn re_evaluate_incident_membership(
if newly_opened {
enqueue_slack_open(conn, incident_id, server_group_id, issue).await?;
} else if issue.severity == Severity::Critical {
// Critical joining an existing incident whose open is
// still inside its holding window: pull the delivery
// forward to now. enqueue_slack_open already does this
// for newly-opened incidents.
accelerate_pending_open(conn, incident_id).await?;
// Two sub-cases when a Critical joins an existing incident:
// - The original open is still pending in the outbox →
// accelerate so the "incident opened" message lands
// immediately. No second message: the open hasn't been
// seen yet, so a fresh open would be redundant noise.
// - The original open has already shipped → enqueue a
// fresh open at Critical severity as the escalation
// signal. Gated on incidents.escalated_at IS NULL so
// repeated Critical joins (or a Critical leaving and
// rejoining) don't re-fire the message.
let accelerated = accelerate_pending_open(conn, incident_id).await?;
if !accelerated {
let escalated: Option<Incident> = diesel::update(
incidents::table
.filter(incidents::id.eq(incident_id))
.filter(incidents::escalated_at.is_null()),
)
.set(
incidents::escalated_at
.eq(jiff_diesel::Timestamp::from(transition_time)),
)
.returning(Incident::as_select())
.get_result(conn)
.await
.optional()?;
if escalated.is_some() {
enqueue_slack_open(conn, incident_id, server_group_id, issue).await?;
}
}
}
}
(true, _, true) => {
Expand Down Expand Up @@ -853,10 +879,17 @@ async fn enqueue_slack_open(
/// Affects only rows that are still in their delay window
/// (`delivered_at IS NULL`, `gave_up_at IS NULL`, `deliver_after > now`).
/// Already-shipped or cancelled rows are left alone.
async fn accelerate_pending_open(conn: &mut AsyncPgConnection, incident_id: Uuid) -> Result<()> {
///
/// Returns true if at least one row was accelerated. The caller uses
/// this to distinguish "open is still pending" (no extra Slack message
/// needed) from "open has already shipped" (fire an escalation open).
async fn accelerate_pending_open(
conn: &mut AsyncPgConnection,
incident_id: Uuid,
) -> Result<bool> {
use crate::schema::slack_outbox::dsl;
let now = Timestamp::now();
diesel::update(
let updated = diesel::update(
dsl::slack_outbox
.filter(dsl::incident_id.eq(incident_id))
.filter(dsl::kind.eq(crate::slack_outbox::KIND_INCIDENT_OPEN))
Expand All @@ -868,7 +901,7 @@ async fn accelerate_pending_open(conn: &mut AsyncPgConnection, incident_id: Uuid
.execute(conn)
.await
.map_err(AppError::from)?;
Ok(())
Ok(updated > 0)
}

async fn enqueue_slack_resolve(
Expand Down
1 change: 1 addition & 0 deletions crates/database/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ diesel::table! {
resolved_by -> Nullable<Text>,
resolved_reason -> Nullable<Text>,
server_group_id -> Uuid,
escalated_at -> Nullable<Timestamptz>,
}
}

Expand Down
123 changes: 111 additions & 12 deletions crates/database/tests/incident_severity_semantics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,12 @@ struct OpenInfo {
/// Seconds until (positive) or since (negative) `deliver_after`.
#[diesel(sql_type = sql_types::Double)]
delay_secs: f64,
#[diesel(sql_type = sql_types::Bool)]
is_delivered: bool,
}

async fn pending_open(conn: &mut diesel_async::AsyncPgConnection, incident_id: Uuid) -> OpenInfo {
sql_query(
"SELECT id, \
EXTRACT(EPOCH FROM (deliver_after - NOW()))::float8 AS delay_secs, \
delivered_at IS NOT NULL AS is_delivered \
EXTRACT(EPOCH FROM (deliver_after - NOW()))::float8 AS delay_secs \
FROM slack_outbox WHERE incident_id = $1 AND kind = $2 LIMIT 1",
)
.bind::<sql_types::Uuid, _>(incident_id)
Expand Down Expand Up @@ -305,7 +302,12 @@ async fn critical_joining_existing_open_accelerates_pending_delivery() {
}

#[tokio::test(flavor = "multi_thread")]
async fn critical_join_does_not_disturb_already_delivered_open() {
async fn critical_join_after_delivered_open_fires_escalation_open() {
// Once the original "incident opened" Slack message has shipped, a
// Critical contributor joining is treated as an escalation: a fresh
// `incident_open` row is enqueued at Critical-severity timing so
// operators hear the change. Incident.escalated_at is stamped so we
// don't re-fire on every subsequent Critical join.
commons_tests::db::TestDb::run(async |mut conn, _| {
let server_id =
insert_grouped_server(&mut conn, "http://crit-after-delivered.invalid/").await;
Expand All @@ -329,9 +331,12 @@ async fn critical_join_does_not_disturb_already_delivered_open() {
SlackOutbox::mark_delivered(&mut conn, pending.id, "ok")
.await
.expect("mark delivered");
assert!(
incident.escalated_at.is_none(),
"precondition: not yet escalated"
);

// A Critical issue now joins. There's no pending open to accelerate,
// so the call is a no-op and crucially doesn't re-enqueue Slack noise.
// A Critical issue joins → escalation fires.
save_event(
&mut conn,
server_id,
Expand All @@ -342,8 +347,101 @@ async fn critical_join_does_not_disturb_already_delivered_open() {
)
.await;

// The (now-delivered) outbox row still has its delivered_at set; no
// duplicate incident_open rows have been created.
// Two open rows now: the original (delivered) and the escalation
// (pending, with delay ~0 because Critical bypasses the holding
// window).
#[derive(QueryableByName)]
struct OpenRow {
#[diesel(sql_type = sql_types::Double)]
delay_secs: f64,
#[diesel(sql_type = sql_types::Bool)]
is_delivered: bool,
}
let rows: Vec<OpenRow> = sql_query(
"SELECT EXTRACT(EPOCH FROM (deliver_after - NOW()))::float8 AS delay_secs, \
delivered_at IS NOT NULL AS is_delivered \
FROM slack_outbox WHERE incident_id = $1 AND kind = $2 \
ORDER BY created_at",
)
.bind::<sql_types::Uuid, _>(incident.id)
.bind::<sql_types::Text, _>(KIND_INCIDENT_OPEN)
.get_results(&mut conn)
.await
.expect("rows");
assert_eq!(rows.len(), 2, "two open rows: original + escalation");
assert!(rows[0].is_delivered, "original is delivered");
assert!(!rows[1].is_delivered, "escalation is freshly enqueued");
assert!(
rows[1].delay_secs.abs() <= 5.0,
"escalation deliver_after is ~now (Critical bypasses delay); got {}s",
rows[1].delay_secs,
);

// incident.escalated_at is now set.
let refreshed = Incident::list_for_server(&mut conn, server_id, false, 10)
.await
.expect("list")
.into_iter()
.next()
.expect("incident");
assert!(
refreshed.escalated_at.is_some(),
"escalated_at recorded after first Critical escalation"
);
})
.await
}

#[tokio::test(flavor = "multi_thread")]
async fn repeated_critical_joins_do_not_re_fire_escalation() {
// escalated_at is the latch: the first Critical-after-delivered-open
// fires a fresh "incident opened" message, and every subsequent
// Critical contributor is silent. Otherwise a flapping check could
// repeatedly re-page operators long after the incident is well-known.
commons_tests::db::TestDb::run(async |mut conn, _| {
let server_id =
insert_grouped_server(&mut conn, "http://crit-no-double-escalate.invalid/").await;
save_event(
&mut conn,
server_id,
"warmup",
Severity::Error,
true,
"boom",
)
.await;
let incident = Incident::list_for_server(&mut conn, server_id, false, 10)
.await
.expect("list")
.into_iter()
.next()
.expect("incident");
let pending = pending_open(&mut conn, incident.id).await;
SlackOutbox::mark_delivered(&mut conn, pending.id, "ok")
.await
.expect("mark delivered");

// First Critical → escalation fires.
save_event(
&mut conn,
server_id,
"crit-1",
Severity::Critical,
true,
"red alert",
)
.await;
// Second Critical → no new outbox row.
save_event(
&mut conn,
server_id,
"crit-2",
Severity::Critical,
true,
"also red",
)
.await;

#[derive(QueryableByName)]
struct Count {
#[diesel(sql_type = sql_types::BigInt)]
Expand All @@ -358,9 +456,10 @@ async fn critical_join_does_not_disturb_already_delivered_open() {
.get_result(&mut conn)
.await
.expect("count");
assert_eq!(count.n, 1);
let row = pending_open(&mut conn, incident.id).await;
assert!(row.is_delivered);
assert_eq!(
count.n, 2,
"original + one escalation; second Critical is silent"
);
})
.await
}
Expand Down
6 changes: 1 addition & 5 deletions crates/private-server/src/fns/healthchecks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,7 @@ pub async fn sample(

// Top-level extras — the column is always an object after our
// ingestion path strips reserved keys.
let status_extra = status
.extra
.as_object()
.cloned()
.unwrap_or_default();
let status_extra = status.extra.as_object().cloned().unwrap_or_default();

// Pull the failing-check entry out of the health array (any entry
// matching by name; we don't require unhealthy here so we still
Expand Down
14 changes: 8 additions & 6 deletions crates/private-server/src/fns/statuses.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,7 @@ pub struct StatusSnapshotData {
/// the explicit severity when one is known so operators see
/// all five levels, not just the legacy "warning/error" pair
/// derived from `healthy`.
pub check_severities:
std::collections::HashMap<String, commons_types::issue::Severity>,
pub check_severities: std::collections::HashMap<String, commons_types::issue::Severity>,
}

#[derive(Deserialize, ToSchema)]
Expand Down Expand Up @@ -314,8 +313,7 @@ pub async fn snapshot(
// Compute the per-unhealthy-check severity the rules engine would
// file at given this push. Healthy checks are omitted; the UI
// renders them with its 'passing' affordance regardless.
let check_severities =
compute_check_severities(&mut conn, args.server_id, &status).await?;
let check_severities = compute_check_severities(&mut conn, args.server_id, &status).await?;

Ok(Json(Some(StatusSnapshotData {
id: status.id,
Expand Down Expand Up @@ -357,8 +355,12 @@ async fn compute_check_severities(
let mut failing: Vec<(String, serde_json::Map<String, serde_json::Value>)> = Vec::new();
for raw in arr {
let Some(obj) = raw.as_object() else { continue };
let Some(check_name) = obj.get("check").and_then(|v| v.as_str()) else { continue };
let Some(healthy) = obj.get("healthy").and_then(|v| v.as_bool()) else { continue };
let Some(check_name) = obj.get("check").and_then(|v| v.as_str()) else {
continue;
};
let Some(healthy) = obj.get("healthy").and_then(|v| v.as_bool()) else {
continue;
};
if healthy {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE incidents
DROP COLUMN escalated_at;
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Marks the first time an incident escalated from a lower severity to
-- Critical. NULL ⇒ never escalated. Used to gate the one-shot "open"
-- Slack message that fires on the first Critical contributor joining
-- an already-shipped incident.
ALTER TABLE incidents
ADD COLUMN escalated_at TIMESTAMPTZ;