Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use super::{CheckContext, util::tiered_rows_check};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

Expand All @@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check {
};

let since = Timestamp::now() - LOOKBACK_HOURS.hours();
fail_if_any_rows(
tiered_rows_check(
client,
"certificate_notification_errors",
"no recent certificate notification errors",
"certificate notification errors: ",
SQL,
&[&since],
1,
10,
)
.await
}
Expand Down
17 changes: 13 additions & 4 deletions crates/alertd/src/doctor/checks/db_connect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ use std::time::Instant;
use super::CheckContext;
use crate::doctor::check::Check;

/// Connect latency above which the DB is treated as degraded.
const WARN_LATENCY_MS: u64 = 1000;

pub async fn run(ctx: CheckContext) -> Check {
let host = ctx
.config
Expand All @@ -21,10 +24,16 @@ pub async fn run(ctx: CheckContext) -> Check {
tokio::spawn(async move {
let _ = conn.await;
});
Check::pass(
"db_connect",
format!("postgres at {host}/{name} ({latency_ms}ms)"),
)
let summary = format!("postgres at {host}/{name} ({latency_ms}ms)");
if latency_ms > WARN_LATENCY_MS {
Check::warning(
"db_connect",
summary,
format!("connect latency {latency_ms}ms over {WARN_LATENCY_MS}ms"),
)
} else {
Check::pass("db_connect", summary)
}
}
Err(err) => Check::fail(
"db_connect",
Expand Down
6 changes: 4 additions & 2 deletions crates/alertd/src/doctor/checks/fhir_job_errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use super::{CheckContext, util::tiered_rows_check};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

Expand All @@ -29,13 +29,15 @@ pub async fn run(ctx: CheckContext) -> Check {
};

let since = Timestamp::now() - LOOKBACK_HOURS.hours();
fail_if_any_rows(
tiered_rows_check(
client,
"fhir_job_errors",
"no recent FHIR job errors",
"FHIR job errors: ",
SQL,
&[&since],
1,
10,
)
.await
}
Expand Down
73 changes: 59 additions & 14 deletions crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
//! FHIR service requests that have stayed unresolved for too long.
//!
//! Fails when a FHIR service request linked to a lab request has been
//! unresolved for over an hour.
//! Lists FHIR service requests linked to a lab request that have been
//! unresolved for over an hour, tiering on the longest outstanding duration:
//! WARN past 1h, FAIL past 6h.

use super::{CheckContext, util::fail_if_any_rows};
use super::{CheckContext, fmt_db_error};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;
use serde_json::{Value, json};

const NAME: &str = "fhir_service_requests_unresolved";

const WARN_MINUTES: f64 = 60.0;
const FAIL_MINUTES: f64 = 6.0 * 60.0;

/// Selects unresolved FHIR service requests linked to a lab request whose
/// last update is more than one hour old, longest-outstanding first.
///
/// `duration_minutes` is cast to `float8` on purpose: since PostgreSQL 14
/// `EXTRACT` yields `numeric`, which does not decode into the `f64` that the
/// row loop reads. Without the cast the read fails, the loop's `0.0` fallback
/// kicks in, and every row lands below both tiers — the check would report a
/// pass while requests are still outstanding.
const SQL: &str = "SELECT lr.display_id AS lab_request_id, \
    (EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60)::float8 AS duration_minutes \
    FROM fhir.service_requests fsr JOIN lab_requests lr ON fsr.upstream_id = lr.id \
    WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours' \
    ORDER BY duration_minutes DESC";

pub async fn run(ctx: CheckContext) -> Check {
if ctx.kind != ApiServerKind::Central {
Expand All @@ -25,19 +32,53 @@ pub async fn run(ctx: CheckContext) -> Check {
return Check::skip(NAME, "no DB connection", "db unavailable");
};

fail_if_any_rows(
client,
NAME,
"no unresolved FHIR service requests",
"unresolved FHIR service requests: ",
SQL,
&[],
)
.await
let rows = match client.query(SQL, &[]).await {
Ok(r) => r,
Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)),
};

if rows.is_empty() {
return Check::pass(NAME, "no unresolved FHIR service requests");
}

let mut warn = Vec::new();
let mut fail = Vec::new();
for row in &rows {
let lab_request_id: Option<String> = row.try_get("lab_request_id").ok();
let minutes: f64 = row.try_get("duration_minutes").unwrap_or(0.0);
let entry = json!({
"lab_request_id": lab_request_id,
"duration_minutes": minutes.round() as i64,
});
if minutes > FAIL_MINUTES {
fail.push(entry);
} else if minutes > WARN_MINUTES {
warn.push(entry);
}
}

if warn.is_empty() && fail.is_empty() {
return Check::pass(NAME, "no unresolved FHIR service requests");
}

let summary = format!(
"unresolved FHIR service requests: {} over 6h, {} over 1h",
fail.len(),
warn.len()
);
let check = if fail.is_empty() {
Check::warning(NAME, summary, "unresolved FHIR service request(s)")
} else {
Check::fail(NAME, summary, "unresolved FHIR service request(s)")
};
check
.with_detail("fail", Value::Array(fail))
.with_detail("warn", Value::Array(warn))
}

#[cfg(test)]
mod tests {
use crate::doctor::check::CheckStatus;
use crate::doctor::checks::test_support::{central_ctx, facility_ctx};

#[tokio::test]
Expand All @@ -47,6 +88,10 @@ mod tests {
};
let check = super::run(ctx).await;
assert_eq!(check.name, "fhir_service_requests_unresolved");
assert!(matches!(
check.status,
CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_)
));
}

#[tokio::test]
Expand Down
6 changes: 4 additions & 2 deletions crates/alertd/src/doctor/checks/ips_errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use super::{CheckContext, util::tiered_rows_check};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

Expand All @@ -25,13 +25,15 @@ pub async fn run(ctx: CheckContext) -> Check {
};

let since = Timestamp::now() - LOOKBACK_HOURS.hours();
fail_if_any_rows(
tiered_rows_check(
client,
"ips_errors",
"no recent IPS request errors",
"IPS request errors: ",
SQL,
&[&since],
1,
10,
)
.await
}
Expand Down
19 changes: 19 additions & 0 deletions crates/alertd/src/doctor/checks/kopia_backup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use super::CheckContext;
use crate::doctor::check::Check;

const CHECK_NAME: &str = "kopia_backup";
const WARN_AGE_SECS: i64 = 12 * 60 * 60;
const FAIL_AGE_SECS: i64 = 24 * 60 * 60;

pub async fn run(_ctx: CheckContext) -> Check {
Expand Down Expand Up @@ -206,6 +207,12 @@ fn evaluate(snapshots: &[Snapshot], now: Timestamp) -> Check {
summary.clone(),
format!("no backup in {}", humanise_age(FAIL_AGE_SECS)),
)
} else if age_secs >= WARN_AGE_SECS {
Check::warning(
CHECK_NAME,
summary.clone(),
format!("no backup in {}", humanise_age(WARN_AGE_SECS)),
)
} else {
Check::pass(CHECK_NAME, summary)
};
Expand Down Expand Up @@ -300,6 +307,18 @@ mod tests {
assert!(matches!(check.status, CheckStatus::Pass), "{check:?}");
}

/// A single postgres snapshot taken 18 hours before `now` — past the 12h
/// warning age but short of the 24h failure age — must evaluate to a warning.
#[test]
fn warn_when_postgres_snapshot_between_12h_and_24h() {
    let now = Timestamp::from_second(20_000_000).unwrap();
    let taken_at = now - 18.hours();
    let snapshots = vec![snapshot(
        "/var/lib/postgresql/16/main",
        Some(taken_at),
        None,
    )];

    let outcome = evaluate(&snapshots, now);

    assert!(
        matches!(outcome.status, CheckStatus::Warning(_)),
        "{outcome:?}"
    );
}

#[test]
fn fail_when_postgres_snapshot_older_than_24h() {
let now = Timestamp::from_second(20_000_000).unwrap();
Expand Down
73 changes: 69 additions & 4 deletions crates/alertd/src/doctor/checks/load.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
use sysinfo::System;
use sysinfo::{CpuRefreshKind, RefreshKind, System};

use super::CheckContext;
use crate::doctor::check::Check;
use crate::doctor::check::{Check, CheckStatus};

/// Multiplier on the logical core count above which the 5-minute load average
/// is treated as a hard failure.
const FAIL_PER_CORE: f64 = 4.0;
/// Multiplier on the logical core count above which the 5-minute load average
/// is treated as a warning.
const WARN_PER_CORE: f64 = 1.5;

pub async fn run(_ctx: CheckContext) -> Check {
if cfg!(target_os = "windows") {
Expand All @@ -12,13 +19,71 @@ pub async fn run(_ctx: CheckContext) -> Check {
);
}

let sys =
System::new_with_specifics(RefreshKind::nothing().with_cpu(CpuRefreshKind::nothing()));
let cores = sys.cpus().len().max(1);

let load = System::load_average();
let summary = format!(
"load average: {:.2}, {:.2}, {:.2}",
"load average: {:.2}, {:.2}, {:.2} ({cores} cores)",
load.one, load.five, load.fifteen
);
Check::pass("load", summary)

let check = match tier(load.five, cores) {
CheckStatus::Fail(_) => Check::fail(
"load",
summary,
format!(
"5-min load {:.2} over {:.1}x cores ({cores})",
load.five, FAIL_PER_CORE
),
),
CheckStatus::Warning(_) => Check::warning(
"load",
summary,
format!(
"5-min load {:.2} over {:.1}x cores ({cores})",
load.five, WARN_PER_CORE
),
),
_ => Check::pass("load", summary),
};

check
.with_detail("one_min", load.one)
.with_detail("five_min", load.five)
.with_detail("fifteen_min", load.fifteen)
.with_detail("cores", cores)
}

/// Classifies the 5-minute load average relative to the logical core count.
///
/// Yields `Fail` when `five` is strictly above `FAIL_PER_CORE * cores`,
/// `Warning` when it is strictly above `WARN_PER_CORE * cores`, and `Pass`
/// otherwise. The message carried by `Fail`/`Warning` is left empty.
fn tier(five: f64, cores: usize) -> CheckStatus {
    let core_count = cores as f64;
    let fail_above = FAIL_PER_CORE * core_count;
    let warn_above = WARN_PER_CORE * core_count;

    // Check the stricter threshold first so a failing load is never
    // reported as a mere warning.
    if five > fail_above {
        return CheckStatus::Fail(String::new());
    }
    if five > warn_above {
        return CheckStatus::Warning(String::new());
    }
    CheckStatus::Pass
}

#[cfg(test)]
mod tests {
    use super::*;

    /// With four cores, values just either side of the warning and failure
    /// thresholds land in the expected tier.
    #[test]
    fn tier_boundaries() {
        let just_below_warn = tier(5.9, 4);
        let just_above_warn = tier(6.1, 4);
        let just_below_fail = tier(15.9, 4);
        let just_above_fail = tier(16.1, 4);

        assert!(matches!(just_below_warn, CheckStatus::Pass));
        assert!(matches!(just_above_warn, CheckStatus::Warning(_)));
        assert!(matches!(just_below_fail, CheckStatus::Warning(_)));
        assert!(matches!(just_above_fail, CheckStatus::Fail(_)));
    }

    /// On a single core the thresholds are the bare per-core multipliers.
    #[test]
    fn tier_single_core() {
        let calm = tier(1.4, 1);
        let busy = tier(1.6, 1);
        let overloaded = tier(4.1, 1);

        assert!(matches!(calm, CheckStatus::Pass));
        assert!(matches!(busy, CheckStatus::Warning(_)));
        assert!(matches!(overloaded, CheckStatus::Fail(_)));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use super::{CheckContext, util::tiered_rows_check};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

Expand All @@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check {
};

let since = Timestamp::now() - LOOKBACK_HOURS.hours();
fail_if_any_rows(
tiered_rows_check(
client,
"patient_communication_errors",
"no recent patient communication errors",
"patient communication errors: ",
SQL,
&[&since],
1,
10,
)
.await
}
Expand Down
6 changes: 4 additions & 2 deletions crates/alertd/src/doctor/checks/report_errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use super::{CheckContext, util::tiered_rows_check};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

Expand All @@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check {
};

let since = Timestamp::now() - LOOKBACK_HOURS.hours();
fail_if_any_rows(
tiered_rows_check(
client,
"report_errors",
"no recent report errors",
"report errors: ",
SQL,
&[&since],
1,
10,
)
.await
}
Expand Down
Loading
Loading