From 7f0da038c77629767a1a7eed6acddf7bc71a05e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Sat, 30 May 2026 17:56:23 +1200 Subject: [PATCH 1/3] feat(alertd/checks): port error-notification alerts to checks Add a shared row-fetching helper and the five recent-error checks ported from the certificate-notification, ips, patient-communications, report, and fhir YAML alerts. Each is central-only, skips when the DB is unavailable, and fails when any matching row exists within the lookback window. Co-authored-by: Claude --- crates/alertd/src/doctor/checks.rs | 85 +++++++++++++++++ .../checks/certificate_notification_errors.rs | 61 ++++++++++++ .../src/doctor/checks/fhir_job_errors.rs | 63 ++++++++++++ crates/alertd/src/doctor/checks/ips_errors.rs | 60 ++++++++++++ .../checks/patient_communication_errors.rs | 61 ++++++++++++ .../alertd/src/doctor/checks/report_errors.rs | 61 ++++++++++++ crates/alertd/src/doctor/checks/util.rs | 95 +++++++++++++++++++ 7 files changed, 486 insertions(+) create mode 100644 crates/alertd/src/doctor/checks/certificate_notification_errors.rs create mode 100644 crates/alertd/src/doctor/checks/fhir_job_errors.rs create mode 100644 crates/alertd/src/doctor/checks/ips_errors.rs create mode 100644 crates/alertd/src/doctor/checks/patient_communication_errors.rs create mode 100644 crates/alertd/src/doctor/checks/report_errors.rs create mode 100644 crates/alertd/src/doctor/checks/util.rs diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs index e998bae8..52271144 100644 --- a/crates/alertd/src/doctor/checks.rs +++ b/crates/alertd/src/doctor/checks.rs @@ -13,17 +13,24 @@ use bestool_tamanu::{ApiServerKind, config::TamanuConfig}; use super::check::Check; +pub mod util; + pub mod caddy_version; +pub mod certificate_notification_errors; pub mod db_connect; pub mod db_version; pub mod disk_free; pub mod external_users; +pub mod fhir_job_errors; pub mod fhir_jobs; pub mod http_errors; +pub mod ips_errors; pub mod kopia_backup; pub mod load; pub mod memory; pub mod migrations; +pub mod patient_communication_errors; +pub mod report_errors; pub mod server_id; pub mod sync_sessions; pub mod tailscale; @@ -143,5 +150,83 @@ pub fn all() -> Vec { entry!("sync_sessions", sync_sessions), entry!("fhir_jobs", fhir_jobs), entry!("kopia_backup", kopia_backup), + entry!( + "certificate_notification_errors", + certificate_notification_errors + ), + entry!("ips_errors", ips_errors), + entry!("patient_communication_errors", patient_communication_errors), + entry!("report_errors", report_errors), + entry!("fhir_job_errors", fhir_job_errors), ] } + +#[cfg(test)] +pub mod test_support { + //! Helpers for DB-backed check tests. + //! + //! Each migrated check is central-only and DB-backed, so its tests need a + //! [`CheckContext`] wired to one of the local `tamanu-central` / + //! `tamanu-facility` databases. These connect lazily and return `None` when + //! the DB is unavailable so the suite degrades gracefully off-CI. + + use std::sync::Arc; + + use node_semver::Version; + + use bestool_tamanu::{ApiServerKind, config::TamanuConfig}; + + use super::CheckContext; + + fn central_config() -> TamanuConfig { + serde_json::from_value(serde_json::json!({ + "db": { "name": "tamanu-central", "username": "u", "password": "p" }, + })) + .expect("central test config should parse") + } + + fn facility_config() -> TamanuConfig { + serde_json::from_value(serde_json::json!({ + "db": { "name": "tamanu-facility", "username": "u", "password": "p" }, + "serverFacilityIds": ["facility-1"], + })) + .expect("facility test config should parse") + } + + async fn connect(db_name: &str) -> Option> { + let url = format!("postgresql://localhost/{db_name}"); + match bestool_postgres::pool::connect_one(&url, "bestool-alertd-test").await { + Ok(client) => Some(Arc::new(client)), + Err(_) => None, + } + } + + /// A central [`CheckContext`] backed by `tamanu-central`, or `None` if that + /// DB can't be reached. + pub async fn central_ctx() -> Option { + let db = connect("tamanu-central").await?; + Some(CheckContext { + tamanu_version: Version::parse("0.0.0").unwrap(), + tamanu_root: std::path::PathBuf::from("/nonexistent"), + config: Arc::new(central_config()), + kind: ApiServerKind::Central, + database_url: "postgresql://localhost/tamanu-central".into(), + db: Some(db), + http_client: reqwest::Client::new(), + }) + } + + /// A facility [`CheckContext`] with no DB; central-only checks skip on it + /// before ever touching the database. + pub fn facility_ctx() -> CheckContext { + CheckContext { + tamanu_version: Version::parse("0.0.0").unwrap(), + tamanu_root: std::path::PathBuf::from("/nonexistent"), + config: Arc::new(facility_config()), + kind: ApiServerKind::Facility, + database_url: "postgresql://localhost/tamanu-facility".into(), + db: None, + http_client: reqwest::Client::new(), + } + } +} diff --git a/crates/alertd/src/doctor/checks/certificate_notification_errors.rs b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs new file mode 100644 index 00000000..4beb789c --- /dev/null +++ b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs @@ -0,0 +1,61 @@ +//! Certificate notifications that errored within the lookback window. +//! +//! Ported from the `certificate-notification-error` YAML alert. + +use jiff::{Timestamp, ToSpan}; + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "certificate_notification_errors"; +const SQL: &str = "SELECT * FROM certificate_notifications \ + WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC"; + +// Lookback window for recent-error checks; revisit in the Phase 4 threshold +// review. +const LOOKBACK_HOURS: i64 = 1; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let since = Timestamp::now() - LOOKBACK_HOURS.hours(); + fail_if_any_rows( + client, + "certificate_notification_errors", + "no recent certificate notification errors", + "certificate notification errors: ", + SQL, + &[&since], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "certificate_notification_errors"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/fhir_job_errors.rs b/crates/alertd/src/doctor/checks/fhir_job_errors.rs new file mode 100644 index 00000000..8f1104cb --- /dev/null +++ b/crates/alertd/src/doctor/checks/fhir_job_errors.rs @@ -0,0 +1,63 @@ +//! FHIR jobs that recorded an error within the lookback window. +//! +//! Ported from the `fhir-error` YAML alert. Distinct from `fhir_jobs`, which +//! measures live queue depth: this surfaces individual jobs that errored +//! recently. + +use jiff::{Timestamp, ToSpan}; + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "fhir_job_errors"; +const SQL: &str = + "SELECT * FROM fhir.jobs WHERE error IS NOT NULL AND created_at > $1 ORDER BY created_at DESC"; + +// Lookback window for recent-error checks; revisit in the Phase 4 threshold +// review. +const LOOKBACK_HOURS: i64 = 1; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let since = Timestamp::now() - LOOKBACK_HOURS.hours(); + fail_if_any_rows( + client, + "fhir_job_errors", + "no recent FHIR job errors", + "FHIR job errors: ", + SQL, + &[&since], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "fhir_job_errors"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/ips_errors.rs b/crates/alertd/src/doctor/checks/ips_errors.rs new file mode 100644 index 00000000..b7f933d4 --- /dev/null +++ b/crates/alertd/src/doctor/checks/ips_errors.rs @@ -0,0 +1,60 @@ +//! IPS requests that errored within the lookback window. +//! +//! Ported from the `ips-error` YAML alert. + +use jiff::{Timestamp, ToSpan}; + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "ips_errors"; +const SQL: &str = "SELECT * FROM ips_requests WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC"; + +// Lookback window for recent-error checks; revisit in the Phase 4 threshold +// review. +const LOOKBACK_HOURS: i64 = 1; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let since = Timestamp::now() - LOOKBACK_HOURS.hours(); + fail_if_any_rows( + client, + "ips_errors", + "no recent IPS request errors", + "IPS request errors: ", + SQL, + &[&since], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "ips_errors"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/patient_communication_errors.rs b/crates/alertd/src/doctor/checks/patient_communication_errors.rs new file mode 100644 index 00000000..1f289ab9 --- /dev/null +++ b/crates/alertd/src/doctor/checks/patient_communication_errors.rs @@ -0,0 +1,61 @@ +//! Patient communications that errored within the lookback window. +//! +//! Ported from the `patient-communications-error` YAML alert. + +use jiff::{Timestamp, ToSpan}; + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "patient_communication_errors"; +const SQL: &str = "SELECT * FROM patient_communications \ + WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC"; + +// Lookback window for recent-error checks; revisit in the Phase 4 threshold +// review. +const LOOKBACK_HOURS: i64 = 1; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let since = Timestamp::now() - LOOKBACK_HOURS.hours(); + fail_if_any_rows( + client, + "patient_communication_errors", + "no recent patient communication errors", + "patient communication errors: ", + SQL, + &[&since], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "patient_communication_errors"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/report_errors.rs b/crates/alertd/src/doctor/checks/report_errors.rs new file mode 100644 index 00000000..f508fc2d --- /dev/null +++ b/crates/alertd/src/doctor/checks/report_errors.rs @@ -0,0 +1,61 @@ +//! Report requests that errored within the lookback window. +//! +//! Ported from the `report-error` YAML alert. + +use jiff::{Timestamp, ToSpan}; + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "report_errors"; +const SQL: &str = "SELECT * FROM report_requests \ + WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC"; + +// Lookback window for recent-error checks; revisit in the Phase 4 threshold +// review. +const LOOKBACK_HOURS: i64 = 1; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let since = Timestamp::now() - LOOKBACK_HOURS.hours(); + fail_if_any_rows( + client, + "report_errors", + "no recent report errors", + "report errors: ", + SQL, + &[&since], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "report_errors"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/util.rs b/crates/alertd/src/doctor/checks/util.rs new file mode 100644 index 00000000..22bc9f97 --- /dev/null +++ b/crates/alertd/src/doctor/checks/util.rs @@ -0,0 +1,95 @@ +//! Shared helpers for checks that port the old YAML SQL alerts. +//! +//! Each migrated alert fails when its query returns any rows and attaches the +//! offending rows (capped) to `details`. To avoid the generic tokio-postgres +//! row→JSON conversion and to bound memory, every query is wrapped so Postgres +//! returns one JSONB column per row, capped just past the reporting limit. + +use std::sync::Arc; + +use serde_json::Value; +use tokio_postgres::{Client as PgClient, types::ToSql}; + +use super::fmt_db_error; +use crate::doctor::check::Check; + +/// Rows reported in `details` are capped here; one extra row is fetched to +/// detect truncation. +const REPORT_CAP: usize = 100; +const FETCH_CAP: usize = REPORT_CAP + 1; + +/// Wrap an alert's SQL so Postgres hands back one JSONB column (`row`) per +/// matching row, capped at [`FETCH_CAP`]. +fn wrap(sql: &str) -> String { + format!("SELECT to_jsonb(sub) AS row FROM ( {sql} ) sub LIMIT {FETCH_CAP}") +} + +/// Outcome of running one wrapped alert query: the rows (capped at +/// [`REPORT_CAP`]) and whether more existed than were reported. +pub struct RowSet { + pub rows: Vec, + pub truncated: bool, +} + +impl RowSet { + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Number to report: the exact count, or `"100+"` when truncated. + pub fn count(&self) -> Value { + if self.truncated { + Value::from(format!("{REPORT_CAP}+")) + } else { + Value::from(self.rows.len()) + } + } +} + +/// Run a wrapped alert query and collect its rows. The `to_jsonb` wrapping is +/// applied here, so callers pass the original alert SQL. +pub async fn fetch_rows( + client: &Arc, + sql: &str, + params: &[&(dyn ToSql + Sync)], +) -> Result { + let wrapped = wrap(sql); + let raw = client.query(&wrapped, params).await?; + let truncated = raw.len() > REPORT_CAP; + let rows = raw + .into_iter() + .take(REPORT_CAP) + .map(|r| r.get::<_, Value>("row")) + .collect(); + Ok(RowSet { rows, truncated }) +} + +/// Run a single wrapped alert query: fail (with capped rows + count) if it +/// returns any rows, else pass. +/// +/// `summary_pass` is the headline shown when nothing matched; +/// `summary_fail_prefix` is prepended to the count when rows are found. +pub async fn fail_if_any_rows( + client: &Arc, + name: &'static str, + summary_pass: &str, + summary_fail_prefix: &str, + sql: &str, + params: &[&(dyn ToSql + Sync)], +) -> Check { + match fetch_rows(client, sql, params).await { + Ok(set) if set.is_empty() => Check::pass(name, summary_pass.to_string()), + Ok(set) => { + let count = set.count(); + Check::fail( + name, + format!("{summary_fail_prefix}{count}"), + format!("{} matching row(s)", count), + ) + .with_detail("rows", Value::Array(set.rows)) + .with_detail("truncated", set.truncated) + .with_detail("count", count) + } + Err(err) => Check::fail(name, "query failed", fmt_db_error(&err)), + } +} From ce97e3931590c251b12cba84f73fdfaebb2d1745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Sat, 30 May 2026 17:57:10 +1200 Subject: [PATCH 2/3] feat(alertd/checks): port sync alerts to checks (closes TODO sync_lookup) Add sync_session_errors (mobile + server, benign-error exclusions baked in), sync_facility_stale (not-syncing + no-recent-success), sync_lookup (lookup-table staleness, closes TODO #8), and sync_restart_loop. All central-only, skip when the DB is unavailable, and fail on any matching rows. Co-authored-by: Claude --- crates/alertd/src/doctor/checks.rs | 8 ++ .../src/doctor/checks/sync_facility_stale.rs | 99 +++++++++++++++++ .../alertd/src/doctor/checks/sync_lookup.rs | 57 ++++++++++ .../src/doctor/checks/sync_restart_loop.rs | 59 +++++++++++ .../src/doctor/checks/sync_session_errors.rs | 100 ++++++++++++++++++ 5 files changed, 323 insertions(+) create mode 100644 crates/alertd/src/doctor/checks/sync_facility_stale.rs create mode 100644 crates/alertd/src/doctor/checks/sync_lookup.rs create mode 100644 crates/alertd/src/doctor/checks/sync_restart_loop.rs create mode 100644 crates/alertd/src/doctor/checks/sync_session_errors.rs diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs index 52271144..26570c51 100644 --- a/crates/alertd/src/doctor/checks.rs +++ b/crates/alertd/src/doctor/checks.rs @@ -32,6 +32,10 @@ pub mod migrations; pub mod patient_communication_errors; pub mod report_errors; pub mod server_id; +pub mod sync_facility_stale; +pub mod sync_lookup; +pub mod sync_restart_loop; +pub mod sync_session_errors; pub mod sync_sessions; pub mod tailscale; pub mod tamanu_found; @@ -158,6 +162,10 @@ pub fn all() -> Vec { entry!("patient_communication_errors", patient_communication_errors), entry!("report_errors", report_errors), entry!("fhir_job_errors", fhir_job_errors), + entry!("sync_session_errors", sync_session_errors), + entry!("sync_facility_stale", sync_facility_stale), + entry!("sync_lookup", sync_lookup), + entry!("sync_restart_loop", sync_restart_loop), ] } diff --git a/crates/alertd/src/doctor/checks/sync_facility_stale.rs b/crates/alertd/src/doctor/checks/sync_facility_stale.rs new file mode 100644 index 00000000..b8a11589 --- /dev/null +++ b/crates/alertd/src/doctor/checks/sync_facility_stale.rs @@ -0,0 +1,99 @@ +//! Facilities whose sync has gone stale. +//! +//! Consolidates the `sync-facility-not-syncing` and `sync-no-sessions` YAML +//! alerts: the first flags facilities that synced in the last 48h but have had +//! no completion in the last 30m; the second flags facilities whose last +//! successful sync was over an hour ago. + +use serde_json::Value; + +use super::{CheckContext, util::fetch_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "sync_facility_stale"; + +const NOT_SYNCING_SQL: &str = "with sync_sessions_with_facility_id as ( \ + select created_at, completed_at, \ + jsonb_array_elements_text(parameters->'facilityIds') as facility_id \ + from sync_sessions where parameters->>'isMobile' <> 'true' \ + ) \ + select distinct facility_id from sync_sessions_with_facility_id \ + where created_at > current_timestamp - '48 hours'::interval \ + except \ + select facility_id from sync_sessions_with_facility_id \ + where completed_at > current_timestamp - '30 minutes'::interval \ + group by facility_id order by facility_id"; + +const NO_RECENT_SUCCESS_SQL: &str = "SELECT facility_id, last_successful_sync FROM ( \ + SELECT facility_id, max(completed_at) as last_successful_sync FROM ( \ + SELECT jsonb_array_elements_text(parameters->'facilityIds') as facility_id, completed_at \ + FROM sync_sessions WHERE errors IS NULL \ + ) AS successful_syncs GROUP BY facility_id \ + ) AS last_successful_facility_syncs \ + WHERE last_successful_sync < now() - interval '1 hour'"; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let not_syncing = match fetch_rows(client, NOT_SYNCING_SQL, &[]).await { + Ok(set) => set, + Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + }; + let no_recent_success = match fetch_rows(client, NO_RECENT_SUCCESS_SQL, &[]).await { + Ok(set) => set, + Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + }; + + if not_syncing.is_empty() && no_recent_success.is_empty() { + return Check::pass(NAME, "all facilities syncing"); + } + + let (not_syncing_count, not_syncing_truncated) = (not_syncing.count(), not_syncing.truncated); + let (no_recent_count, no_recent_truncated) = + (no_recent_success.count(), no_recent_success.truncated); + + let check = Check::fail( + NAME, + format!( + "stale sync: {not_syncing_count} not syncing, {no_recent_count} with no recent success" + ), + "facility sync stale", + ); + check + .with_detail("not_syncing", Value::Array(not_syncing.rows)) + .with_detail("not_syncing_count", not_syncing_count) + .with_detail("not_syncing_truncated", not_syncing_truncated) + .with_detail("no_recent_success", Value::Array(no_recent_success.rows)) + .with_detail("no_recent_success_count", no_recent_count) + .with_detail("no_recent_success_truncated", no_recent_truncated) +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "sync_facility_stale"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/sync_lookup.rs b/crates/alertd/src/doctor/checks/sync_lookup.rs new file mode 100644 index 00000000..ae8015ec --- /dev/null +++ b/crates/alertd/src/doctor/checks/sync_lookup.rs @@ -0,0 +1,57 @@ +//! Lookup table update staleness. +//! +//! Ported from the `sync-lookup-stale` YAML alert (closes TODO #8). Fails when +//! the central server hasn't recorded a successful lookup-table update in over +//! an hour. + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "sync_lookup"; +const SQL: &str = "SELECT key, value AS last_sync_tick, updated_at::text AS last_updated, \ + (now() - updated_at)::text AS time_since_update FROM local_system_facts \ + WHERE key = 'lastSuccessfulLookupTableUpdate' AND updated_at < now() - interval '1 hour'"; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + fail_if_any_rows( + client, + NAME, + "lookup table up to date", + "lookup table stale: ", + SQL, + &[], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "sync_lookup"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/sync_restart_loop.rs b/crates/alertd/src/doctor/checks/sync_restart_loop.rs new file mode 100644 index 00000000..e014983b --- /dev/null +++ b/crates/alertd/src/doctor/checks/sync_restart_loop.rs @@ -0,0 +1,59 @@ +//! Facilities stuck in a sync restart loop. +//! +//! Ported from the `sync-restart-loop` YAML alert. Fails when a facility has +//! accumulated 10 or more `snapshot-for-pushing` sync errors in the last hour, +//! which indicates the sync is repeatedly restarting rather than progressing. + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "sync_restart_loop"; +const SQL: &str = "SELECT jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \ + COUNT(*) AS error_count FROM sync_sessions \ + WHERE created_at > now() - interval '1 hour' AND errors IS NOT NULL \ + AND cardinality(errors) = 1 AND errors[1] LIKE '%snapshot-for-pushing%' \ + GROUP BY facility_id HAVING COUNT(*) >= 10 ORDER BY error_count DESC"; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + fail_if_any_rows( + client, + NAME, + "no sync restart loops", + "facilities in sync restart loop: ", + SQL, + &[], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "sync_restart_loop"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} diff --git a/crates/alertd/src/doctor/checks/sync_session_errors.rs b/crates/alertd/src/doctor/checks/sync_session_errors.rs new file mode 100644 index 00000000..60a3e5b2 --- /dev/null +++ b/crates/alertd/src/doctor/checks/sync_session_errors.rs @@ -0,0 +1,100 @@ +//! Recent sync-session errors, split mobile vs server, with benign-error +//! exclusions baked into the SQL. +//! +//! Ported from the `sync-errors-mobile` and `sync-errors-server` YAML alerts +//! into one check. The original alerts ran every minute, so the window is a +//! tight `updated_at > now() - interval '1 minute'`; the sweep runs every 60s, +//! so this still catches each error once. Both window and exclusions are +//! Phase-4-tunable. + +use serde_json::Value; + +use super::{CheckContext, util::fetch_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "sync_session_errors"; + +const MOBILE_SQL: &str = "SELECT id, errors::text, \ + jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \ + created_at::text AS created, (completed_at - created_at)::text AS duration \ + FROM sync_sessions \ + WHERE updated_at > now() - interval '1 minute' \ + AND parameters->>'isMobile' = 'true' \ + AND errors IS NOT NULL \ + AND errors <> ARRAY['Session marked as completed due to its device reconnecting'] \ + AND errors <> ARRAY['could not serialize access due to concurrent update'] \ + ORDER BY created_at DESC"; + +const SERVER_SQL: &str = "SELECT id, errors::text, \ + jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \ + created_at::text AS created, (completed_at - created_at)::text AS duration \ + FROM sync_sessions \ + WHERE updated_at > now() - interval '1 minute' \ + AND parameters->>'isMobile' IS DISTINCT FROM 'true' \ + AND errors IS NOT NULL \ + AND errors <> ARRAY['could not serialize access due to concurrent update'] \ + AND NOT (cardinality(errors) = 1 AND errors[1] LIKE '%snapshot-for-pushing%') \ + ORDER BY created_at DESC"; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + let mobile = match fetch_rows(client, MOBILE_SQL, &[]).await { + Ok(set) => set, + Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + }; + let server = match fetch_rows(client, SERVER_SQL, &[]).await { + Ok(set) => set, + Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + }; + + if mobile.is_empty() && server.is_empty() { + return Check::pass(NAME, "no recent sync session errors"); + } + + let (mobile_count, mobile_truncated) = (mobile.count(), mobile.truncated); + let (server_count, server_truncated) = (server.count(), server.truncated); + + let check = Check::fail( + NAME, + format!("sync session errors: {mobile_count} mobile, {server_count} server"), + "recent sync session error(s)", + ); + check + .with_detail("mobile", Value::Array(mobile.rows)) + .with_detail("mobile_count", mobile_count) + .with_detail("mobile_truncated", mobile_truncated) + .with_detail("server", Value::Array(server.rows)) + .with_detail("server_count", server_count) + .with_detail("server_truncated", server_truncated) +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "sync_session_errors"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +} From ae8de1c24ef59332bee08e730349f85d2df2e3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Sat, 30 May 2026 17:57:47 +1200 Subject: [PATCH 3/3] feat(alertd/checks): port fhir unresolvable-service-requests alert to a check Add fhir_service_requests_unresolved, central-only, skipping when the DB is unavailable and failing when a lab-linked FHIR service request has stayed unresolved for over an hour. Co-authored-by: Claude --- crates/alertd/src/doctor/checks.rs | 5 ++ .../fhir_service_requests_unresolved.rs | 58 +++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs index 26570c51..6a4a7cdc 100644 --- a/crates/alertd/src/doctor/checks.rs +++ b/crates/alertd/src/doctor/checks.rs @@ -23,6 +23,7 @@ pub mod disk_free; pub mod external_users; pub mod fhir_job_errors; pub mod fhir_jobs; +pub mod fhir_service_requests_unresolved; pub mod http_errors; pub mod ips_errors; pub mod kopia_backup; @@ -166,6 +167,10 @@ pub fn all() -> Vec { entry!("sync_facility_stale", sync_facility_stale), entry!("sync_lookup", sync_lookup), entry!("sync_restart_loop", sync_restart_loop), + entry!( + "fhir_service_requests_unresolved", + fhir_service_requests_unresolved + ), ] } diff --git a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs new file mode 100644 index 00000000..dece3729 --- /dev/null +++ b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs @@ -0,0 +1,58 @@ +//! FHIR service requests that have stayed unresolved for too long. +//! +//! Ported from the `fhir-unresolvable-service-requests-labs` YAML alert. Fails +//! when a FHIR service request linked to a lab request has been unresolved for +//! over an hour. + +use super::{CheckContext, util::fail_if_any_rows}; +use crate::doctor::check::Check; +use bestool_tamanu::ApiServerKind; + +const NAME: &str = "fhir_service_requests_unresolved"; +const SQL: &str = "SELECT lr.display_id AS lab_request_id, \ + ROUND(EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60)::text AS duration_minutes \ + FROM fhir.service_requests fsr JOIN lab_requests lr ON fsr.upstream_id = lr.id \ + WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours'"; + +pub async fn run(ctx: CheckContext) -> Check { + if ctx.kind != ApiServerKind::Central { + return Check::skip( + NAME, + "not applicable on facility server", + "central-only check", + ); + } + let Some(client) = ctx.db.as_ref() else { + return Check::skip(NAME, "no DB connection", "db unavailable"); + }; + + fail_if_any_rows( + client, + NAME, + "no unresolved FHIR service requests", + "unresolved FHIR service requests: ", + SQL, + &[], + ) + .await +} + +#[cfg(test)] +mod tests { + use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + + #[tokio::test] + async fn runs_against_central() { + let Some(ctx) = central_ctx().await else { + return; + }; + let check = super::run(ctx).await; + assert_eq!(check.name, "fhir_service_requests_unresolved"); + } + + #[tokio::test] + async fn skips_on_facility() { + let check = super::run(facility_ctx()).await; + assert!(check.status.is_skip()); + } +}