diff --git a/crates/alertd/src/doctor/checks/certificate_notification_errors.rs b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs index 450c2843..1d38d31e 100644 --- a/crates/alertd/src/doctor/checks/certificate_notification_errors.rs +++ b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs @@ -2,7 +2,7 @@ use jiff::{Timestamp, ToSpan}; -use super::{CheckContext, util::fail_if_any_rows}; +use super::{CheckContext, util::tiered_rows_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check { }; let since = Timestamp::now() - LOOKBACK_HOURS.hours(); - fail_if_any_rows( + tiered_rows_check( client, "certificate_notification_errors", "no recent certificate notification errors", "certificate notification errors: ", SQL, &[&since], + 1, + 10, ) .await } diff --git a/crates/alertd/src/doctor/checks/db_connect.rs b/crates/alertd/src/doctor/checks/db_connect.rs index a2b1cc56..d3d42447 100644 --- a/crates/alertd/src/doctor/checks/db_connect.rs +++ b/crates/alertd/src/doctor/checks/db_connect.rs @@ -3,6 +3,9 @@ use std::time::Instant; use super::CheckContext; use crate::doctor::check::Check; +/// Connect latency above which the DB is treated as degraded. +const WARN_LATENCY_MS: u64 = 1000; + pub async fn run(ctx: CheckContext) -> Check { let host = ctx .config @@ -21,10 +24,16 @@ pub async fn run(ctx: CheckContext) -> Check { tokio::spawn(async move { let _ = conn.await; }); - Check::pass( - "db_connect", - format!("postgres at {host}/{name} ({latency_ms}ms)"), - ) + let summary = format!("postgres at {host}/{name} ({latency_ms}ms)"); + if latency_ms > WARN_LATENCY_MS { + Check::warning( + "db_connect", + summary, + format!("connect latency {latency_ms}ms over {WARN_LATENCY_MS}ms"), + ) + } else { + Check::pass("db_connect", summary) + } } Err(err) => Check::fail( "db_connect", diff --git a/crates/alertd/src/doctor/checks/fhir_job_errors.rs b/crates/alertd/src/doctor/checks/fhir_job_errors.rs index 806e6706..5208c8fb 100644 --- a/crates/alertd/src/doctor/checks/fhir_job_errors.rs +++ b/crates/alertd/src/doctor/checks/fhir_job_errors.rs @@ -5,7 +5,7 @@ use jiff::{Timestamp, ToSpan}; -use super::{CheckContext, util::fail_if_any_rows}; +use super::{CheckContext, util::tiered_rows_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -29,13 +29,15 @@ pub async fn run(ctx: CheckContext) -> Check { }; let since = Timestamp::now() - LOOKBACK_HOURS.hours(); - fail_if_any_rows( + tiered_rows_check( client, "fhir_job_errors", "no recent FHIR job errors", "FHIR job errors: ", SQL, &[&since], + 1, + 10, ) .await } diff --git a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs index 4afe2bf3..f18cdf7a 100644 --- a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs +++ b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs @@ -1,17 +1,24 @@ //! FHIR service requests that have stayed unresolved for too long. //! -//! Fails when a FHIR service request linked to a lab request has been -//! unresolved for over an hour. +//! Lists FHIR service requests linked to a lab request that have been +//! unresolved for over an hour, tiering on the longest outstanding duration: +//! WARN past 1h, FAIL past 6h. -use super::{CheckContext, util::fail_if_any_rows}; +use super::{CheckContext, fmt_db_error}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; +use serde_json::{Value, json}; const NAME: &str = "fhir_service_requests_unresolved"; + +const WARN_MINUTES: f64 = 60.0; +const FAIL_MINUTES: f64 = 6.0 * 60.0; + const SQL: &str = "SELECT lr.display_id AS lab_request_id, \ - ROUND(EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60)::text AS duration_minutes \ + EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60 AS duration_minutes \ FROM fhir.service_requests fsr JOIN lab_requests lr ON fsr.upstream_id = lr.id \ - WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours'"; + WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours' \ + ORDER BY duration_minutes DESC"; pub async fn run(ctx: CheckContext) -> Check { if ctx.kind != ApiServerKind::Central { @@ -25,19 +32,53 @@ pub async fn run(ctx: CheckContext) -> Check { return Check::skip(NAME, "no DB connection", "db unavailable"); }; - fail_if_any_rows( - client, - NAME, - "no unresolved FHIR service requests", - "unresolved FHIR service requests: ", - SQL, - &[], - ) - .await + let rows = match client.query(SQL, &[]).await { + Ok(r) => r, + Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + }; + + if rows.is_empty() { + return Check::pass(NAME, "no unresolved FHIR service requests"); + } + + let mut warn = Vec::new(); + let mut fail = Vec::new(); + for row in &rows { + let lab_request_id: Option = row.try_get("lab_request_id").ok(); + let minutes: f64 = row.try_get("duration_minutes").unwrap_or(0.0); + let entry = json!({ + "lab_request_id": lab_request_id, + "duration_minutes": minutes.round() as i64, + }); + if minutes > FAIL_MINUTES { + fail.push(entry); + } else if minutes > WARN_MINUTES { + warn.push(entry); + } + } + + if warn.is_empty() && fail.is_empty() { + return Check::pass(NAME, "no unresolved FHIR service requests"); + } + + let summary = format!( + "unresolved FHIR service requests: {} over 6h, {} over 1h", + fail.len(), + warn.len() + ); + let check = if fail.is_empty() { + Check::warning(NAME, summary, "unresolved FHIR service request(s)") + } else { + Check::fail(NAME, summary, "unresolved FHIR service request(s)") + }; + check + .with_detail("fail", Value::Array(fail)) + .with_detail("warn", Value::Array(warn)) } #[cfg(test)] mod tests { + use crate::doctor::check::CheckStatus; use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; #[tokio::test] @@ -47,6 +88,10 @@ mod tests { }; let check = super::run(ctx).await; assert_eq!(check.name, "fhir_service_requests_unresolved"); + assert!(matches!( + check.status, + CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_) + )); } #[tokio::test] diff --git a/crates/alertd/src/doctor/checks/ips_errors.rs b/crates/alertd/src/doctor/checks/ips_errors.rs index 1812e8b8..2b45f187 100644 --- a/crates/alertd/src/doctor/checks/ips_errors.rs +++ b/crates/alertd/src/doctor/checks/ips_errors.rs @@ -2,7 +2,7 @@ use jiff::{Timestamp, ToSpan}; -use super::{CheckContext, util::fail_if_any_rows}; +use super::{CheckContext, util::tiered_rows_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -25,13 +25,15 @@ pub async fn run(ctx: CheckContext) -> Check { }; let since = Timestamp::now() - LOOKBACK_HOURS.hours(); - fail_if_any_rows( + tiered_rows_check( client, "ips_errors", "no recent IPS request errors", "IPS request errors: ", SQL, &[&since], + 1, + 10, ) .await } diff --git a/crates/alertd/src/doctor/checks/kopia_backup.rs b/crates/alertd/src/doctor/checks/kopia_backup.rs index dee4a8c0..c2f60ad5 100644 --- a/crates/alertd/src/doctor/checks/kopia_backup.rs +++ b/crates/alertd/src/doctor/checks/kopia_backup.rs @@ -29,6 +29,7 @@ use super::CheckContext; use crate::doctor::check::Check; const CHECK_NAME: &str = "kopia_backup"; +const WARN_AGE_SECS: i64 = 12 * 60 * 60; const FAIL_AGE_SECS: i64 = 24 * 60 * 60; pub async fn run(_ctx: CheckContext) -> Check { @@ -206,6 +207,12 @@ fn evaluate(snapshots: &[Snapshot], now: Timestamp) -> Check { summary.clone(), format!("no backup in {}", humanise_age(FAIL_AGE_SECS)), ) + } else if age_secs >= WARN_AGE_SECS { + Check::warning( + CHECK_NAME, + summary.clone(), + format!("no backup in {}", humanise_age(WARN_AGE_SECS)), + ) } else { Check::pass(CHECK_NAME, summary) }; @@ -300,6 +307,18 @@ mod tests { assert!(matches!(check.status, CheckStatus::Pass), "{check:?}"); } + #[test] + fn warn_when_postgres_snapshot_between_12h_and_24h() { + let now = Timestamp::from_second(20_000_000).unwrap(); + let snapshots = vec![snapshot( + "/var/lib/postgresql/16/main", + Some(now - 18.hours()), + None, + )]; + let check = evaluate(&snapshots, now); + assert!(matches!(check.status, CheckStatus::Warning(_)), "{check:?}"); + } + #[test] fn fail_when_postgres_snapshot_older_than_24h() { let now = Timestamp::from_second(20_000_000).unwrap(); diff --git a/crates/alertd/src/doctor/checks/load.rs b/crates/alertd/src/doctor/checks/load.rs index 32ed48ab..6cc9fc8e 100644 --- a/crates/alertd/src/doctor/checks/load.rs +++ b/crates/alertd/src/doctor/checks/load.rs @@ -1,7 +1,14 @@ -use sysinfo::System; +use sysinfo::{CpuRefreshKind, RefreshKind, System}; use super::CheckContext; -use crate::doctor::check::Check; +use crate::doctor::check::{Check, CheckStatus}; + +/// Multiplier on the logical core count above which the 5-minute load average +/// is treated as a hard failure. +const FAIL_PER_CORE: f64 = 4.0; +/// Multiplier on the logical core count above which the 5-minute load average +/// is treated as a warning. +const WARN_PER_CORE: f64 = 1.5; pub async fn run(_ctx: CheckContext) -> Check { if cfg!(target_os = "windows") { @@ -12,13 +19,71 @@ pub async fn run(_ctx: CheckContext) -> Check { ); } + let sys = + System::new_with_specifics(RefreshKind::nothing().with_cpu(CpuRefreshKind::nothing())); + let cores = sys.cpus().len().max(1); + let load = System::load_average(); let summary = format!( - "load average: {:.2}, {:.2}, {:.2}", + "load average: {:.2}, {:.2}, {:.2} ({cores} cores)", load.one, load.five, load.fifteen ); - Check::pass("load", summary) + + let check = match tier(load.five, cores) { + CheckStatus::Fail(_) => Check::fail( + "load", + summary, + format!( + "5-min load {:.2} over {:.1}x cores ({cores})", + load.five, FAIL_PER_CORE + ), + ), + CheckStatus::Warning(_) => Check::warning( + "load", + summary, + format!( + "5-min load {:.2} over {:.1}x cores ({cores})", + load.five, WARN_PER_CORE + ), + ), + _ => Check::pass("load", summary), + }; + + check .with_detail("one_min", load.one) .with_detail("five_min", load.five) .with_detail("fifteen_min", load.fifteen) + .with_detail("cores", cores) +} + +/// Tier the 5-minute load average against the logical core count. +fn tier(five: f64, cores: usize) -> CheckStatus { + let cores = cores as f64; + if five > FAIL_PER_CORE * cores { + CheckStatus::Fail(String::new()) + } else if five > WARN_PER_CORE * cores { + CheckStatus::Warning(String::new()) + } else { + CheckStatus::Pass + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tier_boundaries() { + assert!(matches!(tier(5.9, 4), CheckStatus::Pass)); + assert!(matches!(tier(6.1, 4), CheckStatus::Warning(_))); + assert!(matches!(tier(15.9, 4), CheckStatus::Warning(_))); + assert!(matches!(tier(16.1, 4), CheckStatus::Fail(_))); + } + + #[test] + fn tier_single_core() { + assert!(matches!(tier(1.4, 1), CheckStatus::Pass)); + assert!(matches!(tier(1.6, 1), CheckStatus::Warning(_))); + assert!(matches!(tier(4.1, 1), CheckStatus::Fail(_))); + } } diff --git a/crates/alertd/src/doctor/checks/patient_communication_errors.rs b/crates/alertd/src/doctor/checks/patient_communication_errors.rs index 74d5c537..0caf97c0 100644 --- a/crates/alertd/src/doctor/checks/patient_communication_errors.rs +++ b/crates/alertd/src/doctor/checks/patient_communication_errors.rs @@ -2,7 +2,7 @@ use jiff::{Timestamp, ToSpan}; -use super::{CheckContext, util::fail_if_any_rows}; +use super::{CheckContext, util::tiered_rows_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check { }; let since = Timestamp::now() - LOOKBACK_HOURS.hours(); - fail_if_any_rows( + tiered_rows_check( client, "patient_communication_errors", "no recent patient communication errors", "patient communication errors: ", SQL, &[&since], + 1, + 10, ) .await } diff --git a/crates/alertd/src/doctor/checks/report_errors.rs b/crates/alertd/src/doctor/checks/report_errors.rs index 4dd0b2ce..dc63d35c 100644 --- a/crates/alertd/src/doctor/checks/report_errors.rs +++ b/crates/alertd/src/doctor/checks/report_errors.rs @@ -2,7 +2,7 @@ use jiff::{Timestamp, ToSpan}; -use super::{CheckContext, util::fail_if_any_rows}; +use super::{CheckContext, util::tiered_rows_check}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; @@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check { }; let since = Timestamp::now() - LOOKBACK_HOURS.hours(); - fail_if_any_rows( + tiered_rows_check( client, "report_errors", "no recent report errors", "report errors: ", SQL, &[&since], + 1, + 10, ) .await } diff --git a/crates/alertd/src/doctor/checks/sync_facility_stale.rs b/crates/alertd/src/doctor/checks/sync_facility_stale.rs index e25c21bb..9daa36a1 100644 --- a/crates/alertd/src/doctor/checks/sync_facility_stale.rs +++ b/crates/alertd/src/doctor/checks/sync_facility_stale.rs @@ -1,36 +1,38 @@ //! Facilities whose sync has gone stale. //! -//! Flags facilities that synced in the last 48h but have had no completion in -//! the last 30m, as well as facilities whose last successful sync was over an -//! hour ago. +//! Sync runs about every 60s, so for each facility that has synced in the last +//! 48h we compute the minutes since its last successful (errorless, completed) +//! sync and tier: WARN past 10 minutes, FAIL past 30. The 48h-active guard +//! keeps decommissioned facilities from flagging. -use serde_json::Value; +use serde_json::{Value, json}; -use super::{CheckContext, util::fetch_rows}; +use super::{CheckContext, fmt_db_error}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; const NAME: &str = "sync_facility_stale"; -const NOT_SYNCING_SQL: &str = "with sync_sessions_with_facility_id as ( \ - select created_at, completed_at, \ - jsonb_array_elements_text(parameters->'facilityIds') as facility_id \ - from sync_sessions where parameters->>'isMobile' <> 'true' \ - ) \ - select distinct facility_id from sync_sessions_with_facility_id \ - where created_at > current_timestamp - '48 hours'::interval \ - except \ - select facility_id from sync_sessions_with_facility_id \ - where completed_at > current_timestamp - '30 minutes'::interval \ - group by facility_id order by facility_id"; +const WARN_MINUTES: f64 = 10.0; +const FAIL_MINUTES: f64 = 30.0; -const NO_RECENT_SUCCESS_SQL: &str = "SELECT facility_id, last_successful_sync FROM ( \ - SELECT facility_id, max(completed_at) as last_successful_sync FROM ( \ - SELECT jsonb_array_elements_text(parameters->'facilityIds') as facility_id, completed_at \ - FROM sync_sessions WHERE errors IS NULL \ - ) AS successful_syncs GROUP BY facility_id \ - ) AS last_successful_facility_syncs \ - WHERE last_successful_sync < now() - interval '1 hour'"; +const SQL: &str = "WITH facility_sessions AS ( \ + SELECT jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \ + created_at, completed_at, errors \ + FROM sync_sessions WHERE parameters->>'isMobile' <> 'true' \ + ), active AS ( \ + SELECT DISTINCT facility_id FROM facility_sessions \ + WHERE created_at > now() - interval '48 hours' \ + ), last_success AS ( \ + SELECT facility_id, max(completed_at) AS last_successful_sync \ + FROM facility_sessions WHERE errors IS NULL AND completed_at IS NOT NULL \ + GROUP BY facility_id \ + ) \ + SELECT a.facility_id, \ + ls.last_successful_sync::text AS last_successful_sync, \ + EXTRACT(EPOCH FROM (now() - ls.last_successful_sync)) / 60 AS minutes_since_success \ + FROM active a LEFT JOIN last_success ls USING (facility_id) \ + ORDER BY minutes_since_success DESC NULLS FIRST"; pub async fn run(ctx: CheckContext) -> Check { if ctx.kind != ApiServerKind::Central { @@ -44,41 +46,55 @@ pub async fn run(ctx: CheckContext) -> Check { return Check::skip(NAME, "no DB connection", "db unavailable"); }; - let not_syncing = match fetch_rows(client, NOT_SYNCING_SQL, &[]).await { - Ok(set) => set, - Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), - }; - let no_recent_success = match fetch_rows(client, NO_RECENT_SUCCESS_SQL, &[]).await { - Ok(set) => set, - Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)), + let rows = match client.query(SQL, &[]).await { + Ok(r) => r, + Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), }; - if not_syncing.is_empty() && no_recent_success.is_empty() { - return Check::pass(NAME, "all facilities syncing"); + let mut warn = Vec::new(); + let mut fail = Vec::new(); + for row in &rows { + let facility_id: String = row.try_get("facility_id").unwrap_or_default(); + let last: Option = row.try_get("last_successful_sync").ok(); + // A facility that is active but has never had a successful sync (NULL + // minutes) is as bad as a very stale one: treat it as a failure. + let minutes: Option = row.try_get("minutes_since_success").ok(); + let entry = json!({ + "facility_id": facility_id, + "last_successful_sync": last, + "minutes_since_success": minutes, + }); + match minutes { + Some(m) if m <= WARN_MINUTES => {} + Some(m) if m <= FAIL_MINUTES => warn.push(entry), + _ => fail.push(entry), + } } - let (not_syncing_count, not_syncing_truncated) = (not_syncing.count(), not_syncing.truncated); - let (no_recent_count, no_recent_truncated) = - (no_recent_success.count(), no_recent_success.truncated); + if warn.is_empty() && fail.is_empty() { + return Check::pass(NAME, "all facilities syncing"); + } - let check = Check::fail( - NAME, - format!( - "stale sync: {not_syncing_count} not syncing, {no_recent_count} with no recent success" - ), - "facility sync stale", + let summary = format!( + "stale sync: {} over {}m, {} over {}m", + fail.len(), + FAIL_MINUTES as i64, + warn.len(), + WARN_MINUTES as i64 ); + let check = if fail.is_empty() { + Check::warning(NAME, summary, "facility sync stale") + } else { + Check::fail(NAME, summary, "facility sync stale") + }; check - .with_detail("not_syncing", Value::Array(not_syncing.rows)) - .with_detail("not_syncing_count", not_syncing_count) - .with_detail("not_syncing_truncated", not_syncing_truncated) - .with_detail("no_recent_success", Value::Array(no_recent_success.rows)) - .with_detail("no_recent_success_count", no_recent_count) - .with_detail("no_recent_success_truncated", no_recent_truncated) + .with_detail("fail", Value::Array(fail)) + .with_detail("warn", Value::Array(warn)) } #[cfg(test)] mod tests { + use crate::doctor::check::CheckStatus; use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; #[tokio::test] @@ -88,6 +104,10 @@ mod tests { }; let check = super::run(ctx).await; assert_eq!(check.name, "sync_facility_stale"); + assert!(matches!( + check.status, + CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_) + )); } #[tokio::test] diff --git a/crates/alertd/src/doctor/checks/sync_lookup.rs b/crates/alertd/src/doctor/checks/sync_lookup.rs index 0a7a72ad..4c229c73 100644 --- a/crates/alertd/src/doctor/checks/sync_lookup.rs +++ b/crates/alertd/src/doctor/checks/sync_lookup.rs @@ -1,16 +1,20 @@ //! Lookup table update staleness. //! -//! Fails when the central server hasn't recorded a successful lookup-table -//! update in over an hour. +//! The lookup table refreshes roughly every 20s, so tier on minutes of +//! staleness: WARN past 2 minutes, FAIL past 5. If the tracking row is absent, +//! treat the lookup as not tracked and pass. -use super::{CheckContext, util::fail_if_any_rows}; -use crate::doctor::check::Check; +use super::{CheckContext, fmt_db_error}; +use crate::doctor::check::{Check, CheckStatus}; use bestool_tamanu::ApiServerKind; const NAME: &str = "sync_lookup"; -const SQL: &str = "SELECT key, value AS last_sync_tick, updated_at::text AS last_updated, \ - (now() - updated_at)::text AS time_since_update FROM local_system_facts \ - WHERE key = 'lastSuccessfulLookupTableUpdate' AND updated_at < now() - interval '1 hour'"; +const SQL: &str = "SELECT value AS last_sync_tick, updated_at::text AS last_updated, \ + EXTRACT(EPOCH FROM (now() - updated_at))::bigint AS age_seconds \ + FROM local_system_facts WHERE key = 'lastSuccessfulLookupTableUpdate'"; + +const WARN_SECS: i64 = 2 * 60; +const FAIL_SECS: i64 = 5 * 60; pub async fn run(ctx: CheckContext) -> Check { if ctx.kind != ApiServerKind::Central { @@ -24,21 +28,66 @@ pub async fn run(ctx: CheckContext) -> Check { return Check::skip(NAME, "no DB connection", "db unavailable"); }; - fail_if_any_rows( - client, - NAME, - "lookup table up to date", - "lookup table stale: ", - SQL, - &[], - ) - .await + let row = match client.query_opt(SQL, &[]).await { + Ok(Some(r)) => r, + Ok(None) => return Check::pass(NAME, "lookup table not tracked"), + Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + }; + + let last_sync_tick: Option = row.try_get("last_sync_tick").ok(); + let last_updated: Option = row.try_get("last_updated").ok(); + let age_seconds: i64 = row.try_get("age_seconds").unwrap_or(0); + + let summary = format!("lookup table updated {}m ago", age_seconds / 60); + let check = match tier(age_seconds) { + CheckStatus::Fail(_) => Check::fail( + NAME, + summary, + format!("lookup table stale: {age_seconds}s since last update"), + ), + CheckStatus::Warning(_) => Check::warning( + NAME, + summary, + format!("lookup table stale: {age_seconds}s since last update"), + ), + _ => Check::pass(NAME, "lookup table up to date"), + }; + + let mut check = check.with_detail("age_seconds", age_seconds); + if let Some(tick) = last_sync_tick { + check = check.with_detail("last_sync_tick", tick); + } + if let Some(updated) = last_updated { + check = check.with_detail("last_updated", updated); + } + check +} + +/// Tier on seconds since the lookup table last updated. +fn tier(seconds: i64) -> CheckStatus { + if seconds > FAIL_SECS { + CheckStatus::Fail(String::new()) + } else if seconds > WARN_SECS { + CheckStatus::Warning(String::new()) + } else { + CheckStatus::Pass + } } #[cfg(test)] mod tests { + use super::*; use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; + #[test] + fn tier_boundaries() { + assert!(matches!(tier(0), CheckStatus::Pass)); + assert!(matches!(tier(120), CheckStatus::Pass)); + assert!(matches!(tier(121), CheckStatus::Warning(_))); + assert!(matches!(tier(300), CheckStatus::Warning(_))); + assert!(matches!(tier(301), CheckStatus::Fail(_))); + } + #[tokio::test] async fn runs_against_central() { let Some(ctx) = central_ctx().await else { @@ -46,6 +95,10 @@ mod tests { }; let check = super::run(ctx).await; assert_eq!(check.name, "sync_lookup"); + assert!(matches!( + check.status, + CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_) + )); } #[tokio::test] diff --git a/crates/alertd/src/doctor/checks/sync_restart_loop.rs b/crates/alertd/src/doctor/checks/sync_restart_loop.rs index 09d36bc2..cf5fb7a0 100644 --- a/crates/alertd/src/doctor/checks/sync_restart_loop.rs +++ b/crates/alertd/src/doctor/checks/sync_restart_loop.rs @@ -1,19 +1,25 @@ //! Facilities stuck in a sync restart loop. //! -//! Fails when a facility has accumulated 10 or more `snapshot-for-pushing` sync -//! errors in the last hour, which indicates the sync is repeatedly restarting -//! rather than progressing. +//! Counts `snapshot-for-pushing` sync errors per facility in the last hour, +//! which indicates sync repeatedly restarting rather than progressing. WARN at +//! 5 restarts/hr, FAIL at 10. -use super::{CheckContext, util::fail_if_any_rows}; +use serde_json::{Value, json}; + +use super::{CheckContext, fmt_db_error}; use crate::doctor::check::Check; use bestool_tamanu::ApiServerKind; const NAME: &str = "sync_restart_loop"; + +const WARN_RESTARTS: i64 = 5; +const FAIL_RESTARTS: i64 = 10; + const SQL: &str = "SELECT jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \ COUNT(*) AS error_count FROM sync_sessions \ WHERE created_at > now() - interval '1 hour' AND errors IS NOT NULL \ AND cardinality(errors) = 1 AND errors[1] LIKE '%snapshot-for-pushing%' \ - GROUP BY facility_id HAVING COUNT(*) >= 10 ORDER BY error_count DESC"; + GROUP BY facility_id HAVING COUNT(*) >= 5 ORDER BY error_count DESC"; pub async fn run(ctx: CheckContext) -> Check { if ctx.kind != ApiServerKind::Central { @@ -27,19 +33,46 @@ pub async fn run(ctx: CheckContext) -> Check { return Check::skip(NAME, "no DB connection", "db unavailable"); }; - fail_if_any_rows( - client, - NAME, - "no sync restart loops", - "facilities in sync restart loop: ", - SQL, - &[], - ) - .await + let rows = match client.query(SQL, &[]).await { + Ok(r) => r, + Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)), + }; + + let mut warn = Vec::new(); + let mut fail = Vec::new(); + for row in &rows { + let facility_id: String = row.try_get("facility_id").unwrap_or_default(); + let error_count: i64 = row.try_get("error_count").unwrap_or(0); + let entry = json!({ "facility_id": facility_id, "error_count": error_count }); + if error_count >= FAIL_RESTARTS { + fail.push(entry); + } else if error_count >= WARN_RESTARTS { + warn.push(entry); + } + } + + if warn.is_empty() && fail.is_empty() { + return Check::pass(NAME, "no sync restart loops"); + } + + let summary = format!( + "sync restart loops: {} over {FAIL_RESTARTS}/hr, {} over {WARN_RESTARTS}/hr", + fail.len(), + warn.len() + ); + let check = if fail.is_empty() { + Check::warning(NAME, summary, "facilities in sync restart loop") + } else { + Check::fail(NAME, summary, "facilities in sync restart loop") + }; + check + .with_detail("fail", Value::Array(fail)) + .with_detail("warn", Value::Array(warn)) } #[cfg(test)] mod tests { + use crate::doctor::check::CheckStatus; use crate::doctor::checks::test_support::{central_ctx, facility_ctx}; #[tokio::test] @@ -49,6 +82,10 @@ mod tests { }; let check = super::run(ctx).await; assert_eq!(check.name, "sync_restart_loop"); + assert!(matches!( + check.status, + CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_) + )); } #[tokio::test] diff --git a/crates/alertd/src/doctor/checks/sync_session_errors.rs b/crates/alertd/src/doctor/checks/sync_session_errors.rs index f29d893d..8b15f0db 100644 --- a/crates/alertd/src/doctor/checks/sync_session_errors.rs +++ b/crates/alertd/src/doctor/checks/sync_session_errors.rs @@ -12,6 +12,8 @@ use bestool_tamanu::ApiServerKind; const NAME: &str = "sync_session_errors"; +const FAIL_ERRORS: usize = 10; + const MOBILE_SQL: &str = "SELECT id, errors::text, \ jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \ created_at::text AS created, (completed_at - created_at)::text AS duration \ @@ -62,11 +64,20 @@ pub async fn run(ctx: CheckContext) -> Check { let (mobile_count, mobile_truncated) = (mobile.count(), mobile.truncated); let (server_count, server_truncated) = (server.count(), server.truncated); - let check = Check::fail( - NAME, - format!("sync session errors: {mobile_count} mobile, {server_count} server"), - "recent sync session error(s)", - ); + // Truncation means well over FAIL_ERRORS rows, so saturate the total there. + let total = if mobile.truncated || server.truncated { + FAIL_ERRORS + } else { + mobile.rows.len() + server.rows.len() + }; + + let summary = format!("sync session errors: {mobile_count} mobile, {server_count} server"); + let reason = "recent sync session error(s)"; + let check = if total >= FAIL_ERRORS { + Check::fail(NAME, summary, reason) + } else { + Check::warning(NAME, summary, reason) + }; check .with_detail("mobile", Value::Array(mobile.rows)) .with_detail("mobile_count", mobile_count) diff --git a/crates/alertd/src/doctor/checks/sync_sessions.rs b/crates/alertd/src/doctor/checks/sync_sessions.rs index 4c79b8af..34252dac 100644 --- a/crates/alertd/src/doctor/checks/sync_sessions.rs +++ b/crates/alertd/src/doctor/checks/sync_sessions.rs @@ -12,10 +12,10 @@ pub async fn run(ctx: CheckContext) -> Check { SELECT count(*) FILTER (WHERE completed_at IS NULL) AS active_count, count(*) FILTER ( - WHERE completed_at IS NULL AND start_time < now() - interval '1 hour' + WHERE completed_at IS NULL AND start_time < now() - interval '15 minutes' ) AS stuck_warn, count(*) FILTER ( - WHERE completed_at IS NULL AND start_time < now() - interval '6 hours' + WHERE completed_at IS NULL AND start_time < now() - interval '45 minutes' ) AS stuck_fail, min(start_time) FILTER (WHERE completed_at IS NULL) AS oldest_started_at FROM sync_sessions @@ -42,18 +42,18 @@ pub async fn run(ctx: CheckContext) -> Check { let stuck_fail: i64 = row.try_get("stuck_fail").unwrap_or(0); let oldest: Option = row.try_get("oldest_started_at").ok(); - let summary = format!("{active} active, {stuck_warn} stuck >1h"); + let summary = format!("{active} active, {stuck_warn} stuck >15m"); let check = if stuck_fail > 0 { Check::fail( "sync_sessions", summary.clone(), - format!("{stuck_fail} session(s) stuck >6h"), + format!("{stuck_fail} session(s) stuck >45m"), ) } else if stuck_warn > 0 { Check::warning( "sync_sessions", summary.clone(), - format!("{stuck_warn} session(s) stuck >1h"), + format!("{stuck_warn} session(s) stuck >15m"), ) } else { Check::pass("sync_sessions", summary) diff --git a/crates/alertd/src/doctor/checks/tamanu_http.rs b/crates/alertd/src/doctor/checks/tamanu_http.rs index 3a5133d8..45f19dcc 100644 --- a/crates/alertd/src/doctor/checks/tamanu_http.rs +++ b/crates/alertd/src/doctor/checks/tamanu_http.rs @@ -5,6 +5,8 @@ use crate::doctor::check::Check; const PING_URL: &str = "http://localhost/api/public/ping"; const TIMEOUT: Duration = Duration::from_secs(5); +/// Response latency above which a reachable endpoint is treated as degraded. +const WARN_LATENCY_MS: u64 = 2000; pub async fn run(ctx: CheckContext) -> Check { let start = Instant::now(); @@ -16,10 +18,16 @@ pub async fn run(ctx: CheckContext) -> Check { let status = resp.status(); let detail_status = status.as_u16(); if status.is_success() { - Check::pass( - "tamanu_http", - format!("HTTP {} from {PING_URL} ({latency_ms}ms)", status.as_u16()), - ) + let summary = format!("HTTP {} from {PING_URL} ({latency_ms}ms)", status.as_u16()); + if latency_ms > WARN_LATENCY_MS { + Check::warning( + "tamanu_http", + summary, + format!("response latency {latency_ms}ms over {WARN_LATENCY_MS}ms"), + ) + } else { + Check::pass("tamanu_http", summary) + } .with_detail("status_code", detail_status) } else { Check::fail( diff --git a/crates/alertd/src/doctor/checks/uptime.rs b/crates/alertd/src/doctor/checks/uptime.rs index 1ddd42fc..79243845 100644 --- a/crates/alertd/src/doctor/checks/uptime.rs +++ b/crates/alertd/src/doctor/checks/uptime.rs @@ -3,9 +3,22 @@ use sysinfo::System; use super::CheckContext; use crate::doctor::check::Check; +/// Below this uptime the host has rebooted recently, which may be unexpected. +const WARN_UPTIME_SECS: u64 = 10 * 60; + pub async fn run(_ctx: CheckContext) -> Check { let secs = System::uptime(); - Check::pass("uptime", humanise(secs)).with_detail("uptime_secs", secs) + let summary = humanise(secs); + let check = if secs < WARN_UPTIME_SECS { + Check::warning( + "uptime", + summary, + "host rebooted within the last 10 minutes", + ) + } else { + Check::pass("uptime", summary) + }; + check.with_detail("uptime_secs", secs) } fn humanise(secs: u64) -> String { diff --git a/crates/alertd/src/doctor/checks/util.rs b/crates/alertd/src/doctor/checks/util.rs index 3cfbcdc2..6ab1cefe 100644 --- a/crates/alertd/src/doctor/checks/util.rs +++ b/crates/alertd/src/doctor/checks/util.rs @@ -64,32 +64,78 @@ pub async fn fetch_rows( Ok(RowSet { rows, truncated }) } -/// Run a single wrapped query: fail (with capped rows + count) if it -/// returns any rows, else pass. +/// Run a single wrapped query and tier the outcome on the number of +/// matching rows: PASS below `warn_min`, WARN at or above it, FAIL at or above +/// `fail_min`. /// -/// `summary_pass` is the headline shown when nothing matched; -/// `summary_fail_prefix` is prepended to the count when rows are found. -pub async fn fail_if_any_rows( +/// `summary_pass` is the headline shown when nothing crosses `warn_min`; +/// `summary_prefix` is prepended to the count for the WARN/FAIL summary. +/// +/// Rows are capped at [`REPORT_CAP`] (reported as `"100+"`), which is enough to +/// distinguish the small WARN/FAIL boundaries the error-stream checks use. +#[expect( + clippy::too_many_arguments, + reason = "shared query helper; each parameter is a distinct knob the call sites set" +)] +pub async fn tiered_rows_check( client: &Arc, name: &'static str, summary_pass: &str, - summary_fail_prefix: &str, + summary_prefix: &str, sql: &str, params: &[&(dyn ToSql + Sync)], + warn_min: usize, + fail_min: usize, ) -> Check { match fetch_rows(client, sql, params).await { - Ok(set) if set.is_empty() => Check::pass(name, summary_pass.to_string()), Ok(set) => { + // `truncated` means there were more than REPORT_CAP rows, which is + // well past any realistic fail_min, so treat it as the cap. + let n = if set.truncated { + REPORT_CAP + 1 + } else { + set.rows.len() + }; let count = set.count(); - Check::fail( - name, - format!("{summary_fail_prefix}{count}"), - format!("{} matching row(s)", count), - ) - .with_detail("rows", Value::Array(set.rows)) - .with_detail("truncated", set.truncated) - .with_detail("count", count) + if n < warn_min { + return Check::pass(name, summary_pass.to_string()); + } + let summary = format!("{summary_prefix}{count}"); + let reason = format!("{count} matching row(s)"); + let check = if n >= fail_min { + Check::fail(name, summary, reason) + } else { + Check::warning(name, summary, reason) + }; + check + .with_detail("rows", Value::Array(set.rows)) + .with_detail("truncated", set.truncated) + .with_detail("count", count) } Err(err) => Check::fail(name, "query failed", fmt_db_error(&err)), } } + +#[cfg(test)] +mod tests { + /// Pure count→tier decision mirroring [`tiered_rows_check`], factored so the + /// WARN/FAIL boundaries can be asserted without a database. + fn tier(n: usize, warn_min: usize, fail_min: usize) -> &'static str { + if n >= fail_min { + "fail" + } else if n >= warn_min { + "warning" + } else { + "pass" + } + } + + #[test] + fn error_stream_boundaries() { + assert_eq!(tier(0, 1, 10), "pass"); + assert_eq!(tier(1, 1, 10), "warning"); + assert_eq!(tier(9, 1, 10), "warning"); + assert_eq!(tier(10, 1, 10), "fail"); + assert_eq!(tier(100, 1, 10), "fail"); + } +} diff --git a/crates/alertd/src/doctor/server_info.rs b/crates/alertd/src/doctor/server_info.rs index ba113a86..887116ce 100644 --- a/crates/alertd/src/doctor/server_info.rs +++ b/crates/alertd/src/doctor/server_info.rs @@ -8,7 +8,7 @@ use std::{ }; use serde::Serialize; -use sysinfo::{Disks, System}; +use sysinfo::{CpuRefreshKind, Disks, MemoryRefreshKind, RefreshKind, System}; use tokio::net::TcpStream; use tracing::debug; @@ -63,6 +63,10 @@ pub struct ServerInfo { pub pg_version: Option, pub uptime_secs: u64, + /// Logical CPU count (what load average is relative to, i.e. `nproc`). + pub cpu_cores: usize, + /// Total physical memory, in bytes. + pub total_memory_bytes: u64, pub os_kind: &'static str, #[serde(skip_serializing_if = "Option::is_none")] pub os_name: Option, @@ -115,6 +119,14 @@ pub async fn gather(bestool_version: &str, tamanu_version: &str, facts: ServerFa .iana_name() .map(|s| s.to_string()); + let sys = System::new_with_specifics( + RefreshKind::nothing() + .with_cpu(CpuRefreshKind::nothing()) + .with_memory(MemoryRefreshKind::nothing().with_ram()), + ); + let cpu_cores = sys.cpus().len(); + let total_memory_bytes = sys.total_memory(); + ServerInfo { bestool_version: bestool_version.to_string(), tamanu_version: tamanu_version.to_string(), @@ -126,6 +138,8 @@ pub async fn gather(bestool_version: &str, tamanu_version: &str, facts: ServerFa os_timezone, pg_version: facts.pg_version, uptime_secs: System::uptime(), + cpu_cores, + total_memory_bytes, os_kind: if cfg!(target_os = "linux") { "linux" } else if cfg!(target_os = "windows") { diff --git a/docs/plans/healthchecks-into-alertd.md b/docs/plans/healthchecks-into-alertd.md deleted file mode 100644 index edcf65b6..00000000 --- a/docs/plans/healthchecks-into-alertd.md +++ /dev/null @@ -1,72 +0,0 @@ -# Plan: consolidate healthchecks into alertd, migrate YAML alerts to checks, retire the alert engine (TODO #10) - -## Context - -Two monitoring systems run in parallel: - -- **Healthchecks** (doctor): code-defined `Check`s run as a concurrent "sweep" → pass/warning/fail + JSON details, POSTed to **canopy** (`POST /status/{server_id}`). Today these live in `crates/tamanu/src/doctor/` (the `Check` type + checks), with the sweep orchestration (`perform_sweep`), canopy posting, and the `DoctorTask` background task in `crates/bestool/`. Viewable via `bestool tamanu doctor`. -- **alertd** (`crates/alertd/`): a daemon loading **YAML alert definitions** (deployed in Tamanu installs at `/etc/tamanu/alerts`, …), scheduling each on its own interval, evaluating SQL/shell/event sources, and dispatching to email/Slack/canopy `/events`. Ships a standalone `bestool-alertd` binary + library; also hosts the `DoctorTask`. - -Decisions: - -1. **Invert the crate relationship.** Move the whole doctor subsystem (framework + checks + sweep + canopy posting + `DoctorTask`) **into `bestool-alertd`**, which calls into `bestool-tamanu` for common Tamanu domain utilities. alertd becomes the monitoring engine that owns both the framework and the checks. No dependency cycle: `bestool-tamanu` never depends on alertd. -2. **Migrate** all 16 production YAML alerts (`~/code/work/tamanu/alerts`) into checks. Migrated checks default to **`Check::fail`** when triggered (single severity, no warn tier). -3. **Canopy owns alerting** and has its own logic — it ignores the sweep's top-level `healthy:false`, so the warn-vs-fail-for-top-level distinction is irrelevant at the canopy level. bestool just posts the sweep; drop email/Slack/per-alert targets, dedup, hysteresis, cadence. -4. **Retire the YAML alert engine and the standalone CLI**; alertd keeps only the daemon framework + the doctor subsystem. -5. **Then review thresholds** across all checks (migrated and pre-existing). - -Note: deployed installs still have YAML files under `/etc/tamanu/alerts`; once the loader is removed they're simply ignored (no error). Operators can delete them later. - -## Target architecture - -- **`bestool-alertd`**: owns the monitoring framework (`BackgroundTask` daemon, http server) **and** the doctor subsystem — `Check`/`CheckStatus`/`OverallResult` wire types, `CheckContext`, the registry + `checks/*`, `progress`, the `ServerInfo` facts, `perform_sweep` + `SweepResult` + canopy status posting, and a built-in `DoctorTask` it registers itself. Depends on `bestool-tamanu` (common domain), `bestool-canopy`, `bestool-postgres`, `bestool-kopia`. -- **`bestool-tamanu`**: common Tamanu domain library only — `config`, `roots`, `connection_url`, `services`, `systemd`, `pm2`, `server_info` (DB queries: metaServerId, patient-portal), `versions`, `ApiServerKind`, `find_tamanu`, `detect_kind`. The `doctor` module and `doctor` feature are removed; description updated. -- **`bestool`**: thin CLI. `bestool tamanu doctor` keeps arg parsing + human rendering + daemon-fetch (`/tasks/doctor/latest`/`recompute`) and calls `bestool_alertd::doctor` for local sweeps + types. `bestool tamanu alertd` configures and runs the alertd daemon (which self-registers its `DoctorTask`). - -## Phase 1 — Invert: move the doctor subsystem into alertd (behaviour-preserving refactor) - -- Relocate `crates/tamanu/src/doctor/{check,checks,checks/*,progress,server_info}.rs` → `crates/alertd/src/doctor/…`. -- Move `perform_sweep` + `SweepResult` + canopy status posting from `crates/bestool/src/actions/tamanu/doctor.rs` into alertd (e.g. `bestool_alertd::doctor::perform_sweep`). -- Move `DoctorTask` (`crates/bestool/src/actions/tamanu/alertd/doctor_task.rs`) into alertd as the built-in task; alertd registers it (or exposes a constructor) so bestool no longer wires it. -- Add `bestool-tamanu` as an alertd dependency; rewrite check imports from `crate::{ApiServerKind, config::TamanuConfig, services, systemd, pm2, server_info, detect_kind, versions}` → `bestool_tamanu::{…}`. -- Move doctor-only deps (`bestool-kopia`, `hickory-resolver`, and `reqwest`/`owo-colors` as needed) from `crates/tamanu/Cargo.toml` to `crates/alertd/Cargo.toml`; remove tamanu's `doctor` feature and update its package description. -- bestool side: `doctor.rs` keeps CLI args + rendering + daemon-fetch, calling `bestool_alertd::doctor`; delete the moved `doctor_task` module; retarget Cargo features (`bestool-tamanu/doctor` → alertd). -- **Behaviour-preserving** — no check logic changes. This is large but mechanical (mostly imports + module moves). - -## Phase 2 — Migrate the 16 YAML alerts to checks (now in alertd), default FAIL, central-only - -Migrated checks emit `Check::fail` when triggered, skip on Facility (gate on `ctx.kind`, mirroring `fhir_jobs`), and attach offending rows as `details`. A shared "recent error rows" helper serves the 7 recent-error alerts (run query → `fail` with rows if any match, else `pass`); the old per-alert `$1 = now - interval` becomes a per-check lookback constant. Verbatim SQL is in `~/code/work/tamanu/alerts/.yml`. - -**New checks (~10):** -| Alert(s) | New check | Style | -|---|---|---| -| certificate-notification-error | `certificate_notification_errors` | recent-error | -| ips-error | `ips_errors` | recent-error | -| patient-communications-error | `patient_communication_errors` | recent-error | -| report-error | `report_errors` | recent-error | -| fhir-error | `fhir_job_errors` | recent-error | -| sync-errors-mobile + sync-errors-server | `sync_session_errors` (one check; detail splits mobile/server; keep benign-error exclusions) | recent-error | -| sync-facility-not-syncing + sync-no-sessions | `sync_facility_stale` (one check; facilities with no recent successful sync) | stuck | -| sync-lookup-stale | `sync_lookup` (**= TODO #8**) | stuck | -| sync-restart-loop | `sync_restart_loop` | threshold | -| fhir-unresolvable-service-requests-labs | `fhir_service_requests_unresolved` | stuck | - -**Already covered (confirm/extend detail, no new check):** fhir-queue-incredibly-large, fhir-queued-job-long, fhir-running-job-long → `fhir_jobs`; sync-long → `sync_sessions`. - -Add via the registry pattern: `pub mod ;` + `entry!("", )` in the registry; `pub async fn run(ctx: CheckContext) -> Check`. Split into ~3 PRs by theme (error-notification / sync / fhir+reconcile). - -## Phase 3 — Retire the YAML alert engine + standalone CLI - -- Remove from alertd: `alert.rs`, `loader.rs`, `glob_resolver.rs`, `events.rs`, `targets.rs` + `targets/*`, `templates.rs`, per-alert `state_file.rs`, the alert parts of `scheduler.rs`, `commands.rs` + `commands/*`, `main.rs`, the `[[bin]]` + `cli` feature, `windows_service.rs`. Trim `DaemonConfig` (drop `alert_globs`, `email`, `server_kind`, alert `dry_run`; keep `pg_pool`, `database_url`, `device_key_pem`, `tamanu_version`, `no_server`, `server_addrs`, `watchdog_timeout`, `background_tasks`), `daemon.rs`, `http_server` (drop `/alerts`,`/targets`,`/validate`,`/reload`,`/pause`; keep `/`,`/status`,`/health`,`/metrics`,`/tasks/*`), and `lib.rs` exports. Relocate `InternalContext` out of `alert.rs` into `daemon.rs`/`context.rs`, slimmed to `{ pg_pool, http_client, canopy_client }`. -- bestool: simplify `tamanu alertd` (drop alert-dir discovery/globs, email/Mailgun flags, alert-filtering `server_kind`, and the passthrough subcommands `status`/`reload`/`pause`/`validate`/`loaded-alerts`); keep pg pool, device-key fetch (canopy auth), `tamanu_version`, build `DaemonConfig`, run. Remove the legacy `bestool tamanu alerts` command + module. Delete example alerts (`alerts/`) and alert test fixtures (`crates/bestool/tests/cmd/alerts*`). -- Gated after Phase 2 so coverage isn't lost. Optional follow-up (not in scope): rename `bestool-alertd` / `bestool tamanu alertd` now that it owns healthchecks, not alerts — deferred to avoid crates.io + systemd/install churn. - -## Phase 4 — Threshold review (all checks) - -After migration, review every check (the 10 migrated + the pre-existing ones) for triggering behaviour: warn-vs-fail, threshold values, central/facility gating, and whether any migrated check should be a warning rather than fail. Produce a short follow-up (possibly its own plan) and adjust. Migrated checks land at FAIL in Phase 2; this pass tunes them. - -## Verification - -- **Phase 1 (refactor)**: `cargo build`/`clippy` across the workspace and all feature combos; `cargo check -p bestool --target x86_64-pc-windows-gnu`; confirm identical behaviour — `bestool tamanu doctor` (local `--no-daemon` and daemon-fetch `--fresh`), canopy `/status` posting, and `/tasks/doctor/{latest,recompute}` all work; grep for dangling `bestool_tamanu::doctor` references. -- **Phase 2 (checks)**: against the local `tamanu-central` / `tamanu-facility` databases, `cargo test -p bestool-alertd` (DB-backed tests where feasible) and `bestool tamanu doctor --json --no-daemon`; confirm each new check appears as pass/fail with `details`, and is skipped on a facility install. -- **Phase 3 (teardown)**: full-workspace `cargo build`/`clippy` + Windows cross-check (windows_service removed); `bestool tamanu alertd` starts, ticks the sweep, posts to canopy, and `bestool tamanu doctor` still fetches from it; grep for leftover references (`loader`, `targets`, `templates`, `AlertDefinition`, `tamanu alerts`).