From 7f0da038c77629767a1a7eed6acddf7bc71a05e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Sat, 30 May 2026 17:56:23 +1200
Subject: [PATCH 1/3] feat(alertd/checks): port error-notification alerts to
 checks

Add a shared row-fetching helper and the five recent-error checks ported
from the certificate-notification, ips, patient-communications, report,
and fhir YAML alerts. Each is central-only, skips when the DB is
unavailable, and fails when any matching row exists within the lookback
window.

Co-authored-by: Claude <noreply@anthropic.com>
---
 crates/alertd/src/doctor/checks.rs            | 85 +++++++++++++++++
 .../checks/certificate_notification_errors.rs | 61 ++++++++++++
 .../src/doctor/checks/fhir_job_errors.rs      | 63 ++++++++++++
 crates/alertd/src/doctor/checks/ips_errors.rs | 60 ++++++++++++
 .../checks/patient_communication_errors.rs    | 61 ++++++++++++
 .../alertd/src/doctor/checks/report_errors.rs | 61 ++++++++++++
 crates/alertd/src/doctor/checks/util.rs       | 95 +++++++++++++++++++
 7 files changed, 486 insertions(+)
 create mode 100644 crates/alertd/src/doctor/checks/certificate_notification_errors.rs
 create mode 100644 crates/alertd/src/doctor/checks/fhir_job_errors.rs
 create mode 100644 crates/alertd/src/doctor/checks/ips_errors.rs
 create mode 100644 crates/alertd/src/doctor/checks/patient_communication_errors.rs
 create mode 100644 crates/alertd/src/doctor/checks/report_errors.rs
 create mode 100644 crates/alertd/src/doctor/checks/util.rs
diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs
index e998bae8..52271144 100644
--- a/crates/alertd/src/doctor/checks.rs
+++ b/crates/alertd/src/doctor/checks.rs
@@ -13,17 +13,24 @@ use bestool_tamanu::{ApiServerKind, config::TamanuConfig};
 
 use super::check::Check;
 
+pub mod util;
+
 pub mod caddy_version;
+pub mod certificate_notification_errors;
 pub mod db_connect;
 pub mod db_version;
 pub mod disk_free;
 pub mod external_users;
+pub mod fhir_job_errors;
 pub mod fhir_jobs;
 pub mod http_errors;
+pub mod ips_errors;
 pub mod kopia_backup;
 pub mod load;
 pub mod memory;
 pub mod migrations;
+pub mod patient_communication_errors;
+pub mod report_errors;
 pub mod server_id;
 pub mod sync_sessions;
 pub mod tailscale;
@@ -143,5 +150,83 @@ pub fn all() -> Vec<CheckEntry> {
 		entry!("sync_sessions", sync_sessions),
 		entry!("fhir_jobs", fhir_jobs),
 		entry!("kopia_backup", kopia_backup),
+		entry!(
+			"certificate_notification_errors",
+			certificate_notification_errors
+		),
+		entry!("ips_errors", ips_errors),
+		entry!("patient_communication_errors", patient_communication_errors),
+		entry!("report_errors", report_errors),
+		entry!("fhir_job_errors", fhir_job_errors),
 	]
 }
+
+#[cfg(test)]
+pub mod test_support {
+	//! Helpers for DB-backed check tests.
+	//!
+	//! Each migrated check is central-only and DB-backed, so its tests need a
+	//! [`CheckContext`] wired to one of the local `tamanu-central` /
+	//! `tamanu-facility` databases. These connect lazily and return `None` when
+	//! the DB is unavailable so the suite degrades gracefully off-CI.
+
+	use std::sync::Arc;
+
+	use node_semver::Version;
+
+	use bestool_tamanu::{ApiServerKind, config::TamanuConfig};
+
+	use super::CheckContext;
+
+	fn central_config() -> TamanuConfig {
+		serde_json::from_value(serde_json::json!({
+			"db": { "name": "tamanu-central", "username": "u", "password": "p" },
+		}))
+		.expect("central test config should parse")
+	}
+
+	fn facility_config() -> TamanuConfig {
+		serde_json::from_value(serde_json::json!({
+			"db": { "name": "tamanu-facility", "username": "u", "password": "p" },
+			"serverFacilityIds": ["facility-1"],
+		}))
+		.expect("facility test config should parse")
+	}
+
+	async fn connect(db_name: &str) -> Option<Arc<tokio_postgres::Client>> {
+		let url = format!("postgresql://localhost/{db_name}");
+		match bestool_postgres::pool::connect_one(&url, "bestool-alertd-test").await {
+			Ok(client) => Some(Arc::new(client)),
+			Err(_) => None,
+		}
+	}
+
+	/// A central [`CheckContext`] backed by `tamanu-central`, or `None` if that
+	/// DB can't be reached.
+	pub async fn central_ctx() -> Option<CheckContext> {
+		let db = connect("tamanu-central").await?;
+		Some(CheckContext {
+			tamanu_version: Version::parse("0.0.0").unwrap(),
+			tamanu_root: std::path::PathBuf::from("/nonexistent"),
+			config: Arc::new(central_config()),
+			kind: ApiServerKind::Central,
+			database_url: "postgresql://localhost/tamanu-central".into(),
+			db: Some(db),
+			http_client: reqwest::Client::new(),
+		})
+	}
+
+	/// A facility [`CheckContext`] with no DB; central-only checks skip on it
+	/// before ever touching the database.
+	pub fn facility_ctx() -> CheckContext {
+		CheckContext {
+			tamanu_version: Version::parse("0.0.0").unwrap(),
+			tamanu_root: std::path::PathBuf::from("/nonexistent"),
+			config: Arc::new(facility_config()),
+			kind: ApiServerKind::Facility,
+			database_url: "postgresql://localhost/tamanu-facility".into(),
+			db: None,
+			http_client: reqwest::Client::new(),
+		}
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/certificate_notification_errors.rs b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs
new file mode 100644
index 00000000..4beb789c
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs
@@ -0,0 +1,61 @@
+//! Certificate notifications that errored within the lookback window.
+//!
+//! Ported from the `certificate-notification-error` YAML alert.
+
+use jiff::{Timestamp, ToSpan};
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "certificate_notification_errors";
+const SQL: &str = "SELECT * FROM certificate_notifications \
+	WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC";
+
+// Lookback window for recent-error checks; revisit in the Phase 4 threshold
+// review.
+const LOOKBACK_HOURS: i64 = 1;
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
+	fail_if_any_rows(
+		client,
+		"certificate_notification_errors",
+		"no recent certificate notification errors",
+		"certificate notification errors: ",
+		SQL,
+		&[&since],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "certificate_notification_errors");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/fhir_job_errors.rs b/crates/alertd/src/doctor/checks/fhir_job_errors.rs
new file mode 100644
index 00000000..8f1104cb
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/fhir_job_errors.rs
@@ -0,0 +1,63 @@
+//! FHIR jobs that recorded an error within the lookback window.
+//!
+//! Ported from the `fhir-error` YAML alert. Distinct from `fhir_jobs`, which
+//! measures live queue depth: this surfaces individual jobs that errored
+//! recently.
+
+use jiff::{Timestamp, ToSpan};
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "fhir_job_errors";
+const SQL: &str =
+	"SELECT * FROM fhir.jobs WHERE error IS NOT NULL AND created_at > $1 ORDER BY created_at DESC";
+
+// Lookback window for recent-error checks; revisit in the Phase 4 threshold
+// review.
+const LOOKBACK_HOURS: i64 = 1;
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
+	fail_if_any_rows(
+		client,
+		"fhir_job_errors",
+		"no recent FHIR job errors",
+		"FHIR job errors: ",
+		SQL,
+		&[&since],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "fhir_job_errors");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/ips_errors.rs b/crates/alertd/src/doctor/checks/ips_errors.rs
new file mode 100644
index 00000000..b7f933d4
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/ips_errors.rs
@@ -0,0 +1,60 @@
+//! IPS requests that errored within the lookback window.
+//!
+//! Ported from the `ips-error` YAML alert.
+
+use jiff::{Timestamp, ToSpan};
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "ips_errors";
+const SQL: &str = "SELECT * FROM ips_requests WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC";
+
+// Lookback window for recent-error checks; revisit in the Phase 4 threshold
+// review.
+const LOOKBACK_HOURS: i64 = 1;
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
+	fail_if_any_rows(
+		client,
+		"ips_errors",
+		"no recent IPS request errors",
+		"IPS request errors: ",
+		SQL,
+		&[&since],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "ips_errors");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/patient_communication_errors.rs b/crates/alertd/src/doctor/checks/patient_communication_errors.rs
new file mode 100644
index 00000000..1f289ab9
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/patient_communication_errors.rs
@@ -0,0 +1,61 @@
+//! Patient communications that errored within the lookback window.
+//!
+//! Ported from the `patient-communications-error` YAML alert.
+
+use jiff::{Timestamp, ToSpan};
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "patient_communication_errors";
+const SQL: &str = "SELECT * FROM patient_communications \
+	WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC";
+
+// Lookback window for recent-error checks; revisit in the Phase 4 threshold
+// review.
+const LOOKBACK_HOURS: i64 = 1;
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
+	fail_if_any_rows(
+		client,
+		"patient_communication_errors",
+		"no recent patient communication errors",
+		"patient communication errors: ",
+		SQL,
+		&[&since],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "patient_communication_errors");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/report_errors.rs b/crates/alertd/src/doctor/checks/report_errors.rs
new file mode 100644
index 00000000..f508fc2d
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/report_errors.rs
@@ -0,0 +1,61 @@
+//! Report requests that errored within the lookback window.
+//!
+//! Ported from the `report-error` YAML alert.
+
+use jiff::{Timestamp, ToSpan};
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "report_errors";
+const SQL: &str = "SELECT * FROM report_requests \
+	WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC";
+
+// Lookback window for recent-error checks; revisit in the Phase 4 threshold
+// review.
+const LOOKBACK_HOURS: i64 = 1;
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
+	fail_if_any_rows(
+		client,
+		"report_errors",
+		"no recent report errors",
+		"report errors: ",
+		SQL,
+		&[&since],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "report_errors");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/util.rs b/crates/alertd/src/doctor/checks/util.rs
new file mode 100644
index 00000000..22bc9f97
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/util.rs
@@ -0,0 +1,95 @@
+//! Shared helpers for checks that port the old YAML SQL alerts.
+//!
+//! Each migrated alert fails when its query returns any rows and attaches the
+//! offending rows (capped) to `details`. To avoid the generic tokio-postgres
+//! row→JSON conversion and to bound memory, every query is wrapped so Postgres
+//! returns one JSONB column per row, capped just past the reporting limit.
+
+use std::sync::Arc;
+
+use serde_json::Value;
+use tokio_postgres::{Client as PgClient, types::ToSql};
+
+use super::fmt_db_error;
+use crate::doctor::check::Check;
+
+/// Rows reported in `details` are capped here; one extra row is fetched to
+/// detect truncation.
+const REPORT_CAP: usize = 100;
+const FETCH_CAP: usize = REPORT_CAP + 1;
+
+/// Wrap an alert's SQL so Postgres hands back one JSONB column (`row`) per
+/// matching row, capped at [`FETCH_CAP`].
+fn wrap(sql: &str) -> String {
+	format!("SELECT to_jsonb(sub) AS row FROM ( {sql} ) sub LIMIT {FETCH_CAP}")
+}
+
+/// Outcome of running one wrapped alert query: the rows (capped at
+/// [`REPORT_CAP`]) and whether more existed than were reported.
+pub struct RowSet {
+	pub rows: Vec<Value>,
+	pub truncated: bool,
+}
+
+impl RowSet {
+	pub fn is_empty(&self) -> bool {
+		self.rows.is_empty()
+	}
+
+	/// Number to report: the exact count, or `"100+"` when truncated.
+	pub fn count(&self) -> Value {
+		if self.truncated {
+			Value::from(format!("{REPORT_CAP}+"))
+		} else {
+			Value::from(self.rows.len())
+		}
+	}
+}
+
+/// Run a wrapped alert query and collect its rows. The `to_jsonb` wrapping is
+/// applied here, so callers pass the original alert SQL.
+pub async fn fetch_rows(
+	client: &Arc<PgClient>,
+	sql: &str,
+	params: &[&(dyn ToSql + Sync)],
+) -> Result<RowSet, tokio_postgres::Error> {
+	let wrapped = wrap(sql);
+	let raw = client.query(&wrapped, params).await?;
+	let truncated = raw.len() > REPORT_CAP;
+	let rows = raw
+		.into_iter()
+		.take(REPORT_CAP)
+		.map(|r| r.get::<_, Value>("row"))
+		.collect();
+	Ok(RowSet { rows, truncated })
+}
+
+/// Run a single wrapped alert query: fail (with capped rows + count) if it
+/// returns any rows, else pass.
+///
+/// `summary_pass` is the headline shown when nothing matched;
+/// `summary_fail_prefix` is prepended to the count when rows are found.
+pub async fn fail_if_any_rows(
+	client: &Arc<PgClient>,
+	name: &'static str,
+	summary_pass: &str,
+	summary_fail_prefix: &str,
+	sql: &str,
+	params: &[&(dyn ToSql + Sync)],
+) -> Check {
+	match fetch_rows(client, sql, params).await {
+		Ok(set) if set.is_empty() => Check::pass(name, summary_pass.to_string()),
+		Ok(set) => {
+			let count = set.count();
+			Check::fail(
+				name,
+				format!("{summary_fail_prefix}{count}"),
+				format!("{} matching row(s)", count),
+			)
+			.with_detail("rows", Value::Array(set.rows))
+			.with_detail("truncated", set.truncated)
+			.with_detail("count", count)
+		}
+		Err(err) => Check::fail(name, "query failed", fmt_db_error(&err)),
+	}
+}

From ce97e3931590c251b12cba84f73fdfaebb2d1745 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Sat, 30 May 2026 17:57:10 +1200
Subject: [PATCH 2/3] feat(alertd/checks): port sync alerts to checks (closes
 TODO sync_lookup)

Add sync_session_errors (mobile + server, benign-error exclusions baked
in), sync_facility_stale (not-syncing + no-recent-success), sync_lookup
(lookup-table staleness, closes TODO #8), and sync_restart_loop. All
central-only, skip when the DB is unavailable, and fail on any matching
rows.

Co-authored-by: Claude <noreply@anthropic.com>
---
 crates/alertd/src/doctor/checks.rs            |   8 ++
 .../src/doctor/checks/sync_facility_stale.rs  |  99 +++++++++++++++++
 .../alertd/src/doctor/checks/sync_lookup.rs   |  57 ++++++++++
 .../src/doctor/checks/sync_restart_loop.rs    |  59 +++++++++++
 .../src/doctor/checks/sync_session_errors.rs  | 100 ++++++++++++++++++
 5 files changed, 323 insertions(+)
 create mode 100644 crates/alertd/src/doctor/checks/sync_facility_stale.rs
 create mode 100644 crates/alertd/src/doctor/checks/sync_lookup.rs
 create mode 100644 crates/alertd/src/doctor/checks/sync_restart_loop.rs
 create mode 100644 crates/alertd/src/doctor/checks/sync_session_errors.rs

diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs
index 52271144..26570c51 100644
--- a/crates/alertd/src/doctor/checks.rs
+++ b/crates/alertd/src/doctor/checks.rs
@@ -32,6 +32,10 @@ pub mod migrations;
 pub mod patient_communication_errors;
 pub mod report_errors;
 pub mod server_id;
+pub mod sync_facility_stale;
+pub mod sync_lookup;
+pub mod sync_restart_loop;
+pub mod sync_session_errors;
 pub mod sync_sessions;
 pub mod tailscale;
 pub mod tamanu_found;
@@ -158,6 +162,10 @@ pub fn all() -> Vec<CheckEntry> {
 		entry!("patient_communication_errors", patient_communication_errors),
 		entry!("report_errors", report_errors),
 		entry!("fhir_job_errors", fhir_job_errors),
+		entry!("sync_session_errors", sync_session_errors),
+		entry!("sync_facility_stale", sync_facility_stale),
+		entry!("sync_lookup", sync_lookup),
+		entry!("sync_restart_loop", sync_restart_loop),
 	]
 }
 
diff --git a/crates/alertd/src/doctor/checks/sync_facility_stale.rs b/crates/alertd/src/doctor/checks/sync_facility_stale.rs
new file mode 100644
index 00000000..b8a11589
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/sync_facility_stale.rs
@@ -0,0 +1,99 @@
+//! Facilities whose sync has gone stale.
+//!
+//! Consolidates the `sync-facility-not-syncing` and `sync-no-sessions` YAML
+//! alerts: the first flags facilities that synced in the last 48h but have had
+//! no completion in the last 30m; the second flags facilities whose last
+//! successful sync was over an hour ago.
+
+use serde_json::Value;
+
+use super::{CheckContext, util::fetch_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "sync_facility_stale";
+
+const NOT_SYNCING_SQL: &str = "with sync_sessions_with_facility_id as ( \
+		select created_at, completed_at, \
+			jsonb_array_elements_text(parameters->'facilityIds') as facility_id \
+		from sync_sessions where parameters->>'isMobile' <> 'true' \
+	) \
+	select distinct facility_id from sync_sessions_with_facility_id \
+	where created_at > current_timestamp - '48 hours'::interval \
+	except \
+	select facility_id from sync_sessions_with_facility_id \
+	where completed_at > current_timestamp - '30 minutes'::interval \
+	group by facility_id order by facility_id";
+
+const NO_RECENT_SUCCESS_SQL: &str = "SELECT facility_id, last_successful_sync FROM ( \
+		SELECT facility_id, max(completed_at) as last_successful_sync FROM ( \
+			SELECT jsonb_array_elements_text(parameters->'facilityIds') as facility_id, completed_at \
+			FROM sync_sessions WHERE errors IS NULL \
+		) AS successful_syncs GROUP BY facility_id \
+	) AS last_successful_facility_syncs \
+	WHERE last_successful_sync < now() - interval '1 hour'";
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let not_syncing = match fetch_rows(client, NOT_SYNCING_SQL, &[]).await {
+		Ok(set) => set,
+		Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)),
+	};
+	let no_recent_success = match fetch_rows(client, NO_RECENT_SUCCESS_SQL, &[]).await {
+		Ok(set) => set,
+		Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)),
+	};
+
+	if not_syncing.is_empty() && no_recent_success.is_empty() {
+		return Check::pass(NAME, "all facilities syncing");
+	}
+
+	let (not_syncing_count, not_syncing_truncated) = (not_syncing.count(), not_syncing.truncated);
+	let (no_recent_count, no_recent_truncated) =
+		(no_recent_success.count(), no_recent_success.truncated);
+
+	let check = Check::fail(
+		NAME,
+		format!(
+			"stale sync: {not_syncing_count} not syncing, {no_recent_count} with no recent success"
+		),
+		"facility sync stale",
+	);
+	check
+		.with_detail("not_syncing", Value::Array(not_syncing.rows))
+		.with_detail("not_syncing_count", not_syncing_count)
+		.with_detail("not_syncing_truncated", not_syncing_truncated)
+		.with_detail("no_recent_success", Value::Array(no_recent_success.rows))
+		.with_detail("no_recent_success_count", no_recent_count)
+		.with_detail("no_recent_success_truncated", no_recent_truncated)
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "sync_facility_stale");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/sync_lookup.rs b/crates/alertd/src/doctor/checks/sync_lookup.rs
new file mode 100644
index 00000000..ae8015ec
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/sync_lookup.rs
@@ -0,0 +1,57 @@
+//! Lookup table update staleness.
+//!
+//! Ported from the `sync-lookup-stale` YAML alert (closes TODO #8). Fails when
+//! the central server hasn't recorded a successful lookup-table update in over
+//! an hour.
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "sync_lookup";
+const SQL: &str = "SELECT key, value AS last_sync_tick, updated_at::text AS last_updated, \
+	(now() - updated_at)::text AS time_since_update FROM local_system_facts \
+	WHERE key = 'lastSuccessfulLookupTableUpdate' AND updated_at < now() - interval '1 hour'";
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	fail_if_any_rows(
+		client,
+		NAME,
+		"lookup table up to date",
+		"lookup table stale: ",
+		SQL,
+		&[],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "sync_lookup");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/sync_restart_loop.rs b/crates/alertd/src/doctor/checks/sync_restart_loop.rs
new file mode 100644
index 00000000..e014983b
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/sync_restart_loop.rs
@@ -0,0 +1,59 @@
+//! Facilities stuck in a sync restart loop.
+//!
+//! Ported from the `sync-restart-loop` YAML alert. Fails when a facility has
+//! accumulated 10 or more `snapshot-for-pushing` sync errors in the last hour,
+//! which indicates the sync is repeatedly restarting rather than progressing.
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "sync_restart_loop";
+const SQL: &str = "SELECT jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \
+	COUNT(*) AS error_count FROM sync_sessions \
+	WHERE created_at > now() - interval '1 hour' AND errors IS NOT NULL \
+	AND cardinality(errors) = 1 AND errors[1] LIKE '%snapshot-for-pushing%' \
+	GROUP BY facility_id HAVING COUNT(*) >= 10 ORDER BY error_count DESC";
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	fail_if_any_rows(
+		client,
+		NAME,
+		"no sync restart loops",
+		"facilities in sync restart loop: ",
+		SQL,
+		&[],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "sync_restart_loop");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}
diff --git a/crates/alertd/src/doctor/checks/sync_session_errors.rs b/crates/alertd/src/doctor/checks/sync_session_errors.rs
new file mode 100644
index 00000000..60a3e5b2
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/sync_session_errors.rs
@@ -0,0 +1,100 @@
+//! Recent sync-session errors, split mobile vs server, with benign-error
+//! exclusions baked into the SQL.
+//!
+//! Ported from the `sync-errors-mobile` and `sync-errors-server` YAML alerts
+//! into one check. The original alerts ran every minute, so the window is a
+//! tight `updated_at > now() - interval '1 minute'`; the sweep runs every 60s,
+//! so this still catches each error once. Both window and exclusions are
+//! Phase-4-tunable.
+
+use serde_json::Value;
+
+use super::{CheckContext, util::fetch_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "sync_session_errors";
+
+const MOBILE_SQL: &str = "SELECT id, errors::text, \
+	jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \
+	created_at::text AS created, (completed_at - created_at)::text AS duration \
+	FROM sync_sessions \
+	WHERE updated_at > now() - interval '1 minute' \
+	AND parameters->>'isMobile' = 'true' \
+	AND errors IS NOT NULL \
+	AND errors <> ARRAY['Session marked as completed due to its device reconnecting'] \
+	AND errors <> ARRAY['could not serialize access due to concurrent update'] \
+	ORDER BY created_at DESC";
+
+const SERVER_SQL: &str = "SELECT id, errors::text, \
+	jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \
+	created_at::text AS created, (completed_at - created_at)::text AS duration \
+	FROM sync_sessions \
+	WHERE updated_at > now() - interval '1 minute' \
+	AND parameters->>'isMobile' IS DISTINCT FROM 'true' \
+	AND errors IS NOT NULL \
+	AND errors <> ARRAY['could not serialize access due to concurrent update'] \
+	AND NOT (cardinality(errors) = 1 AND errors[1] LIKE '%snapshot-for-pushing%') \
+	ORDER BY created_at DESC";
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	let mobile = match fetch_rows(client, MOBILE_SQL, &[]).await {
+		Ok(set) => set,
+		Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)),
+	};
+	let server = match fetch_rows(client, SERVER_SQL, &[]).await {
+		Ok(set) => set,
+		Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)),
+	};
+
+	if mobile.is_empty() && server.is_empty() {
+		return Check::pass(NAME, "no recent sync session errors");
+	}
+
+	let (mobile_count, mobile_truncated) = (mobile.count(), mobile.truncated);
+	let (server_count, server_truncated) = (server.count(), server.truncated);
+
+	let check = Check::fail(
+		NAME,
+		format!("sync session errors: {mobile_count} mobile, {server_count} server"),
+		"recent sync session error(s)",
+	);
+	check
+		.with_detail("mobile", Value::Array(mobile.rows))
+		.with_detail("mobile_count", mobile_count)
+		.with_detail("mobile_truncated", mobile_truncated)
+		.with_detail("server", Value::Array(server.rows))
+		.with_detail("server_count", server_count)
+		.with_detail("server_truncated", server_truncated)
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "sync_session_errors");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}

From ae8de1c24ef59332bee08e730349f85d2df2e3f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= <felix@bes.au>
Date: Sat, 30 May 2026 17:57:47 +1200
Subject: [PATCH 3/3] feat(alertd/checks): port fhir
 unresolvable-service-requests alert to a check

Add fhir_service_requests_unresolved, central-only, skipping when the DB
is unavailable and failing when a lab-linked FHIR service request has
stayed unresolved for over an hour.

Co-authored-by: Claude <noreply@anthropic.com>
---
 crates/alertd/src/doctor/checks.rs            |  5 ++
 .../fhir_service_requests_unresolved.rs       | 58 +++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs

diff --git a/crates/alertd/src/doctor/checks.rs b/crates/alertd/src/doctor/checks.rs
index 26570c51..6a4a7cdc 100644
--- a/crates/alertd/src/doctor/checks.rs
+++ b/crates/alertd/src/doctor/checks.rs
@@ -23,6 +23,7 @@ pub mod disk_free;
 pub mod external_users;
 pub mod fhir_job_errors;
 pub mod fhir_jobs;
+pub mod fhir_service_requests_unresolved;
 pub mod http_errors;
 pub mod ips_errors;
 pub mod kopia_backup;
@@ -166,6 +167,10 @@ pub fn all() -> Vec<CheckEntry> {
 		entry!("sync_facility_stale", sync_facility_stale),
 		entry!("sync_lookup", sync_lookup),
 		entry!("sync_restart_loop", sync_restart_loop),
+		entry!(
+			"fhir_service_requests_unresolved",
+			fhir_service_requests_unresolved
+		),
 	]
 }
 
diff --git a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs
new file mode 100644
index 00000000..dece3729
--- /dev/null
+++ b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs
@@ -0,0 +1,58 @@
+//! FHIR service requests that have stayed unresolved for too long.
+//!
+//! Ported from the `fhir-unresolvable-service-requests-labs` YAML alert. Fails
+//! when a FHIR service request linked to a lab request has been unresolved for
+//! over an hour.
+
+use super::{CheckContext, util::fail_if_any_rows};
+use crate::doctor::check::Check;
+use bestool_tamanu::ApiServerKind;
+
+const NAME: &str = "fhir_service_requests_unresolved";
+const SQL: &str = "SELECT lr.display_id AS lab_request_id, \
+	ROUND(EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60)::text AS duration_minutes \
+	FROM fhir.service_requests fsr JOIN lab_requests lr ON fsr.upstream_id = lr.id \
+	WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours'";
+
+pub async fn run(ctx: CheckContext) -> Check {
+	if ctx.kind != ApiServerKind::Central {
+		return Check::skip(
+			NAME,
+			"not applicable on facility server",
+			"central-only check",
+		);
+	}
+	let Some(client) = ctx.db.as_ref() else {
+		return Check::skip(NAME, "no DB connection", "db unavailable");
+	};
+
+	fail_if_any_rows(
+		client,
+		NAME,
+		"no unresolved FHIR service requests",
+		"unresolved FHIR service requests: ",
+		SQL,
+		&[],
+	)
+	.await
+}
+
+#[cfg(test)]
+mod tests {
+	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
+
+	#[tokio::test]
+	async fn runs_against_central() {
+		let Some(ctx) = central_ctx().await else {
+			return;
+		};
+		let check = super::run(ctx).await;
+		assert_eq!(check.name, "fhir_service_requests_unresolved");
+	}
+
+	#[tokio::test]
+	async fn skips_on_facility() {
+		let check = super::run(facility_ctx()).await;
+		assert!(check.status.is_skip());
+	}
+}