diff --git a/crates/alertd/src/doctor/checks/certificate_notification_errors.rs b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs
index 450c2843..1d38d31e 100644
--- a/crates/alertd/src/doctor/checks/certificate_notification_errors.rs
+++ b/crates/alertd/src/doctor/checks/certificate_notification_errors.rs
@@ -2,7 +2,7 @@
 
 use jiff::{Timestamp, ToSpan};
 
-use super::{CheckContext, util::fail_if_any_rows};
+use super::{CheckContext, util::tiered_rows_check};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
@@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check {
 	};
 
 	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
-	fail_if_any_rows(
+	tiered_rows_check(
 		client,
 		"certificate_notification_errors",
 		"no recent certificate notification errors",
 		"certificate notification errors: ",
 		SQL,
 		&[&since],
+		1,
+		10,
 	)
 	.await
 }
diff --git a/crates/alertd/src/doctor/checks/db_connect.rs b/crates/alertd/src/doctor/checks/db_connect.rs
index a2b1cc56..d3d42447 100644
--- a/crates/alertd/src/doctor/checks/db_connect.rs
+++ b/crates/alertd/src/doctor/checks/db_connect.rs
@@ -3,6 +3,9 @@ use std::time::Instant;
 use super::CheckContext;
 use crate::doctor::check::Check;
 
+/// Connect latency above which the DB is treated as degraded.
+const WARN_LATENCY_MS: u64 = 1000;
+
 pub async fn run(ctx: CheckContext) -> Check {
 	let host = ctx
 		.config
@@ -21,10 +24,16 @@ pub async fn run(ctx: CheckContext) -> Check {
 			tokio::spawn(async move {
 				let _ = conn.await;
 			});
-			Check::pass(
-				"db_connect",
-				format!("postgres at {host}/{name} ({latency_ms}ms)"),
-			)
+			let summary = format!("postgres at {host}/{name} ({latency_ms}ms)");
+			if latency_ms > WARN_LATENCY_MS {
+				Check::warning(
+					"db_connect",
+					summary,
+					format!("connect latency {latency_ms}ms over {WARN_LATENCY_MS}ms"),
+				)
+			} else {
+				Check::pass("db_connect", summary)
+			}
 		}
 		Err(err) => Check::fail(
 			"db_connect",
diff --git a/crates/alertd/src/doctor/checks/fhir_job_errors.rs b/crates/alertd/src/doctor/checks/fhir_job_errors.rs
index 806e6706..5208c8fb 100644
--- a/crates/alertd/src/doctor/checks/fhir_job_errors.rs
+++ b/crates/alertd/src/doctor/checks/fhir_job_errors.rs
@@ -5,7 +5,7 @@
 
 use jiff::{Timestamp, ToSpan};
 
-use super::{CheckContext, util::fail_if_any_rows};
+use super::{CheckContext, util::tiered_rows_check};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
@@ -29,13 +29,15 @@ pub async fn run(ctx: CheckContext) -> Check {
 	};
 
 	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
-	fail_if_any_rows(
+	tiered_rows_check(
 		client,
 		"fhir_job_errors",
 		"no recent FHIR job errors",
 		"FHIR job errors: ",
 		SQL,
 		&[&since],
+		1,
+		10,
 	)
 	.await
 }
diff --git a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs
index 4afe2bf3..f18cdf7a 100644
--- a/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs
+++ b/crates/alertd/src/doctor/checks/fhir_service_requests_unresolved.rs
@@ -1,17 +1,24 @@
 //! FHIR service requests that have stayed unresolved for too long.
 //!
-//! Fails when a FHIR service request linked to a lab request has been
-//! unresolved for over an hour.
+//! Lists FHIR service requests linked to a lab request that have been
+//! unresolved for over an hour, tiering on the longest outstanding duration:
+//! WARN past 1h, FAIL past 6h.
 
-use super::{CheckContext, util::fail_if_any_rows};
+use super::{CheckContext, fmt_db_error};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
+use serde_json::{Value, json};
 
 const NAME: &str = "fhir_service_requests_unresolved";
+
+const WARN_MINUTES: f64 = 60.0;
+const FAIL_MINUTES: f64 = 6.0 * 60.0;
+
 const SQL: &str = "SELECT lr.display_id AS lab_request_id, \
-	ROUND(EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60)::text AS duration_minutes \
+	EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60 AS duration_minutes \
 	FROM fhir.service_requests fsr JOIN lab_requests lr ON fsr.upstream_id = lr.id \
-	WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours'";
+	WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours' \
+	ORDER BY duration_minutes DESC";
 
 pub async fn run(ctx: CheckContext) -> Check {
 	if ctx.kind != ApiServerKind::Central {
@@ -25,19 +32,53 @@ pub async fn run(ctx: CheckContext) -> Check {
 		return Check::skip(NAME, "no DB connection", "db unavailable");
 	};
 
-	fail_if_any_rows(
-		client,
-		NAME,
-		"no unresolved FHIR service requests",
-		"unresolved FHIR service requests: ",
-		SQL,
-		&[],
-	)
-	.await
+	let rows = match client.query(SQL, &[]).await {
+		Ok(r) => r,
+		Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)),
+	};
+
+	if rows.is_empty() {
+		return Check::pass(NAME, "no unresolved FHIR service requests");
+	}
+
+	let mut warn = Vec::new();
+	let mut fail = Vec::new();
+	for row in &rows {
+		let lab_request_id: Option<String> = row.try_get("lab_request_id").ok();
+		let minutes: f64 = row.try_get("duration_minutes").unwrap_or(0.0);
+		let entry = json!({
+			"lab_request_id": lab_request_id,
+			"duration_minutes": minutes.round() as i64,
+		});
+		if minutes > FAIL_MINUTES {
+			fail.push(entry);
+		} else if minutes > WARN_MINUTES {
+			warn.push(entry);
+		}
+	}
+
+	if warn.is_empty() && fail.is_empty() {
+		return Check::pass(NAME, "no unresolved FHIR service requests");
+	}
+
+	let summary = format!(
+		"unresolved FHIR service requests: {} over 6h, {} over 1h",
+		fail.len(),
+		warn.len()
+	);
+	let check = if fail.is_empty() {
+		Check::warning(NAME, summary, "unresolved FHIR service request(s)")
+	} else {
+		Check::fail(NAME, summary, "unresolved FHIR service request(s)")
+	};
+	check
+		.with_detail("fail", Value::Array(fail))
+		.with_detail("warn", Value::Array(warn))
 }
 
 #[cfg(test)]
 mod tests {
+	use crate::doctor::check::CheckStatus;
 	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
 
 	#[tokio::test]
@@ -47,6 +88,10 @@ mod tests {
 		};
 		let check = super::run(ctx).await;
 		assert_eq!(check.name, "fhir_service_requests_unresolved");
+		assert!(matches!(
+			check.status,
+			CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_)
+		));
 	}
 
 	#[tokio::test]
diff --git a/crates/alertd/src/doctor/checks/ips_errors.rs b/crates/alertd/src/doctor/checks/ips_errors.rs
index 1812e8b8..2b45f187 100644
--- a/crates/alertd/src/doctor/checks/ips_errors.rs
+++ b/crates/alertd/src/doctor/checks/ips_errors.rs
@@ -2,7 +2,7 @@
 
 use jiff::{Timestamp, ToSpan};
 
-use super::{CheckContext, util::fail_if_any_rows};
+use super::{CheckContext, util::tiered_rows_check};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
@@ -25,13 +25,15 @@ pub async fn run(ctx: CheckContext) -> Check {
 	};
 
 	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
-	fail_if_any_rows(
+	tiered_rows_check(
 		client,
 		"ips_errors",
 		"no recent IPS request errors",
 		"IPS request errors: ",
 		SQL,
 		&[&since],
+		1,
+		10,
 	)
 	.await
 }
diff --git a/crates/alertd/src/doctor/checks/kopia_backup.rs b/crates/alertd/src/doctor/checks/kopia_backup.rs
index dee4a8c0..c2f60ad5 100644
--- a/crates/alertd/src/doctor/checks/kopia_backup.rs
+++ b/crates/alertd/src/doctor/checks/kopia_backup.rs
@@ -29,6 +29,7 @@ use super::CheckContext;
 use crate::doctor::check::Check;
 
 const CHECK_NAME: &str = "kopia_backup";
+const WARN_AGE_SECS: i64 = 12 * 60 * 60;
 const FAIL_AGE_SECS: i64 = 24 * 60 * 60;
 
 pub async fn run(_ctx: CheckContext) -> Check {
@@ -206,6 +207,12 @@ fn evaluate(snapshots: &[Snapshot], now: Timestamp) -> Check {
 			summary.clone(),
 			format!("no backup in {}", humanise_age(FAIL_AGE_SECS)),
 		)
+	} else if age_secs >= WARN_AGE_SECS {
+		Check::warning(
+			CHECK_NAME,
+			summary.clone(),
+			format!("no backup in {}", humanise_age(WARN_AGE_SECS)),
+		)
 	} else {
 		Check::pass(CHECK_NAME, summary)
 	};
@@ -300,6 +307,18 @@ mod tests {
 		assert!(matches!(check.status, CheckStatus::Pass), "{check:?}");
 	}
 
+	#[test]
+	fn warn_when_postgres_snapshot_between_12h_and_24h() {
+		let now = Timestamp::from_second(20_000_000).unwrap();
+		let snapshots = vec![snapshot(
+			"/var/lib/postgresql/16/main",
+			Some(now - 18.hours()),
+			None,
+		)];
+		let check = evaluate(&snapshots, now);
+		assert!(matches!(check.status, CheckStatus::Warning(_)), "{check:?}");
+	}
+
 	#[test]
 	fn fail_when_postgres_snapshot_older_than_24h() {
 		let now = Timestamp::from_second(20_000_000).unwrap();
diff --git a/crates/alertd/src/doctor/checks/load.rs b/crates/alertd/src/doctor/checks/load.rs
index 32ed48ab..6cc9fc8e 100644
--- a/crates/alertd/src/doctor/checks/load.rs
+++ b/crates/alertd/src/doctor/checks/load.rs
@@ -1,7 +1,14 @@
-use sysinfo::System;
+use sysinfo::{CpuRefreshKind, RefreshKind, System};
 
 use super::CheckContext;
-use crate::doctor::check::Check;
+use crate::doctor::check::{Check, CheckStatus};
+
+/// Multiplier on the logical core count above which the 5-minute load average
+/// is treated as a hard failure.
+const FAIL_PER_CORE: f64 = 4.0;
+/// Multiplier on the logical core count above which the 5-minute load average
+/// is treated as a warning.
+const WARN_PER_CORE: f64 = 1.5;
 
 pub async fn run(_ctx: CheckContext) -> Check {
 	if cfg!(target_os = "windows") {
@@ -12,13 +19,71 @@ pub async fn run(_ctx: CheckContext) -> Check {
 		);
 	}
 
+	let sys =
+		System::new_with_specifics(RefreshKind::nothing().with_cpu(CpuRefreshKind::nothing()));
+	let cores = sys.cpus().len().max(1);
+
 	let load = System::load_average();
 	let summary = format!(
-		"load average: {:.2}, {:.2}, {:.2}",
+		"load average: {:.2}, {:.2}, {:.2} ({cores} cores)",
 		load.one, load.five, load.fifteen
 	);
-	Check::pass("load", summary)
+
+	let check = match tier(load.five, cores) {
+		CheckStatus::Fail(_) => Check::fail(
+			"load",
+			summary,
+			format!(
+				"5-min load {:.2} over {:.1}x cores ({cores})",
+				load.five, FAIL_PER_CORE
+			),
+		),
+		CheckStatus::Warning(_) => Check::warning(
+			"load",
+			summary,
+			format!(
+				"5-min load {:.2} over {:.1}x cores ({cores})",
+				load.five, WARN_PER_CORE
+			),
+		),
+		_ => Check::pass("load", summary),
+	};
+
+	check
 		.with_detail("one_min", load.one)
 		.with_detail("five_min", load.five)
 		.with_detail("fifteen_min", load.fifteen)
+		.with_detail("cores", cores)
+}
+
+/// Tier the 5-minute load average against the logical core count.
+fn tier(five: f64, cores: usize) -> CheckStatus {
+	let cores = cores as f64;
+	if five > FAIL_PER_CORE * cores {
+		CheckStatus::Fail(String::new())
+	} else if five > WARN_PER_CORE * cores {
+		CheckStatus::Warning(String::new())
+	} else {
+		CheckStatus::Pass
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn tier_boundaries() {
+		assert!(matches!(tier(5.9, 4), CheckStatus::Pass));
+		assert!(matches!(tier(6.1, 4), CheckStatus::Warning(_)));
+		assert!(matches!(tier(15.9, 4), CheckStatus::Warning(_)));
+		assert!(matches!(tier(16.1, 4), CheckStatus::Fail(_)));
+	}
+
+	#[test]
+	fn tier_single_core() {
+		assert!(matches!(tier(1.4, 1), CheckStatus::Pass));
+		assert!(matches!(tier(1.6, 1), CheckStatus::Warning(_)));
+		assert!(matches!(tier(4.1, 1), CheckStatus::Fail(_)));
+	}
 }
diff --git a/crates/alertd/src/doctor/checks/patient_communication_errors.rs b/crates/alertd/src/doctor/checks/patient_communication_errors.rs
index 74d5c537..0caf97c0 100644
--- a/crates/alertd/src/doctor/checks/patient_communication_errors.rs
+++ b/crates/alertd/src/doctor/checks/patient_communication_errors.rs
@@ -2,7 +2,7 @@
 
 use jiff::{Timestamp, ToSpan};
 
-use super::{CheckContext, util::fail_if_any_rows};
+use super::{CheckContext, util::tiered_rows_check};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
@@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check {
 	};
 
 	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
-	fail_if_any_rows(
+	tiered_rows_check(
 		client,
 		"patient_communication_errors",
 		"no recent patient communication errors",
 		"patient communication errors: ",
 		SQL,
 		&[&since],
+		1,
+		10,
 	)
 	.await
 }
diff --git a/crates/alertd/src/doctor/checks/report_errors.rs b/crates/alertd/src/doctor/checks/report_errors.rs
index 4dd0b2ce..dc63d35c 100644
--- a/crates/alertd/src/doctor/checks/report_errors.rs
+++ b/crates/alertd/src/doctor/checks/report_errors.rs
@@ -2,7 +2,7 @@
 
 use jiff::{Timestamp, ToSpan};
 
-use super::{CheckContext, util::fail_if_any_rows};
+use super::{CheckContext, util::tiered_rows_check};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
@@ -26,13 +26,15 @@ pub async fn run(ctx: CheckContext) -> Check {
 	};
 
 	let since = Timestamp::now() - LOOKBACK_HOURS.hours();
-	fail_if_any_rows(
+	tiered_rows_check(
 		client,
 		"report_errors",
 		"no recent report errors",
 		"report errors: ",
 		SQL,
 		&[&since],
+		1,
+		10,
 	)
 	.await
 }
diff --git a/crates/alertd/src/doctor/checks/sync_facility_stale.rs b/crates/alertd/src/doctor/checks/sync_facility_stale.rs
index e25c21bb..9daa36a1 100644
--- a/crates/alertd/src/doctor/checks/sync_facility_stale.rs
+++ b/crates/alertd/src/doctor/checks/sync_facility_stale.rs
@@ -1,36 +1,38 @@
 //! Facilities whose sync has gone stale.
 //!
-//! Flags facilities that synced in the last 48h but have had no completion in
-//! the last 30m, as well as facilities whose last successful sync was over an
-//! hour ago.
+//! Sync runs about every 60s, so for each facility that has synced in the last
+//! 48h we compute the minutes since its last successful (errorless, completed)
+//! sync and tier: WARN past 10 minutes, FAIL past 30. The 48h-active guard
+//! keeps decommissioned facilities from flagging.
 
-use serde_json::Value;
+use serde_json::{Value, json};
 
-use super::{CheckContext, util::fetch_rows};
+use super::{CheckContext, fmt_db_error};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
 const NAME: &str = "sync_facility_stale";
 
-const NOT_SYNCING_SQL: &str = "with sync_sessions_with_facility_id as ( \
-		select created_at, completed_at, \
-			jsonb_array_elements_text(parameters->'facilityIds') as facility_id \
-		from sync_sessions where parameters->>'isMobile' <> 'true' \
-	) \
-	select distinct facility_id from sync_sessions_with_facility_id \
-	where created_at > current_timestamp - '48 hours'::interval \
-	except \
-	select facility_id from sync_sessions_with_facility_id \
-	where completed_at > current_timestamp - '30 minutes'::interval \
-	group by facility_id order by facility_id";
+const WARN_MINUTES: f64 = 10.0;
+const FAIL_MINUTES: f64 = 30.0;
 
-const NO_RECENT_SUCCESS_SQL: &str = "SELECT facility_id, last_successful_sync FROM ( \
-		SELECT facility_id, max(completed_at) as last_successful_sync FROM ( \
-			SELECT jsonb_array_elements_text(parameters->'facilityIds') as facility_id, completed_at \
-			FROM sync_sessions WHERE errors IS NULL \
-		) AS successful_syncs GROUP BY facility_id \
-	) AS last_successful_facility_syncs \
-	WHERE last_successful_sync < now() - interval '1 hour'";
+const SQL: &str = "WITH facility_sessions AS ( \
+		SELECT jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \
+			created_at, completed_at, errors \
+		FROM sync_sessions WHERE parameters->>'isMobile' <> 'true' \
+	), active AS ( \
+		SELECT DISTINCT facility_id FROM facility_sessions \
+		WHERE created_at > now() - interval '48 hours' \
+	), last_success AS ( \
+		SELECT facility_id, max(completed_at) AS last_successful_sync \
+		FROM facility_sessions WHERE errors IS NULL AND completed_at IS NOT NULL \
+		GROUP BY facility_id \
+	) \
+	SELECT a.facility_id, \
+		ls.last_successful_sync::text AS last_successful_sync, \
+		EXTRACT(EPOCH FROM (now() - ls.last_successful_sync)) / 60 AS minutes_since_success \
+	FROM active a LEFT JOIN last_success ls USING (facility_id) \
+	ORDER BY minutes_since_success DESC NULLS FIRST";
 
 pub async fn run(ctx: CheckContext) -> Check {
 	if ctx.kind != ApiServerKind::Central {
@@ -44,41 +46,55 @@ pub async fn run(ctx: CheckContext) -> Check {
 		return Check::skip(NAME, "no DB connection", "db unavailable");
 	};
 
-	let not_syncing = match fetch_rows(client, NOT_SYNCING_SQL, &[]).await {
-		Ok(set) => set,
-		Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)),
-	};
-	let no_recent_success = match fetch_rows(client, NO_RECENT_SUCCESS_SQL, &[]).await {
-		Ok(set) => set,
-		Err(err) => return Check::fail(NAME, "query failed", super::fmt_db_error(&err)),
+	let rows = match client.query(SQL, &[]).await {
+		Ok(r) => r,
+		Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)),
 	};
 
-	if not_syncing.is_empty() && no_recent_success.is_empty() {
-		return Check::pass(NAME, "all facilities syncing");
+	let mut warn = Vec::new();
+	let mut fail = Vec::new();
+	for row in &rows {
+		let facility_id: String = row.try_get("facility_id").unwrap_or_default();
+		let last: Option<String> = row.try_get("last_successful_sync").ok();
+		// A facility that is active but has never had a successful sync (NULL
+		// minutes) is as bad as a very stale one: treat it as a failure.
+		let minutes: Option<f64> = row.try_get("minutes_since_success").ok();
+		let entry = json!({
+			"facility_id": facility_id,
+			"last_successful_sync": last,
+			"minutes_since_success": minutes,
+		});
+		match minutes {
+			Some(m) if m <= WARN_MINUTES => {}
+			Some(m) if m <= FAIL_MINUTES => warn.push(entry),
+			_ => fail.push(entry),
+		}
 	}
 
-	let (not_syncing_count, not_syncing_truncated) = (not_syncing.count(), not_syncing.truncated);
-	let (no_recent_count, no_recent_truncated) =
-		(no_recent_success.count(), no_recent_success.truncated);
+	if warn.is_empty() && fail.is_empty() {
+		return Check::pass(NAME, "all facilities syncing");
+	}
 
-	let check = Check::fail(
-		NAME,
-		format!(
-			"stale sync: {not_syncing_count} not syncing, {no_recent_count} with no recent success"
-		),
-		"facility sync stale",
+	let summary = format!(
+		"stale sync: {} over {}m, {} over {}m",
+		fail.len(),
+		FAIL_MINUTES as i64,
+		warn.len(),
+		WARN_MINUTES as i64
 	);
+	let check = if fail.is_empty() {
+		Check::warning(NAME, summary, "facility sync stale")
+	} else {
+		Check::fail(NAME, summary, "facility sync stale")
+	};
 	check
-		.with_detail("not_syncing", Value::Array(not_syncing.rows))
-		.with_detail("not_syncing_count", not_syncing_count)
-		.with_detail("not_syncing_truncated", not_syncing_truncated)
-		.with_detail("no_recent_success", Value::Array(no_recent_success.rows))
-		.with_detail("no_recent_success_count", no_recent_count)
-		.with_detail("no_recent_success_truncated", no_recent_truncated)
+		.with_detail("fail", Value::Array(fail))
+		.with_detail("warn", Value::Array(warn))
 }
 
 #[cfg(test)]
 mod tests {
+	use crate::doctor::check::CheckStatus;
 	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
 
 	#[tokio::test]
@@ -88,6 +104,10 @@ mod tests {
 		};
 		let check = super::run(ctx).await;
 		assert_eq!(check.name, "sync_facility_stale");
+		assert!(matches!(
+			check.status,
+			CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_)
+		));
 	}
 
 	#[tokio::test]
diff --git a/crates/alertd/src/doctor/checks/sync_lookup.rs b/crates/alertd/src/doctor/checks/sync_lookup.rs
index 0a7a72ad..4c229c73 100644
--- a/crates/alertd/src/doctor/checks/sync_lookup.rs
+++ b/crates/alertd/src/doctor/checks/sync_lookup.rs
@@ -1,16 +1,20 @@
 //! Lookup table update staleness.
 //!
-//! Fails when the central server hasn't recorded a successful lookup-table
-//! update in over an hour.
+//! The lookup table refreshes roughly every 20s, so tier on minutes of
+//! staleness: WARN past 2 minutes, FAIL past 5. If the tracking row is absent,
+//! treat the lookup as not tracked and pass.
 
-use super::{CheckContext, util::fail_if_any_rows};
-use crate::doctor::check::Check;
+use super::{CheckContext, fmt_db_error};
+use crate::doctor::check::{Check, CheckStatus};
 use bestool_tamanu::ApiServerKind;
 
 const NAME: &str = "sync_lookup";
-const SQL: &str = "SELECT key, value AS last_sync_tick, updated_at::text AS last_updated, \
-	(now() - updated_at)::text AS time_since_update FROM local_system_facts \
-	WHERE key = 'lastSuccessfulLookupTableUpdate' AND updated_at < now() - interval '1 hour'";
+const SQL: &str = "SELECT value AS last_sync_tick, updated_at::text AS last_updated, \
+	EXTRACT(EPOCH FROM (now() - updated_at))::bigint AS age_seconds \
+	FROM local_system_facts WHERE key = 'lastSuccessfulLookupTableUpdate'";
+
+const WARN_SECS: i64 = 2 * 60;
+const FAIL_SECS: i64 = 5 * 60;
 
 pub async fn run(ctx: CheckContext) -> Check {
 	if ctx.kind != ApiServerKind::Central {
@@ -24,21 +28,66 @@ pub async fn run(ctx: CheckContext) -> Check {
 		return Check::skip(NAME, "no DB connection", "db unavailable");
 	};
 
-	fail_if_any_rows(
-		client,
-		NAME,
-		"lookup table up to date",
-		"lookup table stale: ",
-		SQL,
-		&[],
-	)
-	.await
+	let row = match client.query_opt(SQL, &[]).await {
+		Ok(Some(r)) => r,
+		Ok(None) => return Check::pass(NAME, "lookup table not tracked"),
+		Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)),
+	};
+
+	let last_sync_tick: Option<String> = row.try_get("last_sync_tick").ok();
+	let last_updated: Option<String> = row.try_get("last_updated").ok();
+	let age_seconds: i64 = row.try_get("age_seconds").unwrap_or(0);
+
+	let summary = format!("lookup table updated {}m ago", age_seconds / 60);
+	let check = match tier(age_seconds) {
+		CheckStatus::Fail(_) => Check::fail(
+			NAME,
+			summary,
+			format!("lookup table stale: {age_seconds}s since last update"),
+		),
+		CheckStatus::Warning(_) => Check::warning(
+			NAME,
+			summary,
+			format!("lookup table stale: {age_seconds}s since last update"),
+		),
+		_ => Check::pass(NAME, "lookup table up to date"),
+	};
+
+	let mut check = check.with_detail("age_seconds", age_seconds);
+	if let Some(tick) = last_sync_tick {
+		check = check.with_detail("last_sync_tick", tick);
+	}
+	if let Some(updated) = last_updated {
+		check = check.with_detail("last_updated", updated);
+	}
+	check
+}
+
+/// Tier on seconds since the lookup table last updated.
+fn tier(seconds: i64) -> CheckStatus {
+	if seconds > FAIL_SECS {
+		CheckStatus::Fail(String::new())
+	} else if seconds > WARN_SECS {
+		CheckStatus::Warning(String::new())
+	} else {
+		CheckStatus::Pass
+	}
 }
 
 #[cfg(test)]
 mod tests {
+	use super::*;
 	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
 
+	#[test]
+	fn tier_boundaries() {
+		assert!(matches!(tier(0), CheckStatus::Pass));
+		assert!(matches!(tier(120), CheckStatus::Pass));
+		assert!(matches!(tier(121), CheckStatus::Warning(_)));
+		assert!(matches!(tier(300), CheckStatus::Warning(_)));
+		assert!(matches!(tier(301), CheckStatus::Fail(_)));
+	}
+
 	#[tokio::test]
 	async fn runs_against_central() {
 		let Some(ctx) = central_ctx().await else {
@@ -46,6 +95,10 @@ mod tests {
 		};
 		let check = super::run(ctx).await;
 		assert_eq!(check.name, "sync_lookup");
+		assert!(matches!(
+			check.status,
+			CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_)
+		));
 	}
 
 	#[tokio::test]
diff --git a/crates/alertd/src/doctor/checks/sync_restart_loop.rs b/crates/alertd/src/doctor/checks/sync_restart_loop.rs
index 09d36bc2..cf5fb7a0 100644
--- a/crates/alertd/src/doctor/checks/sync_restart_loop.rs
+++ b/crates/alertd/src/doctor/checks/sync_restart_loop.rs
@@ -1,19 +1,25 @@
 //! Facilities stuck in a sync restart loop.
 //!
-//! Fails when a facility has accumulated 10 or more `snapshot-for-pushing` sync
-//! errors in the last hour, which indicates the sync is repeatedly restarting
-//! rather than progressing.
+//! Counts `snapshot-for-pushing` sync errors per facility in the last hour,
+//! which indicates sync repeatedly restarting rather than progressing. WARN at
+//! 5 restarts/hr, FAIL at 10.
 
-use super::{CheckContext, util::fail_if_any_rows};
+use serde_json::{Value, json};
+
+use super::{CheckContext, fmt_db_error};
 use crate::doctor::check::Check;
 use bestool_tamanu::ApiServerKind;
 
 const NAME: &str = "sync_restart_loop";
+
+const WARN_RESTARTS: i64 = 5;
+const FAIL_RESTARTS: i64 = 10;
+
 const SQL: &str = "SELECT jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \
 	COUNT(*) AS error_count FROM sync_sessions \
 	WHERE created_at > now() - interval '1 hour' AND errors IS NOT NULL \
 	AND cardinality(errors) = 1 AND errors[1] LIKE '%snapshot-for-pushing%' \
-	GROUP BY facility_id HAVING COUNT(*) >= 10 ORDER BY error_count DESC";
+	GROUP BY facility_id HAVING COUNT(*) >= 5 ORDER BY error_count DESC";
 
 pub async fn run(ctx: CheckContext) -> Check {
 	if ctx.kind != ApiServerKind::Central {
@@ -27,19 +33,46 @@ pub async fn run(ctx: CheckContext) -> Check {
 		return Check::skip(NAME, "no DB connection", "db unavailable");
 	};
 
-	fail_if_any_rows(
-		client,
-		NAME,
-		"no sync restart loops",
-		"facilities in sync restart loop: ",
-		SQL,
-		&[],
-	)
-	.await
+	let rows = match client.query(SQL, &[]).await {
+		Ok(r) => r,
+		Err(err) => return Check::fail(NAME, "query failed", fmt_db_error(&err)),
+	};
+
+	let mut warn = Vec::new();
+	let mut fail = Vec::new();
+	for row in &rows {
+		let facility_id: String = row.try_get("facility_id").unwrap_or_default();
+		let error_count: i64 = row.try_get("error_count").unwrap_or(0);
+		let entry = json!({ "facility_id": facility_id, "error_count": error_count });
+		if error_count >= FAIL_RESTARTS {
+			fail.push(entry);
+		} else if error_count >= WARN_RESTARTS {
+			warn.push(entry);
+		}
+	}
+
+	if warn.is_empty() && fail.is_empty() {
+		return Check::pass(NAME, "no sync restart loops");
+	}
+
+	let summary = format!(
+		"sync restart loops: {} over {FAIL_RESTARTS}/hr, {} over {WARN_RESTARTS}/hr",
+		fail.len(),
+		warn.len()
+	);
+	let check = if fail.is_empty() {
+		Check::warning(NAME, summary, "facilities in sync restart loop")
+	} else {
+		Check::fail(NAME, summary, "facilities in sync restart loop")
+	};
+	check
+		.with_detail("fail", Value::Array(fail))
+		.with_detail("warn", Value::Array(warn))
 }
 
 #[cfg(test)]
 mod tests {
+	use crate::doctor::check::CheckStatus;
 	use crate::doctor::checks::test_support::{central_ctx, facility_ctx};
 
 	#[tokio::test]
@@ -49,6 +82,10 @@ mod tests {
 		};
 		let check = super::run(ctx).await;
 		assert_eq!(check.name, "sync_restart_loop");
+		assert!(matches!(
+			check.status,
+			CheckStatus::Pass | CheckStatus::Warning(_) | CheckStatus::Fail(_)
+		));
 	}
 
 	#[tokio::test]
diff --git a/crates/alertd/src/doctor/checks/sync_session_errors.rs b/crates/alertd/src/doctor/checks/sync_session_errors.rs
index f29d893d..8b15f0db 100644
--- a/crates/alertd/src/doctor/checks/sync_session_errors.rs
+++ b/crates/alertd/src/doctor/checks/sync_session_errors.rs
@@ -12,6 +12,8 @@ use bestool_tamanu::ApiServerKind;
 
 const NAME: &str = "sync_session_errors";
 
+const FAIL_ERRORS: usize = 10;
+
 const MOBILE_SQL: &str = "SELECT id, errors::text, \
 	jsonb_array_elements_text(parameters->'facilityIds') AS facility_id, \
 	created_at::text AS created, (completed_at - created_at)::text AS duration \
@@ -62,11 +64,20 @@ pub async fn run(ctx: CheckContext) -> Check {
 	let (mobile_count, mobile_truncated) = (mobile.count(), mobile.truncated);
 	let (server_count, server_truncated) = (server.count(), server.truncated);
 
-	let check = Check::fail(
-		NAME,
-		format!("sync session errors: {mobile_count} mobile, {server_count} server"),
-		"recent sync session error(s)",
-	);
+	// Truncation means well over FAIL_ERRORS rows, so saturate the total there.
+	let total = if mobile.truncated || server.truncated {
+		FAIL_ERRORS
+	} else {
+		mobile.rows.len() + server.rows.len()
+	};
+
+	let summary = format!("sync session errors: {mobile_count} mobile, {server_count} server");
+	let reason = "recent sync session error(s)";
+	let check = if total >= FAIL_ERRORS {
+		Check::fail(NAME, summary, reason)
+	} else {
+		Check::warning(NAME, summary, reason)
+	};
 	check
 		.with_detail("mobile", Value::Array(mobile.rows))
 		.with_detail("mobile_count", mobile_count)
diff --git a/crates/alertd/src/doctor/checks/sync_sessions.rs b/crates/alertd/src/doctor/checks/sync_sessions.rs
index 4c79b8af..34252dac 100644
--- a/crates/alertd/src/doctor/checks/sync_sessions.rs
+++ b/crates/alertd/src/doctor/checks/sync_sessions.rs
@@ -12,10 +12,10 @@ pub async fn run(ctx: CheckContext) -> Check {
 		SELECT
 			count(*) FILTER (WHERE completed_at IS NULL) AS active_count,
 			count(*) FILTER (
-				WHERE completed_at IS NULL AND start_time < now() - interval '1 hour'
+				WHERE completed_at IS NULL AND start_time < now() - interval '15 minutes'
 			) AS stuck_warn,
 			count(*) FILTER (
-				WHERE completed_at IS NULL AND start_time < now() - interval '6 hours'
+				WHERE completed_at IS NULL AND start_time < now() - interval '45 minutes'
 			) AS stuck_fail,
 			min(start_time) FILTER (WHERE completed_at IS NULL) AS oldest_started_at
 		FROM sync_sessions
@@ -42,18 +42,18 @@ pub async fn run(ctx: CheckContext) -> Check {
 	let stuck_fail: i64 = row.try_get("stuck_fail").unwrap_or(0);
 	let oldest: Option<Timestamp> = row.try_get("oldest_started_at").ok();
 
-	let summary = format!("{active} active, {stuck_warn} stuck >1h");
+	let summary = format!("{active} active, {stuck_warn} stuck >15m");
 	let check = if stuck_fail > 0 {
 		Check::fail(
 			"sync_sessions",
 			summary.clone(),
-			format!("{stuck_fail} session(s) stuck >6h"),
+			format!("{stuck_fail} session(s) stuck >45m"),
 		)
 	} else if stuck_warn > 0 {
 		Check::warning(
 			"sync_sessions",
 			summary.clone(),
-			format!("{stuck_warn} session(s) stuck >1h"),
+			format!("{stuck_warn} session(s) stuck >15m"),
 		)
 	} else {
 		Check::pass("sync_sessions", summary)
diff --git a/crates/alertd/src/doctor/checks/tamanu_http.rs b/crates/alertd/src/doctor/checks/tamanu_http.rs
index 3a5133d8..45f19dcc 100644
--- a/crates/alertd/src/doctor/checks/tamanu_http.rs
+++ b/crates/alertd/src/doctor/checks/tamanu_http.rs
@@ -5,6 +5,8 @@ use crate::doctor::check::Check;
 
 const PING_URL: &str = "http://localhost/api/public/ping";
 const TIMEOUT: Duration = Duration::from_secs(5);
+/// Response latency above which a reachable endpoint is treated as degraded.
+const WARN_LATENCY_MS: u64 = 2000;
 
 pub async fn run(ctx: CheckContext) -> Check {
 	let start = Instant::now();
@@ -16,10 +18,16 @@ pub async fn run(ctx: CheckContext) -> Check {
 			let status = resp.status();
 			let detail_status = status.as_u16();
 			if status.is_success() {
-				Check::pass(
-					"tamanu_http",
-					format!("HTTP {} from {PING_URL} ({latency_ms}ms)", status.as_u16()),
-				)
+				let summary = format!("HTTP {} from {PING_URL} ({latency_ms}ms)", status.as_u16());
+				if latency_ms > WARN_LATENCY_MS {
+					Check::warning(
+						"tamanu_http",
+						summary,
+						format!("response latency {latency_ms}ms over {WARN_LATENCY_MS}ms"),
+					)
+				} else {
+					Check::pass("tamanu_http", summary)
+				}
 				.with_detail("status_code", detail_status)
 			} else {
 				Check::fail(
diff --git a/crates/alertd/src/doctor/checks/uptime.rs b/crates/alertd/src/doctor/checks/uptime.rs
index 1ddd42fc..79243845 100644
--- a/crates/alertd/src/doctor/checks/uptime.rs
+++ b/crates/alertd/src/doctor/checks/uptime.rs
@@ -3,9 +3,22 @@ use sysinfo::System;
 use super::CheckContext;
 use crate::doctor::check::Check;
 
+/// Below this uptime the host has rebooted recently, which may be unexpected.
+const WARN_UPTIME_SECS: u64 = 10 * 60;
+
 pub async fn run(_ctx: CheckContext) -> Check {
 	let secs = System::uptime();
-	Check::pass("uptime", humanise(secs)).with_detail("uptime_secs", secs)
+	let summary = humanise(secs);
+	let check = if secs < WARN_UPTIME_SECS {
+		Check::warning(
+			"uptime",
+			summary,
+			"host rebooted within the last 10 minutes",
+		)
+	} else {
+		Check::pass("uptime", summary)
+	};
+	check.with_detail("uptime_secs", secs)
 }
 
 fn humanise(secs: u64) -> String {
diff --git a/crates/alertd/src/doctor/checks/util.rs b/crates/alertd/src/doctor/checks/util.rs
index 3cfbcdc2..6ab1cefe 100644
--- a/crates/alertd/src/doctor/checks/util.rs
+++ b/crates/alertd/src/doctor/checks/util.rs
@@ -64,32 +64,78 @@ pub async fn fetch_rows(
 	Ok(RowSet { rows, truncated })
 }
 
-/// Run a single wrapped query: fail (with capped rows + count) if it
-/// returns any rows, else pass.
+/// Run a single wrapped query and tier the outcome on the number of
+/// matching rows: PASS below `warn_min`, WARN at or above it, FAIL at or above
+/// `fail_min`.
 ///
-/// `summary_pass` is the headline shown when nothing matched;
-/// `summary_fail_prefix` is prepended to the count when rows are found.
-pub async fn fail_if_any_rows(
+/// `summary_pass` is the headline shown when nothing crosses `warn_min`;
+/// `summary_prefix` is prepended to the count for the WARN/FAIL summary.
+///
+/// Rows are capped at [`REPORT_CAP`] (reported as `"100+"`), which is enough to
+/// distinguish the small WARN/FAIL boundaries the error-stream checks use.
+#[expect(
+	clippy::too_many_arguments,
+	reason = "shared query helper; each parameter is a distinct knob the call sites set"
+)]
+pub async fn tiered_rows_check(
 	client: &Arc<PgClient>,
 	name: &'static str,
 	summary_pass: &str,
-	summary_fail_prefix: &str,
+	summary_prefix: &str,
 	sql: &str,
 	params: &[&(dyn ToSql + Sync)],
+	warn_min: usize,
+	fail_min: usize,
 ) -> Check {
 	match fetch_rows(client, sql, params).await {
-		Ok(set) if set.is_empty() => Check::pass(name, summary_pass.to_string()),
 		Ok(set) => {
+			// `truncated` means there were more than REPORT_CAP rows, which is
+			// well past any realistic fail_min, so treat it as the cap.
+			let n = if set.truncated {
+				REPORT_CAP + 1
+			} else {
+				set.rows.len()
+			};
 			let count = set.count();
-			Check::fail(
-				name,
-				format!("{summary_fail_prefix}{count}"),
-				format!("{} matching row(s)", count),
-			)
-			.with_detail("rows", Value::Array(set.rows))
-			.with_detail("truncated", set.truncated)
-			.with_detail("count", count)
+			if n < warn_min {
+				return Check::pass(name, summary_pass.to_string());
+			}
+			let summary = format!("{summary_prefix}{count}");
+			let reason = format!("{count} matching row(s)");
+			let check = if n >= fail_min {
+				Check::fail(name, summary, reason)
+			} else {
+				Check::warning(name, summary, reason)
+			};
+			check
+				.with_detail("rows", Value::Array(set.rows))
+				.with_detail("truncated", set.truncated)
+				.with_detail("count", count)
 		}
 		Err(err) => Check::fail(name, "query failed", fmt_db_error(&err)),
 	}
 }
+
+#[cfg(test)]
+mod tests {
+	/// Pure count→tier decision mirroring [`tiered_rows_check`], factored so the
+	/// WARN/FAIL boundaries can be asserted without a database.
+	fn tier(n: usize, warn_min: usize, fail_min: usize) -> &'static str {
+		if n >= fail_min {
+			"fail"
+		} else if n >= warn_min {
+			"warning"
+		} else {
+			"pass"
+		}
+	}
+
+	#[test]
+	fn error_stream_boundaries() {
+		assert_eq!(tier(0, 1, 10), "pass");
+		assert_eq!(tier(1, 1, 10), "warning");
+		assert_eq!(tier(9, 1, 10), "warning");
+		assert_eq!(tier(10, 1, 10), "fail");
+		assert_eq!(tier(100, 1, 10), "fail");
+	}
+}
diff --git a/crates/alertd/src/doctor/server_info.rs b/crates/alertd/src/doctor/server_info.rs
index ba113a86..887116ce 100644
--- a/crates/alertd/src/doctor/server_info.rs
+++ b/crates/alertd/src/doctor/server_info.rs
@@ -8,7 +8,7 @@ use std::{
 };
 
 use serde::Serialize;
-use sysinfo::{Disks, System};
+use sysinfo::{CpuRefreshKind, Disks, MemoryRefreshKind, RefreshKind, System};
 use tokio::net::TcpStream;
 use tracing::debug;
 
@@ -63,6 +63,10 @@ pub struct ServerInfo {
 	pub pg_version: Option<String>,
 
 	pub uptime_secs: u64,
+	/// Logical CPU count (what load average is relative to, i.e. `nproc`).
+	pub cpu_cores: usize,
+	/// Total physical memory, in bytes.
+	pub total_memory_bytes: u64,
 	pub os_kind: &'static str,
 	#[serde(skip_serializing_if = "Option::is_none")]
 	pub os_name: Option<String>,
@@ -115,6 +119,14 @@ pub async fn gather(bestool_version: &str, tamanu_version: &str, facts: ServerFa
 		.iana_name()
 		.map(|s| s.to_string());
 
+	let sys = System::new_with_specifics(
+		RefreshKind::nothing()
+			.with_cpu(CpuRefreshKind::nothing())
+			.with_memory(MemoryRefreshKind::nothing().with_ram()),
+	);
+	let cpu_cores = sys.cpus().len();
+	let total_memory_bytes = sys.total_memory();
+
 	ServerInfo {
 		bestool_version: bestool_version.to_string(),
 		tamanu_version: tamanu_version.to_string(),
@@ -126,6 +138,8 @@ pub async fn gather(bestool_version: &str, tamanu_version: &str, facts: ServerFa
 		os_timezone,
 		pg_version: facts.pg_version,
 		uptime_secs: System::uptime(),
+		cpu_cores,
+		total_memory_bytes,
 		os_kind: if cfg!(target_os = "linux") {
 			"linux"
 		} else if cfg!(target_os = "windows") {
diff --git a/docs/plans/healthchecks-into-alertd.md b/docs/plans/healthchecks-into-alertd.md
deleted file mode 100644
index edcf65b6..00000000
--- a/docs/plans/healthchecks-into-alertd.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Plan: consolidate healthchecks into alertd, migrate YAML alerts to checks, retire the alert engine (TODO #10)
-
-## Context
-
-Two monitoring systems run in parallel:
-
-- **Healthchecks** (doctor): code-defined `Check`s run as a concurrent "sweep" → pass/warning/fail + JSON details, POSTed to **canopy** (`POST /status/{server_id}`). Today these live in `crates/tamanu/src/doctor/` (the `Check` type + checks), with the sweep orchestration (`perform_sweep`), canopy posting, and the `DoctorTask` background task in `crates/bestool/`. Viewable via `bestool tamanu doctor`.
-- **alertd** (`crates/alertd/`): a daemon loading **YAML alert definitions** (deployed in Tamanu installs at `/etc/tamanu/alerts`, …), scheduling each on its own interval, evaluating SQL/shell/event sources, and dispatching to email/Slack/canopy `/events`. Ships a standalone `bestool-alertd` binary + library; also hosts the `DoctorTask`.
-
-Decisions:
-
-1. **Invert the crate relationship.** Move the whole doctor subsystem (framework + checks + sweep + canopy posting + `DoctorTask`) **into `bestool-alertd`**, which calls into `bestool-tamanu` for common Tamanu domain utilities. alertd becomes the monitoring engine that owns both the framework and the checks. No dependency cycle: `bestool-tamanu` never depends on alertd.
-2. **Migrate** all 16 production YAML alerts (`~/code/work/tamanu/alerts`) into checks. Migrated checks default to **`Check::fail`** when triggered (single severity, no warn tier).
-3. **Canopy owns alerting** and has its own logic — it ignores the sweep's top-level `healthy:false`, so the warn-vs-fail-for-top-level distinction is irrelevant at the canopy level. bestool just posts the sweep; drop email/Slack/per-alert targets, dedup, hysteresis, cadence.
-4. **Retire the YAML alert engine and the standalone CLI**; alertd keeps only the daemon framework + the doctor subsystem.
-5. **Then review thresholds** across all checks (migrated and pre-existing).
-
-Note: deployed installs still have YAML files under `/etc/tamanu/alerts`; once the loader is removed they're simply ignored (no error). Operators can delete them later.
-
-## Target architecture
-
-- **`bestool-alertd`**: owns the monitoring framework (`BackgroundTask` daemon, http server) **and** the doctor subsystem — `Check`/`CheckStatus`/`OverallResult` wire types, `CheckContext`, the registry + `checks/*`, `progress`, the `ServerInfo` facts, `perform_sweep` + `SweepResult` + canopy status posting, and a built-in `DoctorTask` it registers itself. Depends on `bestool-tamanu` (common domain), `bestool-canopy`, `bestool-postgres`, `bestool-kopia`.
-- **`bestool-tamanu`**: common Tamanu domain library only — `config`, `roots`, `connection_url`, `services`, `systemd`, `pm2`, `server_info` (DB queries: metaServerId, patient-portal), `versions`, `ApiServerKind`, `find_tamanu`, `detect_kind`. The `doctor` module and `doctor` feature are removed; description updated.
-- **`bestool`**: thin CLI. `bestool tamanu doctor` keeps arg parsing + human rendering + daemon-fetch (`/tasks/doctor/latest`/`recompute`) and calls `bestool_alertd::doctor` for local sweeps + types. `bestool tamanu alertd` configures and runs the alertd daemon (which self-registers its `DoctorTask`).
-
-## Phase 1 — Invert: move the doctor subsystem into alertd (behaviour-preserving refactor)
-
-- Relocate `crates/tamanu/src/doctor/{check,checks,checks/*,progress,server_info}.rs` → `crates/alertd/src/doctor/…`.
-- Move `perform_sweep` + `SweepResult` + canopy status posting from `crates/bestool/src/actions/tamanu/doctor.rs` into alertd (e.g. `bestool_alertd::doctor::perform_sweep`).
-- Move `DoctorTask` (`crates/bestool/src/actions/tamanu/alertd/doctor_task.rs`) into alertd as the built-in task; alertd registers it (or exposes a constructor) so bestool no longer wires it.
-- Add `bestool-tamanu` as an alertd dependency; rewrite check imports from `crate::{ApiServerKind, config::TamanuConfig, services, systemd, pm2, server_info, detect_kind, versions}` → `bestool_tamanu::{…}`.
-- Move doctor-only deps (`bestool-kopia`, `hickory-resolver`, and `reqwest`/`owo-colors` as needed) from `crates/tamanu/Cargo.toml` to `crates/alertd/Cargo.toml`; remove tamanu's `doctor` feature and update its package description.
-- bestool side: `doctor.rs` keeps CLI args + rendering + daemon-fetch, calling `bestool_alertd::doctor`; delete the moved `doctor_task` module; retarget Cargo features (`bestool-tamanu/doctor` → alertd).
-- **Behaviour-preserving** — no check logic changes. This is large but mechanical (mostly imports + module moves).
-
-## Phase 2 — Migrate the 16 YAML alerts to checks (now in alertd), default FAIL, central-only
-
-Migrated checks emit `Check::fail` when triggered, skip on Facility (gate on `ctx.kind`, mirroring `fhir_jobs`), and attach offending rows as `details`. A shared "recent error rows" helper serves the 7 recent-error alerts (run query → `fail` with rows if any match, else `pass`); the old per-alert `$1 = now - interval` becomes a per-check lookback constant. Verbatim SQL is in `~/code/work/tamanu/alerts/<name>.yml`.
-
-**New checks (~10):**
-| Alert(s) | New check | Style |
-|---|---|---|
-| certificate-notification-error | `certificate_notification_errors` | recent-error |
-| ips-error | `ips_errors` | recent-error |
-| patient-communications-error | `patient_communication_errors` | recent-error |
-| report-error | `report_errors` | recent-error |
-| fhir-error | `fhir_job_errors` | recent-error |
-| sync-errors-mobile + sync-errors-server | `sync_session_errors` (one check; detail splits mobile/server; keep benign-error exclusions) | recent-error |
-| sync-facility-not-syncing + sync-no-sessions | `sync_facility_stale` (one check; facilities with no recent successful sync) | stuck |
-| sync-lookup-stale | `sync_lookup` (**= TODO #8**) | stuck |
-| sync-restart-loop | `sync_restart_loop` | threshold |
-| fhir-unresolvable-service-requests-labs | `fhir_service_requests_unresolved` | stuck |
-
-**Already covered (confirm/extend detail, no new check):** fhir-queue-incredibly-large, fhir-queued-job-long, fhir-running-job-long → `fhir_jobs`; sync-long → `sync_sessions`.
-
-Add via the registry pattern: `pub mod <name>;` + `entry!("<name>", <name>)` in the registry; `pub async fn run(ctx: CheckContext) -> Check`. Split into ~3 PRs by theme (error-notification / sync / fhir+reconcile).
-
-## Phase 3 — Retire the YAML alert engine + standalone CLI
-
-- Remove from alertd: `alert.rs`, `loader.rs`, `glob_resolver.rs`, `events.rs`, `targets.rs` + `targets/*`, `templates.rs`, per-alert `state_file.rs`, the alert parts of `scheduler.rs`, `commands.rs` + `commands/*`, `main.rs`, the `[[bin]]` + `cli` feature, `windows_service.rs`. Trim `DaemonConfig` (drop `alert_globs`, `email`, `server_kind`, alert `dry_run`; keep `pg_pool`, `database_url`, `device_key_pem`, `tamanu_version`, `no_server`, `server_addrs`, `watchdog_timeout`, `background_tasks`), `daemon.rs`, `http_server` (drop `/alerts`,`/targets`,`/validate`,`/reload`,`/pause`; keep `/`,`/status`,`/health`,`/metrics`,`/tasks/*`), and `lib.rs` exports. Relocate `InternalContext` out of `alert.rs` into `daemon.rs`/`context.rs`, slimmed to `{ pg_pool, http_client, canopy_client }`.
-- bestool: simplify `tamanu alertd` (drop alert-dir discovery/globs, email/Mailgun flags, alert-filtering `server_kind`, and the passthrough subcommands `status`/`reload`/`pause`/`validate`/`loaded-alerts`); keep pg pool, device-key fetch (canopy auth), `tamanu_version`, build `DaemonConfig`, run. Remove the legacy `bestool tamanu alerts` command + module. Delete example alerts (`alerts/`) and alert test fixtures (`crates/bestool/tests/cmd/alerts*`).
-- Gated after Phase 2 so coverage isn't lost. Optional follow-up (not in scope): rename `bestool-alertd` / `bestool tamanu alertd` now that it owns healthchecks, not alerts — deferred to avoid crates.io + systemd/install churn.
-
-## Phase 4 — Threshold review (all checks)
-
-After migration, review every check (the 10 migrated + the pre-existing ones) for triggering behaviour: warn-vs-fail, threshold values, central/facility gating, and whether any migrated check should be a warning rather than fail. Produce a short follow-up (possibly its own plan) and adjust. Migrated checks land at FAIL in Phase 2; this pass tunes them.
-
-## Verification
-
-- **Phase 1 (refactor)**: `cargo build`/`clippy` across the workspace and all feature combos; `cargo check -p bestool --target x86_64-pc-windows-gnu`; confirm identical behaviour — `bestool tamanu doctor` (local `--no-daemon` and daemon-fetch `--fresh`), canopy `/status` posting, and `/tasks/doctor/{latest,recompute}` all work; grep for dangling `bestool_tamanu::doctor` references.
-- **Phase 2 (checks)**: against the local `tamanu-central` / `tamanu-facility` databases, `cargo test -p bestool-alertd` (DB-backed tests where feasible) and `bestool tamanu doctor --json --no-daemon`; confirm each new check appears as pass/fail with `details`, and is skipped on a facility install.
-- **Phase 3 (teardown)**: full-workspace `cargo build`/`clippy` + Windows cross-check (windows_service removed); `bestool tamanu alertd` starts, ticks the sweep, posts to canopy, and `bestool tamanu doctor` still fetches from it; grep for leftover references (`loader`, `targets`, `templates`, `AlertDefinition`, `tamanu alerts`).