Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions crates/alertd/src/doctor/checks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,30 @@ use bestool_tamanu::{ApiServerKind, config::TamanuConfig};

use super::check::Check;

pub mod util;

pub mod caddy_version;
pub mod certificate_notification_errors;
pub mod db_connect;
pub mod db_version;
pub mod disk_free;
pub mod external_users;
pub mod fhir_job_errors;
pub mod fhir_jobs;
pub mod fhir_service_requests_unresolved;
pub mod http_errors;
pub mod ips_errors;
pub mod kopia_backup;
pub mod load;
pub mod memory;
pub mod migrations;
pub mod patient_communication_errors;
pub mod report_errors;
pub mod server_id;
pub mod sync_facility_stale;
pub mod sync_lookup;
pub mod sync_restart_loop;
pub mod sync_session_errors;
pub mod sync_sessions;
pub mod tailscale;
pub mod tamanu_found;
Expand Down Expand Up @@ -143,5 +155,91 @@ pub fn all() -> Vec<CheckEntry> {
entry!("sync_sessions", sync_sessions),
entry!("fhir_jobs", fhir_jobs),
entry!("kopia_backup", kopia_backup),
entry!(
"certificate_notification_errors",
certificate_notification_errors
),
entry!("ips_errors", ips_errors),
entry!("patient_communication_errors", patient_communication_errors),
entry!("report_errors", report_errors),
entry!("fhir_job_errors", fhir_job_errors),
entry!("sync_session_errors", sync_session_errors),
entry!("sync_facility_stale", sync_facility_stale),
entry!("sync_lookup", sync_lookup),
entry!("sync_restart_loop", sync_restart_loop),
entry!(
"fhir_service_requests_unresolved",
fhir_service_requests_unresolved
),
]
}

#[cfg(test)]
pub mod test_support {
    //! Helpers for DB-backed check tests.
    //!
    //! Each migrated check is central-only and DB-backed, so its tests need a
    //! [`CheckContext`] wired to one of the local `tamanu-central` /
    //! `tamanu-facility` databases. These connect lazily and return `None` when
    //! the DB is unavailable so the suite degrades gracefully off-CI.

    use std::{path::PathBuf, sync::Arc};

    use node_semver::Version;

    use bestool_tamanu::{ApiServerKind, config::TamanuConfig};

    use super::CheckContext;

    // Single source of truth for the test database URLs: the same constant is
    // used to open the connection and to fill `CheckContext::database_url`, so
    // the two can never disagree.
    const CENTRAL_DB_URL: &str = "postgresql://localhost/tamanu-central";
    const FACILITY_DB_URL: &str = "postgresql://localhost/tamanu-facility";

    /// Minimal Tamanu config describing a central server.
    fn central_config() -> TamanuConfig {
        serde_json::from_value(serde_json::json!({
            "db": { "name": "tamanu-central", "username": "u", "password": "p" },
        }))
        .expect("central test config should parse")
    }

    /// Minimal Tamanu config describing a facility server with one facility ID.
    fn facility_config() -> TamanuConfig {
        serde_json::from_value(serde_json::json!({
            "db": { "name": "tamanu-facility", "username": "u", "password": "p" },
            "serverFacilityIds": ["facility-1"],
        }))
        .expect("facility test config should parse")
    }

    /// Opens a single connection to `url`, or returns `None` on any connection
    /// error so callers can bail out of the test instead of failing it.
    async fn connect(url: &str) -> Option<Arc<tokio_postgres::Client>> {
        bestool_postgres::pool::connect_one(url, "bestool-alertd-test")
            .await
            .ok()
            .map(Arc::new)
    }

    /// Builds a [`CheckContext`] with the fields every test context shares:
    /// a placeholder version, a non-existent Tamanu root, and a fresh HTTP
    /// client. Only the server kind, config, URL and DB handle vary.
    fn context(
        kind: ApiServerKind,
        config: TamanuConfig,
        database_url: &'static str,
        db: Option<Arc<tokio_postgres::Client>>,
    ) -> CheckContext {
        CheckContext {
            tamanu_version: Version::parse("0.0.0").expect("0.0.0 is a valid semver version"),
            tamanu_root: PathBuf::from("/nonexistent"),
            config: Arc::new(config),
            kind,
            database_url: database_url.into(),
            db,
            http_client: reqwest::Client::new(),
        }
    }

    /// A central [`CheckContext`] backed by `tamanu-central`, or `None` if that
    /// DB can't be reached.
    pub async fn central_ctx() -> Option<CheckContext> {
        let db = connect(CENTRAL_DB_URL).await?;
        Some(context(
            ApiServerKind::Central,
            central_config(),
            CENTRAL_DB_URL,
            Some(db),
        ))
    }

    /// A facility [`CheckContext`] with no DB; central-only checks skip on it
    /// before ever touching the database.
    pub fn facility_ctx() -> CheckContext {
        context(
            ApiServerKind::Facility,
            facility_config(),
            FACILITY_DB_URL,
            None,
        )
    }
}
61 changes: 61 additions & 0 deletions crates/alertd/src/doctor/checks/certificate_notification_errors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//! Certificate notifications that errored within the lookback window.
//!
//! Ported from the `certificate-notification-error` YAML alert.

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

// Check name handed to `Check::skip` by `run` in this module.
const NAME: &str = "certificate_notification_errors";
// Selects every `certificate_notifications` row with status 'Error' created
// after the `$1` cutoff, newest first.
const SQL: &str = "SELECT * FROM certificate_notifications \
WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC";

// Lookback window for recent-error checks; revisit in the Phase 4 threshold
// review. `run` subtracts this many hours from the current time to get the
// cutoff bound to `$1` in `SQL`.
const LOOKBACK_HOURS: i64 = 1;

/// Checks for certificate notifications that errored within the last
/// [`LOOKBACK_HOURS`] hours.
///
/// Returns a skipped [`Check`] when `ctx` is not a central server or carries
/// no database client. Otherwise runs [`SQL`] with the cutoff timestamp bound
/// as `$1` and returns whatever [`fail_if_any_rows`] produces for the result.
pub async fn run(ctx: CheckContext) -> Check {
    if ctx.kind != ApiServerKind::Central {
        return Check::skip(
            NAME,
            "not applicable on facility server",
            "central-only check",
        );
    }
    let Some(client) = ctx.db.as_ref() else {
        return Check::skip(NAME, "no DB connection", "db unavailable");
    };

    // Rows created after this instant are considered recent.
    let since = Timestamp::now() - LOOKBACK_HOURS.hours();
    fail_if_any_rows(
        client,
        // Same constant as the skip paths, so every outcome of this check
        // reports one name.
        NAME,
        "no recent certificate notification errors",
        "certificate notification errors: ",
        SQL,
        &[&since],
    )
    .await
}

#[cfg(test)]
mod tests {
    use super::run;
    use crate::doctor::checks::test_support::{central_ctx, facility_ctx};

    /// With a reachable central DB the check runs and reports its own name;
    /// without one the test is a no-op.
    #[tokio::test]
    async fn runs_against_central() {
        if let Some(central) = central_ctx().await {
            let outcome = run(central).await;
            assert_eq!(outcome.name, "certificate_notification_errors");
        }
    }

    /// A facility context must short-circuit to a skip.
    #[tokio::test]
    async fn skips_on_facility() {
        let outcome = run(facility_ctx()).await;
        assert!(outcome.status.is_skip());
    }
}
63 changes: 63 additions & 0 deletions crates/alertd/src/doctor/checks/fhir_job_errors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//! FHIR jobs that recorded an error within the lookback window.
//!
//! Ported from the `fhir-error` YAML alert. Distinct from `fhir_jobs`, which
//! measures live queue depth: this surfaces individual jobs that errored
//! recently.

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

// Check name handed to `Check::skip` by `run` in this module.
const NAME: &str = "fhir_job_errors";
// Selects every `fhir.jobs` row whose `error` column is set and which was
// created after the `$1` cutoff, newest first.
const SQL: &str =
"SELECT * FROM fhir.jobs WHERE error IS NOT NULL AND created_at > $1 ORDER BY created_at DESC";

// Lookback window for recent-error checks; revisit in the Phase 4 threshold
// review. `run` subtracts this many hours from the current time to get the
// cutoff bound to `$1` in `SQL`.
const LOOKBACK_HOURS: i64 = 1;

/// Checks for FHIR jobs that recorded an error within the last
/// [`LOOKBACK_HOURS`] hours.
///
/// Returns a skipped [`Check`] when `ctx` is not a central server or carries
/// no database client. Otherwise runs [`SQL`] with the cutoff timestamp bound
/// as `$1` and returns whatever [`fail_if_any_rows`] produces for the result.
pub async fn run(ctx: CheckContext) -> Check {
    if ctx.kind != ApiServerKind::Central {
        return Check::skip(
            NAME,
            "not applicable on facility server",
            "central-only check",
        );
    }
    let Some(client) = ctx.db.as_ref() else {
        return Check::skip(NAME, "no DB connection", "db unavailable");
    };

    // Rows created after this instant are considered recent.
    let since = Timestamp::now() - LOOKBACK_HOURS.hours();
    fail_if_any_rows(
        client,
        // Same constant as the skip paths, so every outcome of this check
        // reports one name.
        NAME,
        "no recent FHIR job errors",
        "FHIR job errors: ",
        SQL,
        &[&since],
    )
    .await
}

#[cfg(test)]
mod tests {
    use super::run;
    use crate::doctor::checks::test_support::{central_ctx, facility_ctx};

    /// With a reachable central DB the check runs and reports its own name;
    /// without one the test is a no-op.
    #[tokio::test]
    async fn runs_against_central() {
        if let Some(central) = central_ctx().await {
            let outcome = run(central).await;
            assert_eq!(outcome.name, "fhir_job_errors");
        }
    }

    /// A facility context must short-circuit to a skip.
    #[tokio::test]
    async fn skips_on_facility() {
        let outcome = run(facility_ctx()).await;
        assert!(outcome.status.is_skip());
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
//! FHIR service requests that have stayed unresolved for too long.
//!
//! Ported from the `fhir-unresolvable-service-requests-labs` YAML alert. Fails
//! when a FHIR service request linked to a lab request has been unresolved for
//! over an hour.

use super::{CheckContext, util::fail_if_any_rows};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

// Check name used by `run` in this module for both its skip results and the
// query-backed result.
const NAME: &str = "fhir_service_requests_unresolved";
// For each `fhir.service_requests` row joined to a lab request via
// `upstream_id`, where `resolved` is FALSE and `last_updated` is more than one
// hour old: returns the lab request's display ID and the age in whole minutes
// (as text).
const SQL: &str = "SELECT lr.display_id AS lab_request_id, \
ROUND(EXTRACT(EPOCH FROM (NOW() - fsr.last_updated)) / 60)::text AS duration_minutes \
FROM fhir.service_requests fsr JOIN lab_requests lr ON fsr.upstream_id = lr.id \
WHERE fsr.resolved = FALSE AND NOW() - fsr.last_updated > INTERVAL '1 hours'";

/// Checks for FHIR service requests, linked to lab requests, that are still
/// unresolved past the threshold encoded in [`SQL`].
///
/// Returns a skipped [`Check`] when `ctx` is not a central server or carries
/// no database client; otherwise returns whatever [`fail_if_any_rows`]
/// produces for the parameterless query.
pub async fn run(ctx: CheckContext) -> Check {
    let is_central = ctx.kind == ApiServerKind::Central;
    if !is_central {
        return Check::skip(
            NAME,
            "not applicable on facility server",
            "central-only check",
        );
    }

    match ctx.db.as_ref() {
        None => Check::skip(NAME, "no DB connection", "db unavailable"),
        Some(client) => {
            // The time threshold lives inside the SQL, so nothing is bound.
            fail_if_any_rows(
                client,
                NAME,
                "no unresolved FHIR service requests",
                "unresolved FHIR service requests: ",
                SQL,
                &[],
            )
            .await
        }
    }
}

#[cfg(test)]
mod tests {
    use super::run;
    use crate::doctor::checks::test_support::{central_ctx, facility_ctx};

    /// With a reachable central DB the check runs and reports its own name;
    /// without one the test is a no-op.
    #[tokio::test]
    async fn runs_against_central() {
        if let Some(central) = central_ctx().await {
            let outcome = run(central).await;
            assert_eq!(outcome.name, "fhir_service_requests_unresolved");
        }
    }

    /// A facility context must short-circuit to a skip.
    #[tokio::test]
    async fn skips_on_facility() {
        let outcome = run(facility_ctx()).await;
        assert!(outcome.status.is_skip());
    }
}
60 changes: 60 additions & 0 deletions crates/alertd/src/doctor/checks/ips_errors.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//! IPS requests that errored within the lookback window.
//!
//! Ported from the `ips-error` YAML alert.

use jiff::{Timestamp, ToSpan};

use super::{CheckContext, util::fail_if_any_rows};
use crate::doctor::check::Check;
use bestool_tamanu::ApiServerKind;

// Check name handed to `Check::skip` by `run` in this module.
const NAME: &str = "ips_errors";
// Selects every `ips_requests` row with status 'Error' created after the `$1`
// cutoff, newest first.
const SQL: &str = "SELECT * FROM ips_requests WHERE status = 'Error' AND created_at > $1 ORDER BY created_at DESC";

// Lookback window for recent-error checks; revisit in the Phase 4 threshold
// review. `run` subtracts this many hours from the current time to get the
// cutoff bound to `$1` in `SQL`.
const LOOKBACK_HOURS: i64 = 1;

/// Checks for IPS requests that errored within the last [`LOOKBACK_HOURS`]
/// hours.
///
/// Returns a skipped [`Check`] when `ctx` is not a central server or carries
/// no database client. Otherwise runs [`SQL`] with the cutoff timestamp bound
/// as `$1` and returns whatever [`fail_if_any_rows`] produces for the result.
pub async fn run(ctx: CheckContext) -> Check {
    if ctx.kind != ApiServerKind::Central {
        return Check::skip(
            NAME,
            "not applicable on facility server",
            "central-only check",
        );
    }
    let Some(client) = ctx.db.as_ref() else {
        return Check::skip(NAME, "no DB connection", "db unavailable");
    };

    // Rows created after this instant are considered recent.
    let since = Timestamp::now() - LOOKBACK_HOURS.hours();
    fail_if_any_rows(
        client,
        // Same constant as the skip paths, so every outcome of this check
        // reports one name.
        NAME,
        "no recent IPS request errors",
        "IPS request errors: ",
        SQL,
        &[&since],
    )
    .await
}

#[cfg(test)]
mod tests {
    use super::run;
    use crate::doctor::checks::test_support::{central_ctx, facility_ctx};

    /// With a reachable central DB the check runs and reports its own name;
    /// without one the test is a no-op.
    #[tokio::test]
    async fn runs_against_central() {
        if let Some(central) = central_ctx().await {
            let outcome = run(central).await;
            assert_eq!(outcome.name, "ips_errors");
        }
    }

    /// A facility context must short-circuit to a skip.
    #[tokio::test]
    async fn skips_on_facility() {
        let outcome = run(facility_ctx()).await;
        assert!(outcome.status.is_skip());
    }
}
Loading
Loading