diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout index 7fab3b7fc51..4042661ce2f 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout @@ -1742,6 +1742,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1893,6 +1895,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -2137,6 +2141,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout index 2adda2263ec..e55036fc53d 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-mupdate-update-flow-stdout @@ -337,6 +337,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -472,6 +474,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -595,6 +599,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after 
running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout index 966dcb7bba3..ee84665c790 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-nexus-generation-autobump-stdout @@ -762,6 +762,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -955,6 +957,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1148,6 +1152,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout index 24958a3806c..d4cdc36431f 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-target-release-stdout @@ -746,6 +746,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -939,6 +941,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: 
(measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1132,6 +1136,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout index 29d08b50c5e..62580885401 100644 --- a/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout +++ b/dev-tools/reconfigurator-cli/tests/output/cmds-unsafe-zone-mgs-stdout @@ -800,6 +800,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -993,6 +995,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected @@ -1186,6 +1190,8 @@ LEDGERED SLED CONFIG reconciler task status: idle (finished at after running for s) reference measurements: (measurement set is empty) + fmd: + no faults reported SMF SERVICES STATUS no data on SMF services has been collected diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs index f47fec9e040..43fc424bc0d 100644 --- a/nexus/db-model/src/inventory.rs +++ b/nexus/db-model/src/inventory.rs @@ -34,6 +34,7 @@ use nexus_db_schema::schema::inv_zone_manifest_zone; use nexus_db_schema::schema::{ hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership, inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset, + inv_fmd_host_case, inv_fmd_resource, inv_fmd_status, inv_host_phase_1_active_slot, inv_host_phase_1_flash_hash, 
inv_internal_dns, inv_last_reconciliation_dataset_result, inv_last_reconciliation_disk_result, @@ -64,6 +65,8 @@ use omicron_common::update::OmicronInstallManifestSource; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::DatasetKind; use omicron_uuid_kinds::DatasetUuid; +use omicron_uuid_kinds::FmdHostCaseKind; +use omicron_uuid_kinds::FmdResourceKind; use omicron_uuid_kinds::InternalZpoolKind; use omicron_uuid_kinds::MupdateKind; use omicron_uuid_kinds::MupdateOverrideKind; @@ -85,6 +88,10 @@ use omicron_uuid_kinds::{CollectionUuid, OmicronZoneUuid}; use sled_agent_types::inventory::BootImageHeader; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdHostCase; +use sled_agent_types::inventory::FmdInventory; +use sled_agent_types::inventory::FmdInventoryError; +use sled_agent_types::inventory::FmdResource; use sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::ManifestBootInventory; @@ -2128,6 +2135,163 @@ impl InvSvcEnabledNotOnlineParseError { } } +impl_enum_type!( + FmdInventoryErrorKindEnum: + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)] + pub enum FmdInventoryErrorKind; + + // Enum values + FmdError => b"fmd_error" + TooManyCases => b"too_many_cases" + TooManyResources => b"too_many_resources" +); + +impl From + for FmdInventoryErrorKind +{ + fn from(value: sled_agent_types::inventory::FmdInventoryErrorKind) -> Self { + use sled_agent_types::inventory::FmdInventoryErrorKind as ApiKind; + match value { + ApiKind::FmdError => FmdInventoryErrorKind::FmdError, + ApiKind::TooManyCases => FmdInventoryErrorKind::TooManyCases, + ApiKind::TooManyResources => { + FmdInventoryErrorKind::TooManyResources + } + } + } +} + +impl From + for sled_agent_types::inventory::FmdInventoryErrorKind +{ + fn from(value: FmdInventoryErrorKind) -> 
Self { + use sled_agent_types::inventory::FmdInventoryErrorKind as ApiKind; + match value { + FmdInventoryErrorKind::FmdError => ApiKind::FmdError, + FmdInventoryErrorKind::TooManyCases => ApiKind::TooManyCases, + FmdInventoryErrorKind::TooManyResources => { + ApiKind::TooManyResources + } + } + } +} + +/// One row per (collection, sled) recording the outcome of FMD inventory +/// collection. Both `error_kind` and `error_message` are `NULL` when the +/// daemon was queried successfully; both are set when collection failed. +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_status)] +pub struct InvFmdStatus { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub error_kind: Option, + pub error_message: Option, +} + +impl InvFmdStatus { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + result: &Result, + ) -> Self { + let (error_kind, error_message) = match result { + Ok(_) => (None, None), + Err(err) => (Some(err.kind.into()), Some(err.message.clone())), + }; + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + error_kind, + error_message, + } + } +} + +#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_host_case)] +pub struct InvFmdHostCase { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub case_id: DbTypedUuid, + pub code: String, + pub url: String, + pub event: Option, +} + +impl InvFmdHostCase { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + case: &FmdHostCase, + ) -> Self { + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + case_id: case.uuid.into(), + code: case.code.clone(), + url: case.url.clone(), + event: case.event.clone(), + } + } +} + +impl From for FmdHostCase { + fn from(row: InvFmdHostCase) -> Self { + Self { + uuid: row.case_id.into(), + code: row.code, + url: row.url, + event: row.event, + } + } +} + 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)] +#[diesel(table_name = inv_fmd_resource)] +pub struct InvFmdResource { + pub inv_collection_id: DbTypedUuid, + pub sled_id: DbTypedUuid, + pub resource_id: DbTypedUuid, + pub fmri: String, + pub case_id: DbTypedUuid, + pub faulty: bool, + pub unusable: bool, + pub invisible: bool, +} + +impl InvFmdResource { + pub fn new( + inv_collection_id: CollectionUuid, + sled_id: SledUuid, + resource: &FmdResource, + ) -> Self { + Self { + inv_collection_id: inv_collection_id.into(), + sled_id: sled_id.into(), + resource_id: resource.uuid.into(), + fmri: resource.fmri.clone(), + case_id: resource.case_id.into(), + faulty: resource.faulty, + unusable: resource.unusable, + invisible: resource.invisible, + } + } +} + +impl From for FmdResource { + fn from(row: InvFmdResource) -> Self { + Self { + uuid: row.resource_id.into(), + fmri: row.fmri, + case_id: row.case_id.into(), + faulty: row.faulty, + unusable: row.unusable, + invisible: row.invisible, + } + } +} + // See [`sled_agent_types::inventory::SvcEnabledNotOnlineState`]. impl_enum_type!( InvSvcEnabledNotOnlineStateEnum: diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 3071ce5d4a5..8ad0fb9a422 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(258, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(259, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ pub static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(259, "inv-fmd"), KnownVersion::new(258, "lookup-unmarked-ereports-by-class"), KnownVersion::new(257, "add-disk-adoption-requests"), KnownVersion::new(256, "bgp-unnumbered-peer-cleanup"), diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs index cd47ca48dcf..f3ad63aad5e 100644 --- a/nexus/db-queries/src/db/datastore/inventory.rs +++ b/nexus/db-queries/src/db/datastore/inventory.rs @@ -39,6 +39,9 @@ use nexus_db_model::InvCollectionError; use nexus_db_model::InvConfigReconcilerStatus; use nexus_db_model::InvConfigReconcilerStatusKind; use nexus_db_model::InvDataset; +use nexus_db_model::InvFmdHostCase; +use nexus_db_model::InvFmdResource; +use nexus_db_model::InvFmdStatus; use nexus_db_model::InvHostPhase1ActiveSlot; use nexus_db_model::InvHostPhase1FlashHash; use nexus_db_model::InvInternalDns; @@ -456,6 +459,46 @@ impl DataStore { }) .collect(); + // Pull FMD inventory out of all sled agents. We always record one + // status row per sled (capturing the success/failure discriminant) + // and, when collection succeeded, a row per case and per resource. 
+ let fmd_status_rows: Vec<_> = collection + .sled_agents + .iter() + .map(|sled_agent| { + InvFmdStatus::new( + collection_id, + sled_agent.sled_id, + &sled_agent.fmd, + ) + }) + .collect(); + let fmd_host_case_rows: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + let cases = sled_agent.fmd.as_ref().ok().map(|inv| &inv.cases); + cases.into_iter().flatten().map(|case| { + InvFmdHostCase::new(collection_id, sled_agent.sled_id, case) + }) + }) + .collect(); + let fmd_resource_rows: Vec<_> = collection + .sled_agents + .iter() + .flat_map(|sled_agent| { + let resources = + sled_agent.fmd.as_ref().ok().map(|inv| &inv.resources); + resources.into_iter().flatten().map(|resource| { + InvFmdResource::new( + collection_id, + sled_agent.sled_id, + resource, + ) + }) + }) + .collect(); + // Build up a list of `OmicronSledConfig`s we need to insert. Each sled // has 0-3: // @@ -1430,7 +1473,62 @@ impl DataStore { } } + // Insert FMD status rows (one per sled). + { + use nexus_db_schema::schema::inv_fmd_status::dsl; + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_status_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_status) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } + + // Insert FMD host case rows (zero or more per sled). + { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_host_case_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_host_case) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } + + // Insert FMD resource rows (zero or more per sled). 
+ { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + + let batch_size = SQL_BATCH_SIZE.get().try_into().unwrap(); + let mut rows = fmd_resource_rows.into_iter(); + loop { + let some_rows = + rows.by_ref().take(batch_size).collect::>(); + if some_rows.is_empty() { + break; + } + let _ = diesel::insert_into(dsl::inv_fmd_resource) + .values(some_rows) + .execute_async(&conn) + .await?; + } + } // Insert rows for all the sled config reconciler disk results { @@ -2164,6 +2262,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets: usize, nlast_reconciliation_zone_results: usize, nlast_reconciliation_measurements: usize, + nfmd_status: usize, + nfmd_host_cases: usize, + nfmd_resources: usize, nzone_manifest_zones: usize, nzone_manifest_measurements: usize, nzone_manifest_non_boot: usize, @@ -2204,6 +2305,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets, nlast_reconciliation_zone_results, nlast_reconciliation_measurements, + nfmd_status, + nfmd_host_cases, + nfmd_resources, nzone_manifest_zones, nzone_manifest_measurements, nzone_manifest_non_boot, @@ -2382,6 +2486,31 @@ impl DataStore { .await? }; + // Remove FMD inventory rows. + let nfmd_status = { + use nexus_db_schema::schema::inv_fmd_status::dsl; + diesel::delete(dsl::inv_fmd_status.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nfmd_host_cases = { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + diesel::delete(dsl::inv_fmd_host_case.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; + let nfmd_resources = { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + diesel::delete(dsl::inv_fmd_resource.filter( + dsl::inv_collection_id.eq(db_collection_id), + )) + .execute_async(&conn) + .await? + }; // Remove rows associated with zone resolver inventory. 
let nzone_manifest_zones = { @@ -2596,6 +2725,9 @@ impl DataStore { nlast_reconciliation_orphaned_datasets, nlast_reconciliation_zone_results, nlast_reconciliation_measurements, + nfmd_status, + nfmd_host_cases, + nfmd_resources, nzone_manifest_zones, nzone_manifest_measurements, nzone_manifest_non_boot, @@ -2647,6 +2779,9 @@ impl DataStore { nlast_reconciliation_zone_results, "nlast_reconciliation_measurements" => nlast_reconciliation_measurements, + "nfmd_status" => nfmd_status, + "nfmd_host_cases" => nfmd_host_cases, + "nfmd_resources" => nfmd_resources, "nzone_manifest_zones" => nzone_manifest_zones, "nzone_manifest_measurements" => nzone_manifest_measurements, "nzone_manifest_non_boot" => nzone_manifest_non_boot, @@ -4051,6 +4186,103 @@ impl DataStore { measurements }; + // Load all FMD inventory rows. The producer's per-sled bounds + // (`FMD_MAX_CASES` / `FMD_MAX_RESOURCES`) keep this size predictable, + // so we don't paginate. + let mut fmd_status_by_sled: BTreeMap< + SledUuid, + Option<(nexus_db_model::FmdInventoryErrorKind, String)>, + > = { + use nexus_db_schema::schema::inv_fmd_status::dsl; + let rows = dsl::inv_fmd_status + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdStatus::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + rows.into_iter() + .map(|row| { + let err = match (row.error_kind, row.error_message) { + (Some(kind), Some(message)) => Some((kind, message)), + (None, None) => None, + _ => unreachable!( + "inv_fmd_status CHECK constraint enforces \ + error_kind and error_message agree on NULL" + ), + }; + (row.sled_id.into(), err) + }) + .collect() + }; + + let mut fmd_cases_by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = { + use nexus_db_schema::schema::inv_fmd_host_case::dsl; + let rows = dsl::inv_fmd_host_case + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdHostCase::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + 
public_error_from_diesel(e, ErrorHandler::Server) + })?; + let mut by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = BTreeMap::new(); + for row in rows { + let sled_id: SledUuid = row.sled_id.into(); + by_sled + .entry(sled_id) + .or_default() + .insert_unique(row.into()) + .map_err(|err| { + Error::internal_error(&format!( + "unexpected duplicate FMD case: {}", + InlineErrorChain::new(&err) + )) + })?; + } + by_sled + }; + + let mut fmd_resources_by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = { + use nexus_db_schema::schema::inv_fmd_resource::dsl; + let rows = dsl::inv_fmd_resource + .filter(dsl::inv_collection_id.eq(db_id)) + .select(InvFmdResource::as_select()) + .load_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + let mut by_sled: BTreeMap< + SledUuid, + IdOrdMap, + > = BTreeMap::new(); + for row in rows { + let sled_id: SledUuid = row.sled_id.into(); + by_sled + .entry(sled_id) + .or_default() + .insert_unique(row.into()) + .map_err(|err| { + Error::internal_error(&format!( + "unexpected duplicate FMD resource: {}", + InlineErrorChain::new(&err) + )) + })?; + } + by_sled + }; + // Load all the config reconciler zone results; build a map of maps // keyed by sled ID. let mut last_reconciliation_zone_results = { @@ -4637,6 +4869,28 @@ impl DataStore { reference_measurements: last_reconciliation_measurements .remove(&sled_id) .unwrap_or_default(), + fmd: { + use sled_agent_types::inventory::{ + FmdInventory, FmdInventoryError, + }; + let cases = + fmd_cases_by_sled.remove(&sled_id).unwrap_or_default(); + let resources = fmd_resources_by_sled + .remove(&sled_id) + .unwrap_or_default(); + // The status row's (error_kind, error_message) columns + // distinguish Ok (both NULL) from Err (both set). If no + // row exists at all (older collection predating this + // migration), fall back to Ok with whatever case/resource + // rows we found, which will normally be empty. 
+ match fmd_status_by_sled.remove(&sled_id) { + Some(Some((kind, message))) => Err(FmdInventoryError { + kind: kind.into(), + message, + }), + _ => Ok(FmdInventory { cases, resources }), + } + }, }; sled_agents .insert_unique(sled_agent) diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index d2288443874..ae4220d7e16 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -58,6 +58,7 @@ define_enums! { DownstairsClientStoppedReasonEnum => "downstairs_client_stopped_reason_type", EreporterTypeEnum => "ereporter_type", FailureDomainEnum => "failure_domain", + FmdInventoryErrorKindEnum => "fmd_inventory_error_kind", HwM2SlotEnum => "hw_m2_slot", HwPowerStateEnum => "hw_power_state", HwRotSlotEnum => "hw_rot_slot", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 40378afb9ef..03db71dacfc 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1805,6 +1805,39 @@ table! { } } +table! { + inv_fmd_status (inv_collection_id, sled_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + error_kind -> Nullable, + error_message -> Nullable, + } +} + +table! { + inv_fmd_host_case (inv_collection_id, sled_id, case_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + case_id -> Uuid, + code -> Text, + url -> Text, + event -> Nullable, + } +} + +table! { + inv_fmd_resource (inv_collection_id, sled_id, resource_id) { + inv_collection_id -> Uuid, + sled_id -> Uuid, + resource_id -> Uuid, + fmri -> Text, + case_id -> Uuid, + faulty -> Bool, + unusable -> Bool, + invisible -> Bool, + } +} + table! 
{ inv_sled_agent (inv_collection_id, sled_id) { inv_collection_id -> Uuid, diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs index 08ff5ea2538..fb9e4890a8b 100644 --- a/nexus/inventory/src/builder.rs +++ b/nexus/inventory/src/builder.rs @@ -678,6 +678,7 @@ impl CollectionBuilder { smf_services_enabled_not_online: inventory .smf_services_enabled_not_online, reference_measurements: inventory.reference_measurements, + fmd: inventory.fmd, }; self.sleds diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 90c502b677b..feac32b6166 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -1098,6 +1098,29 @@ pub fn sled_agent( result: ConfigReconcilerInventoryResult::Ok, }); + // Synthesize a representative FMD payload: a single faulted resource + // diagnosed by a single case. This keeps the per-table-population test + // happy and gives downstream golden-output tests something to render. + let case_id = omicron_uuid_kinds::FmdHostCaseUuid::new_v4(); + let resource_id = omicron_uuid_kinds::FmdResourceUuid::new_v4(); + let mut fmd_cases = iddqd::IdOrdMap::new(); + fmd_cases.insert_overwrite(sled_agent_types::inventory::FmdHostCase { + uuid: case_id, + code: "PCIEX-8000-DJ".to_string(), + url: "http://illumos.org/msg/PCIEX-8000-DJ".to_string(), + event: Some(serde_json::json!({"class": "fault.io.pci.bus"})), + }); + let mut fmd_resources = iddqd::IdOrdMap::new(); + fmd_resources.insert_overwrite(sled_agent_types::inventory::FmdResource { + uuid: resource_id, + fmri: "dev:////pci@af,0/pci1022,1483@3,5".to_string(), + case_id, + faulty: true, + unusable: false, + invisible: false, + }); + let fmd = Ok(FmdInventory { cases: fmd_cases, resources: fmd_resources }); + Inventory { baseboard, reservoir_size: ByteCount::from(1024), @@ -1116,6 +1139,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, - fmd: Ok(FmdInventory::default()), + fmd, } } 
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs index 887fb615599..8854d724f26 100644 --- a/nexus/types/src/inventory.rs +++ b/nexus/types/src/inventory.rs @@ -35,6 +35,8 @@ use serde_with::serde_as; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventory; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventoryResult; use sled_agent_types_versions::latest::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types_versions::latest::inventory::FmdInventory; +use sled_agent_types_versions::latest::inventory::FmdInventoryError; use sled_agent_types_versions::latest::inventory::InventoryDataset; use sled_agent_types_versions::latest::inventory::InventoryDisk; use sled_agent_types_versions::latest::inventory::InventoryZpool; @@ -649,6 +651,7 @@ pub struct SledAgent { pub file_source_resolver: OmicronFileSourceResolverInventory, pub smf_services_enabled_not_online: SvcsEnabledNotOnlineResult, pub reference_measurements: IdOrdMap, + pub fmd: Result, } impl IdOrdItem for SledAgent { diff --git a/nexus/types/src/inventory/display.rs b/nexus/types/src/inventory/display.rs index 0964ba23d92..a3848992852 100644 --- a/nexus/types/src/inventory/display.rs +++ b/nexus/types/src/inventory/display.rs @@ -625,6 +625,7 @@ fn display_sleds( file_source_resolver, smf_services_enabled_not_online, reference_measurements, + fmd, } = sled; writeln!( @@ -914,6 +915,14 @@ fn display_sleds( } } + writeln!(indented, "fmd:")?; + let mut indent2 = IndentWriter::new(" ", &mut indented); + write!( + indent2, + "{}", + sled_agent_types_versions::latest::inventory::FmdInventoryResultDisplay::new(fmd), + )?; + f = indented.into_inner(); display_svcs_enabled_not_online(smf_services_enabled_not_online, f)?; } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 4a9a2400c93..97621190020 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -5228,6 +5228,73 @@ CREATE TABLE IF NOT EXISTS 
omicron.public.inv_svc_enabled_not_online_parse_error PRIMARY KEY (inv_collection_id, sled_id, id) ); +CREATE TYPE IF NOT EXISTS omicron.public.fmd_inventory_error_kind AS ENUM ( + -- Catch-all for FMD-side failures: daemon unreachable, listing cases + -- or resources failed, or the platform doesn't have FMD at all. The + -- accompanying `error_message` carries specifics. + 'fmd_error', + -- Number of FMD cases reported by the sled exceeded the producer's + -- limit; no partial data is recorded. + 'too_many_cases', + -- Number of FMD resources reported by the sled exceeded the limit. + 'too_many_resources' +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + -- Classifies the failure mode when FMD inventory collection failed. + -- NULL iff `error_message` is NULL (FMD was successfully collected). + error_kind omicron.public.fmd_inventory_error_kind, + -- Display() of the original error; informational only, do not parse. + -- The `error_kind` discriminator is the structured signal. + -- NULL iff `error_kind` is NULL. + error_message TEXT, + + CONSTRAINT error_kind_and_message_together CHECK ( + (error_kind IS NULL) = (error_message IS NULL) + ), + + PRIMARY KEY (inv_collection_id, sled_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + case_id UUID NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). 
+ event JSONB, + + PRIMARY KEY (inv_collection_id, sled_id, case_id) +); + +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + fmri TEXT NOT NULL, + -- (foreign key into `inv_fmd_host_case`, with the same + -- (inv_collection_id, sled_id)) + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); + /* * Various runtime configuration switches for reconfigurator * @@ -8587,7 +8654,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '258.0.0', NULL) + (TRUE, NOW(), NOW(), '259.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/inv-fmd/up01.sql b/schema/crdb/inv-fmd/up01.sql new file mode 100644 index 00000000000..3aac46a3768 --- /dev/null +++ b/schema/crdb/inv-fmd/up01.sql @@ -0,0 +1,11 @@ +CREATE TYPE IF NOT EXISTS omicron.public.fmd_inventory_error_kind AS ENUM ( + -- Catch-all for FMD-side failures: daemon unreachable, listing cases + -- or resources failed, or the platform doesn't have FMD at all. The + -- accompanying `error_message` carries specifics. + 'fmd_error', + -- Number of FMD cases reported by the sled exceeded the producer's + -- limit; no partial data is recorded. + 'too_many_cases', + -- Number of FMD resources reported by the sled exceeded the limit. 
+ 'too_many_resources' +); diff --git a/schema/crdb/inv-fmd/up02.sql b/schema/crdb/inv-fmd/up02.sql new file mode 100644 index 00000000000..13f5f65a3b6 --- /dev/null +++ b/schema/crdb/inv-fmd/up02.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_status ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + -- Classifies the failure mode when FMD inventory collection failed. + -- NULL iff `error_message` is NULL (FMD was successfully collected). + error_kind omicron.public.fmd_inventory_error_kind, + -- Display() of the original error; informational only, do not parse. + -- The `error_kind` discriminator is the structured signal. + -- NULL iff `error_kind` is NULL. + error_message TEXT, + + CONSTRAINT error_kind_and_message_together CHECK ( + (error_kind IS NULL) = (error_message IS NULL) + ), + + PRIMARY KEY (inv_collection_id, sled_id) +); diff --git a/schema/crdb/inv-fmd/up03.sql b/schema/crdb/inv-fmd/up03.sql new file mode 100644 index 00000000000..daba123b7e4 --- /dev/null +++ b/schema/crdb/inv-fmd/up03.sql @@ -0,0 +1,15 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_host_case ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + case_id UUID NOT NULL, + code TEXT NOT NULL, + url TEXT NOT NULL, + -- The full FMD fault event payload as JSON, if present. Stored as + -- JSONB without parsing — Nexus does not interpret the FMD event + -- schema; it round-trips verbatim for downstream tooling (e.g. omdb). 
+ event JSONB, + + PRIMARY KEY (inv_collection_id, sled_id, case_id) +); diff --git a/schema/crdb/inv-fmd/up04.sql b/schema/crdb/inv-fmd/up04.sql new file mode 100644 index 00000000000..bb6ee964f24 --- /dev/null +++ b/schema/crdb/inv-fmd/up04.sql @@ -0,0 +1,18 @@ +CREATE TABLE IF NOT EXISTS omicron.public.inv_fmd_resource ( + -- (foreign key into `inv_collection` table) + inv_collection_id UUID NOT NULL, + -- guaranteed to match a row in this collection's `inv_sled_agent` + sled_id UUID NOT NULL, + resource_id UUID NOT NULL, + -- Fault Management Resource Identifier + -- (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + fmri TEXT NOT NULL, + -- (foreign key into `inv_fmd_host_case`, with the same + -- (inv_collection_id, sled_id)) + case_id UUID NOT NULL, + faulty BOOL NOT NULL, + unusable BOOL NOT NULL, + invisible BOOL NOT NULL, + + PRIMARY KEY (inv_collection_id, sled_id, resource_id) +); diff --git a/sled-agent/types/versions/src/impls/inventory.rs b/sled-agent/types/versions/src/impls/inventory.rs index 6a4fcb9a87a..bca742e28d1 100644 --- a/sled-agent/types/versions/src/impls/inventory.rs +++ b/sled-agent/types/versions/src/impls/inventory.rs @@ -21,9 +21,10 @@ use tufaceous_artifact::{ArtifactHash, KnownArtifactKind}; use crate::latest::inventory::{ BootImageHeader, BootPartitionContents, BootPartitionDetails, - ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - HostPhase2DesiredContents, HostPhase2DesiredSlots, ManifestBootInventory, - ManifestInventory, ManifestNonBootInventory, MupdateOverrideBootInventory, + ConfigReconcilerInventory, ConfigReconcilerInventoryResult, FmdHostCase, + FmdInventory, FmdInventoryError, FmdResource, HostPhase2DesiredContents, + HostPhase2DesiredSlots, ManifestBootInventory, ManifestInventory, + ManifestNonBootInventory, MupdateOverrideBootInventory, MupdateOverrideInventory, MupdateOverrideNonBootInventory, NetworkInterface, OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZoneConfig, OmicronZoneImageSource, 
OmicronZoneType,
@@ -910,6 +911,126 @@ impl fmt::Display for SingleMeasurementInventoryDisplay<'_> {
     }
 }
 
+/// Displays the outcome of FMD inventory collection for a sled.
+///
+/// Wraps a borrowed `Result` so that both a collected [`FmdInventory`] and
+/// a collection failure can be rendered through a single `Display` impl.
+pub struct FmdInventoryResultDisplay<'a> {
+    inner: &'a Result<FmdInventory, FmdInventoryError>,
+}
+
+impl<'a> FmdInventoryResultDisplay<'a> {
+    /// Creates a displayer borrowing `result`.
+    pub fn new(result: &'a Result<FmdInventory, FmdInventoryError>) -> Self {
+        Self { inner: result }
+    }
+}
+
+impl fmt::Display for FmdInventoryResultDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self.inner {
+            // `FmdInventoryDisplay` terminates its own output with a
+            // newline, so plain `write!` is used here.
+            Ok(inv) => write!(f, "{}", inv.display()),
+            Err(err) => writeln!(f, "FMD collection failed: {err}"),
+        }
+    }
+}
+
+impl FmdInventory {
+    /// Returns a displayer that renders this inventory as text.
+    pub fn display(&self) -> FmdInventoryDisplay<'_> {
+        FmdInventoryDisplay { inner: self }
+    }
+}
+
+/// Displayer for [`FmdInventory`].
+pub struct FmdInventoryDisplay<'a> {
+    inner: &'a FmdInventory,
+}
+
+impl fmt::Display for FmdInventoryDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let FmdInventory { cases, resources } = self.inner;
+        if cases.is_empty() && resources.is_empty() {
+            return writeln!(f, "no faults reported");
+        }
+        writeln!(f, "cases ({}):", cases.len())?;
+        for case in cases {
+            // Reborrow `f` so it is usable again once `indent` is dropped.
+            let mut indent = IndentWriter::new(" ", &mut *f);
+            write!(indent, "{}", case.display())?;
+        }
+        writeln!(f, "resources ({}):", resources.len())?;
+        for resource in resources {
+            let mut indent = IndentWriter::new(" ", &mut *f);
+            write!(indent, "{}", resource.display())?;
+        }
+        Ok(())
+    }
+}
+
+impl FmdHostCase {
+    /// Returns a displayer that renders this case as text.
+    pub fn display(&self) -> FmdHostCaseDisplay<'_> {
+        FmdHostCaseDisplay { inner: self }
+    }
+}
+
+/// Displayer for [`FmdHostCase`].
+pub struct FmdHostCaseDisplay<'a> {
+    inner: &'a FmdHostCase,
+}
+
+impl fmt::Display for FmdHostCaseDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let FmdHostCase { uuid, code, url, event } = self.inner;
+        writeln!(f, "case {uuid} ({code})")?;
+        writeln!(f, " url: {url}")?;
+        // The event payload is the FMD nvlist serialized to JSON. We
+        // intentionally do not interpret it; round-trip pretty-printing
+        // is enough to make it human-readable.
+        if let Some(event) = event {
+            match serde_json::to_string_pretty(event) {
+                Ok(rendered) => {
+                    writeln!(f, " event:")?;
+                    let mut indent = IndentWriter::new(" ", &mut *f);
+                    writeln!(indent, "{rendered}")?;
+                }
+                // Include the serialization error in the output rather
+                // than discarding it, so the failure can be diagnosed.
+                Err(err) => writeln!(f, " event: <unserializable: {err}>")?,
+            }
+        }
+        Ok(())
+    }
+}
+
+impl FmdResource {
+    /// Returns a displayer that renders this resource as text.
+    pub fn display(&self) -> FmdResourceDisplay<'_> {
+        FmdResourceDisplay { inner: self }
+    }
+}
+
+/// Displayer for [`FmdResource`].
+pub struct FmdResourceDisplay<'a> {
+    inner: &'a FmdResource,
+}
+
+impl fmt::Display for FmdResourceDisplay<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let FmdResource { uuid, fmri, case_id, faulty, unusable, invisible } =
+            self.inner;
+        writeln!(f, "resource {uuid} (case {case_id})")?;
+        writeln!(f, " fmri: {fmri}")?;
+        writeln!(
+            f,
+            " faulty: {faulty}, unusable: {unusable}, invisible: {invisible}"
+        )
+    }
+}
+
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 #[error("unrecognized zpool health value `{0}`")]
 pub struct ZpoolHealthParseError(pub String);
diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs
index 60591115103..d7369cda2e6 100644
--- a/sled-agent/types/versions/src/latest.rs
+++ b/sled-agent/types/versions/src/latest.rs
@@ -192,6 +192,10 @@ pub mod inventory {
     pub use crate::v40::inventory::FmdResource;
     pub use crate::v40::inventory::Inventory;
 
+    pub use crate::impls::inventory::FmdHostCaseDisplay;
+    pub use crate::impls::inventory::FmdInventoryDisplay;
+    pub use crate::impls::inventory::FmdInventoryResultDisplay;
+    pub use crate::impls::inventory::FmdResourceDisplay;
     pub use crate::impls::inventory::ManifestBootInventoryDisplay;
     pub use crate::impls::inventory::ManifestInventoryDisplay;
     pub use crate::impls::inventory::ManifestNonBootInventoryDisplay;