diff --git a/.cargo/xtask.toml b/.cargo/xtask.toml index 0a5c6960a0b..76d7b342d94 100644 --- a/.cargo/xtask.toml +++ b/.cargo/xtask.toml @@ -50,3 +50,12 @@ binary_allow_list = [ "sled-agent", "sled-agent-sim", ] + +# libfmd_adm is the illumos Fault Management Daemon admin library, used by +# sled-agent to collect FMD case/resource information for inventory. +[libraries."libfmd_adm.so.1"] +binary_allow_list = [ + "omicron-dev", + "sled-agent", + "sled-agent-sim", +] diff --git a/Cargo.lock b/Cargo.lock index 70a728f0dc7..38646cfc1d9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3512,6 +3512,7 @@ dependencies = [ "clap", "colored 2.2.0", "dhcproto", + "fmd-adm-sys", "futures", "hickory-resolver 0.25.2", "http", @@ -3522,6 +3523,7 @@ dependencies = [ "ispf", "macaddr", "nexus-lockstep-client", + "omicron-rpaths", "omicron-sled-agent", "omicron-test-utils", "omicron-uuid-kinds", @@ -3812,6 +3814,23 @@ dependencies = [ "spin", ] +[[package]] +name = "fmd-adm" +version = "0.3.0" +source = "git+https://github.com/oxidecomputer/fmd-adm?rev=846361bf0a698a8c7efefd97b2828b9aa74858c4#846361bf0a698a8c7efefd97b2828b9aa74858c4" +dependencies = [ + "fmd-adm-sys", + "illumos-nvpair", + "libc", + "thiserror 2.0.18", + "uuid", +] + +[[package]] +name = "fmd-adm-sys" +version = "0.4.1" +source = "git+https://github.com/oxidecomputer/fmd-adm?rev=846361bf0a698a8c7efefd97b2828b9aa74858c4#846361bf0a698a8c7efefd97b2828b9aa74858c4" + [[package]] name = "fnv" version = "1.0.7" @@ -5286,6 +5305,21 @@ dependencies = [ "num_enum 0.5.11", ] +[[package]] +name = "illumos-nvpair" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fe4718e52c3654c4e6c6501ce8387d334acd0910b7b7bb3a554aad0262d09f" +dependencies = [ + "illumos-nvpair-sys", +] + +[[package]] +name = "illumos-nvpair-sys" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4b8219c9c8c2c844dfd5772ec0bda5cd2a81d78c4579aba97f699721d46ab24" + 
[[package]] name = "illumos-sys-hdrs" version = "0.1.0" @@ -6977,6 +7011,7 @@ dependencies = [ "dropshot 0.17.0", "ereport-types", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-types", @@ -7169,6 +7204,7 @@ dependencies = [ "cockroach-admin-types", "dns-service-client", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-messages", @@ -7183,6 +7219,7 @@ dependencies = [ "ntp-admin-client", "omicron-cockroach-metrics", "omicron-common", + "omicron-rpaths", "omicron-sled-agent", "omicron-uuid-kinds", "omicron-workspace-hack", @@ -7267,6 +7304,7 @@ dependencies = [ "async-bb8-diesel", "chrono", "diesel", + "fmd-adm-sys", "futures", "httptest", "ipnetwork", @@ -7388,6 +7426,7 @@ dependencies = [ "camino", "camino-tempfile", "clap", + "fmd-adm-sys", "nexus-db-queries", "nexus-lockstep-client", "nexus-reconfigurator-preparation", @@ -7424,6 +7463,7 @@ dependencies = [ "clickhouse-admin-types", "cockroach-admin-client", "diesel", + "fmd-adm-sys", "futures", "httptest", "iddqd", @@ -7608,6 +7648,7 @@ name = "nexus-saga-recovery" version = "0.1.0" dependencies = [ "chrono", + "fmd-adm-sys", "futures", "nexus-auth", "nexus-db-model", @@ -7664,6 +7705,7 @@ dependencies = [ "dns-service-client", "dpd-client 0.1.0 (git+https://github.com/oxidecomputer/dendrite?rev=187aee7de2e50f907099ea06c04aac96c3455665)", "dropshot 0.17.0", + "fmd-adm-sys", "futures", "gateway-messages", "gateway-test-utils", @@ -8267,6 +8309,7 @@ dependencies = [ "csv", "dropshot 0.17.0", "expectorate", + "fmd-adm-sys", "http", "illumos-utils", "nexus-test-utils", @@ -8408,6 +8451,7 @@ dependencies = [ "clap", "dropshot 0.17.0", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-test-utils", @@ -8636,6 +8680,7 @@ dependencies = [ "ereport-types", "expectorate", "fatfs", + "fmd-adm-sys", "futures", "gateway-client", "gateway-messages", @@ -8795,6 +8840,7 @@ dependencies = [ "clap", "dropshot 0.17.0", "expectorate", + "fmd-adm-sys", "http", 
"nexus-test-utils", "ntp-admin-api", @@ -8851,6 +8897,7 @@ dependencies = [ "dyn-clone", "ereport-types", "expectorate", + "fmd-adm-sys", "futures", "gateway-client", "gateway-messages", @@ -9131,6 +9178,8 @@ dependencies = [ "expectorate", "flate2", "flume", + "fmd-adm", + "fmd-adm-sys", "futures", "glob", "guppy", @@ -9160,6 +9209,7 @@ dependencies = [ "omicron-common", "omicron-ddm-admin-client", "omicron-ledger", + "omicron-rpaths", "omicron-test-utils", "omicron-uuid-kinds", "omicron-workspace-hack", diff --git a/Cargo.toml b/Cargo.toml index 7780221d91f..3229f3fdbc5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -510,6 +510,8 @@ filetime = "0.2.26" flate2 = "1.1.2" float-ord = "0.3.2" flume = "0.11.1" +fmd-adm = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "846361bf0a698a8c7efefd97b2828b9aa74858c4" } +fmd-adm-sys = { git = "https://github.com/oxidecomputer/fmd-adm", rev = "846361bf0a698a8c7efefd97b2828b9aa74858c4" } foreign-types = "0.3.2" fs-err = "3.1.1" futures = "0.3.31" diff --git a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index f1e777f70cd..1293d57c4c4 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -60,6 +60,11 @@ progenitor::generate_api!( ExternalIpConfig = sled_agent_types_versions::latest::instance::ExternalIpConfig, ExternalIpv4Config = sled_agent_types_versions::latest::instance::ExternalIpv4Config, ExternalIpv6Config = sled_agent_types_versions::latest::instance::ExternalIpv6Config, + FmdHostCase = sled_agent_types_versions::latest::inventory::FmdHostCase, + FmdInventory = sled_agent_types_versions::latest::inventory::FmdInventory, + FmdInventoryError = sled_agent_types_versions::latest::inventory::FmdInventoryError, + FmdInventoryErrorKind = sled_agent_types_versions::latest::inventory::FmdInventoryErrorKind, + FmdResource = sled_agent_types_versions::latest::inventory::FmdResource, Generation = omicron_common::api::external::Generation, Hostname = 
omicron_common::api::external::Hostname, ImportExportPolicy = sled_agent_types_versions::latest::early_networking::ImportExportPolicy, diff --git a/cockroach-admin/Cargo.toml b/cockroach-admin/Cargo.toml index d3a7fca7713..903968981b5 100644 --- a/cockroach-admin/Cargo.toml +++ b/cockroach-admin/Cargo.toml @@ -39,6 +39,10 @@ toml.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true nexus-test-utils.workspace = true diff --git a/dev-tools/omdb/Cargo.toml b/dev-tools/omdb/Cargo.toml index 27de5c27917..aff8370a912 100644 --- a/dev-tools/omdb/Cargo.toml +++ b/dev-tools/omdb/Cargo.toml @@ -95,6 +95,10 @@ update-engine.workspace = true url.workspace = true uuid.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] camino-tempfile.workspace = true expectorate.workspace = true diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 46c44e484ad..a21ed7e90a6 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -31,6 +31,10 @@ signal-hook-tokio.workspace = true tokio.workspace = true toml.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true omicron-dev-lib.workspace = true diff --git a/end-to-end-tests/Cargo.toml b/end-to-end-tests/Cargo.toml index 630be83c57c..cc6ee0c30a4 100644 --- a/end-to-end-tests/Cargo.toml +++ b/end-to-end-tests/Cargo.toml @@ -3,10 +3,14 @@ name = "end-to-end-tests" version = "0.1.0" edition.workspace = true license = "MPL-2.0" +build = "build.rs" [lints] workspace = true +[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anstyle.workspace = true anyhow = { workspace = true, features = ["backtrace"] } @@ -50,3 +54,7 @@ thiserror.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } toml.workspace = true uuid.workspace = true + +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true diff --git a/end-to-end-tests/build.rs b/end-to-end-tests/build.rs new file mode 100644 index 00000000000..1ba9acd41c9 --- /dev/null +++ b/end-to-end-tests/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. +fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 6e072b2ec73..acb8784355c 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -166,6 +166,10 @@ omicron-workspace-hack.workspace = true omicron-uuid-kinds.workspace = true zip = { workspace = true, features = ["jiff-02"] } +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] async-bb8-diesel.workspace = true camino-tempfile.workspace = true diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index 1009e73ae92..476c2cec224 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -84,6 +84,10 @@ omicron-workspace-hack.workspace = true # only enabled during tests or via the `testing` feature omicron-test-utils = { workspace = true, optional = true } +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [features] # Enable to export `TestDatabase` testing = ["omicron-test-utils"] diff --git a/nexus/db-queries/src/db/datastore/physical_disk.rs b/nexus/db-queries/src/db/datastore/physical_disk.rs index 52911703e06..d314ed5454e 100644 --- a/nexus/db-queries/src/db/datastore/physical_disk.rs +++ b/nexus/db-queries/src/db/datastore/physical_disk.rs @@ -773,9 +773,9 @@ mod test { use omicron_test_utils::dev; use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, Inventory, InventoryDisk, - OmicronFileSourceResolverInventory, SledCpuFamily, SledRole, - SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, Inventory, + InventoryDisk, OmicronFileSourceResolverInventory, SledCpuFamily, + SledRole, SvcsEnabledNotOnlineResult, }; use std::num::NonZeroU32; @@ -1137,6 +1137,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: Ok(FmdInventory::default()), }, ) .unwrap(); diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml index bb00753173d..19ac5173231 100644 --- a/nexus/inventory/Cargo.toml +++ b/nexus/inventory/Cargo.toml @@ -3,10 +3,14 @@ name = "nexus-inventory" version = "0.1.0" edition.workspace = true license = "MPL-2.0" +build = "build.rs" [lints] workspace = true 
+[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anyhow.workspace = true base64.workspace = true @@ -46,6 +50,10 @@ uuid.workspace = true omicron-cockroach-metrics.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true gateway-test-utils.workspace = true diff --git a/nexus/inventory/build.rs b/nexus/inventory/build.rs new file mode 100644 index 00000000000..1ba9acd41c9 --- /dev/null +++ b/nexus/inventory/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. 
+fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs index 289ffc99eb4..90c502b677b 100644 --- a/nexus/inventory/src/examples.rs +++ b/nexus/inventory/src/examples.rs @@ -51,6 +51,7 @@ use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryResult; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::InventoryDataset; @@ -1115,5 +1116,6 @@ pub fn sled_agent( file_source_resolver, smf_services_enabled_not_online, reference_measurements, + fmd: Ok(FmdInventory::default()), } } diff --git a/nexus/metrics-producer-gc/Cargo.toml b/nexus/metrics-producer-gc/Cargo.toml index b211ea685d4..5e6519d521c 100644 --- a/nexus/metrics-producer-gc/Cargo.toml +++ b/nexus/metrics-producer-gc/Cargo.toml @@ -27,6 +27,10 @@ pq-sys = "*" omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] async-bb8-diesel.workspace = true diesel.workspace = true diff --git a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs index a0db9244248..0cede8c217c 100644 --- a/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs +++ b/nexus/mgs-updates/src/test_util/host_phase_2_test_state.rs @@ -239,6 +239,7 @@ mod api_impl { use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; + use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::HostPhase2DesiredContents; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; @@ -377,6 +378,7 @@ mod api_impl { remove_mupdate_override: None, boot_partitions, }), + fmd: Ok(FmdInventory::default()), file_source_resolver: OmicronFileSourceResolverInventory { zone_manifest: ManifestInventory { boot_disk_path: Utf8PathBuf::new(), diff --git a/nexus/reconfigurator/cli-integration-tests/Cargo.toml b/nexus/reconfigurator/cli-integration-tests/Cargo.toml index 960cf94c953..6ce589023c3 100644 --- a/nexus/reconfigurator/cli-integration-tests/Cargo.toml +++ b/nexus/reconfigurator/cli-integration-tests/Cargo.toml @@ -19,6 +19,10 @@ omicron-workspace-hack.workspace = true pq-sys = "*" reconfigurator-cli.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [dev-dependencies] camino.workspace = true camino-tempfile.workspace = true diff --git a/nexus/reconfigurator/execution/Cargo.toml b/nexus/reconfigurator/execution/Cargo.toml index 30339ca61b4..430e4664b9a 100644 --- a/nexus/reconfigurator/execution/Cargo.toml +++ b/nexus/reconfigurator/execution/Cargo.toml @@ -49,6 +49,10 @@ pq-sys = "*" omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] async-bb8-diesel.workspace = true diesel.workspace = true diff --git a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs index 02664e598d4..20f450ec294 100644 --- a/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs +++ b/nexus/reconfigurator/planning/src/mgs_updates/test_helpers.rs @@ -39,6 +39,7 @@ use sled_agent_types::inventory::BootPartitionContents; use sled_agent_types::inventory::BootPartitionDetails; use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::HostPhase2DesiredSlots; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; @@ -1374,6 +1375,7 @@ impl<'a> TestBoardCollectionBuilder<'a> { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: Ok(FmdInventory::default()), }, ) .unwrap(); diff --git a/nexus/reconfigurator/planning/src/system.rs b/nexus/reconfigurator/planning/src/system.rs index 93d7f1d9bed..b13104d8ce9 100644 --- a/nexus/reconfigurator/planning/src/system.rs +++ b/nexus/reconfigurator/planning/src/system.rs @@ -61,6 +61,7 @@ use omicron_uuid_kinds::ZpoolUuid; use sled_agent_types::inventory::Baseboard; 
use sled_agent_types::inventory::ConfigReconcilerInventory; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; +use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::Inventory; use sled_agent_types::inventory::InventoryDataset; use sled_agent_types::inventory::InventoryDisk; @@ -1505,6 +1506,7 @@ impl Sled { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: iddqd::IdOrdMap::new(), + fmd: Ok(FmdInventory::default()), } }; @@ -1689,6 +1691,7 @@ impl Sled { reference_measurements: inv_sled_agent .reference_measurements .clone(), + fmd: Ok(FmdInventory::default()), }; Sled { diff --git a/nexus/saga-recovery/Cargo.toml b/nexus/saga-recovery/Cargo.toml index 7154496580e..978407c111f 100644 --- a/nexus/saga-recovery/Cargo.toml +++ b/nexus/saga-recovery/Cargo.toml @@ -26,6 +26,10 @@ tokio.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] nexus-auth.workspace = true nexus-db-queries.workspace = true diff --git a/nexus/test-utils/Cargo.toml b/nexus/test-utils/Cargo.toml index 6382048b13e..4cd399aa910 100644 --- a/nexus/test-utils/Cargo.toml +++ b/nexus/test-utils/Cargo.toml @@ -66,5 +66,9 @@ tokio-util.workspace = true transient-dns-server.workspace = true uuid.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. 
+fmd-adm-sys.workspace = true + [features] omicron-dev = ["omicron-test-utils/seed-gen"] diff --git a/ntp-admin/Cargo.toml b/ntp-admin/Cargo.toml index 3d0a4aa3bd7..33545bb3790 100644 --- a/ntp-admin/Cargo.toml +++ b/ntp-admin/Cargo.toml @@ -35,6 +35,10 @@ toml.workspace = true omicron-workspace-hack.workspace = true +[target.'cfg(target_os = "illumos")'.dependencies] +# See omicron-rpaths for more about the "fmd-adm-sys" dependency. +fmd-adm-sys.workspace = true + [dev-dependencies] expectorate.workspace = true nexus-test-utils.workspace = true diff --git a/openapi/sled-agent/sled-agent-39.0.0-d75fec.json.gitstub b/openapi/sled-agent/sled-agent-39.0.0-d75fec.json.gitstub new file mode 100644 index 00000000000..21dea3a9d5c --- /dev/null +++ b/openapi/sled-agent/sled-agent-39.0.0-d75fec.json.gitstub @@ -0,0 +1 @@ +a2d3940c5aebc875c714c64a75f4383325b78d82:openapi/sled-agent/sled-agent-39.0.0-d75fec.json diff --git a/openapi/sled-agent/sled-agent-39.0.0-d75fec.json b/openapi/sled-agent/sled-agent-40.0.0-600e45.json similarity index 97% rename from openapi/sled-agent/sled-agent-39.0.0-d75fec.json rename to openapi/sled-agent/sled-agent-40.0.0-600e45.json index ea24de5ff0b..fa6c04f9104 100644 --- a/openapi/sled-agent/sled-agent-39.0.0-d75fec.json +++ b/openapi/sled-agent/sled-agent-40.0.0-600e45.json @@ -7,7 +7,7 @@ "url": "https://oxide.computer", "email": "api@oxide.computer" }, - "version": "39.0.0" + "version": "40.0.0" }, "paths": { "/artifacts": { @@ -5392,6 +5392,189 @@ ], "additionalProperties": false }, + "FmdHostCase": { + "description": "A diagnosed fault case from the illumos Fault Management Daemon on a sled.", + "type": "object", + "properties": { + "code": { + "description": "Diagnostic code (e.g. \"PCIEX-8000-DJ\").", + "type": "string" + }, + "event": { + "nullable": true, + "description": "Full fault event payload as JSON, if present. Contains the fault-list with classes, certainties, affected FMRIs, and other diagnostic detail." 
+ }, + "url": { + "description": "URL for human-readable information about this fault (e.g. `http://illumos.org/msg/PCIEX-8000-DJ`).", + "type": "string" + }, + "uuid": { + "description": "Unique identifier for this case.", + "allOf": [ + { + "$ref": "#/components/schemas/FmdHostCaseUuid" + } + ] + } + }, + "required": [ + "code", + "url", + "uuid" + ] + }, + "FmdHostCaseUuid": { + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::FmdHostCaseUuid", + "version": "*" + }, + "type": "string", + "format": "uuid" + }, + "FmdInventory": { + "description": "Successfully collected FMD fault data.", + "type": "object", + "properties": { + "cases": { + "title": "IdOrdMap", + "x-rust-type": { + "crate": "iddqd", + "parameters": [ + { + "$ref": "#/components/schemas/FmdHostCase" + } + ], + "path": "iddqd::IdOrdMap", + "version": "*" + }, + "type": "array", + "items": { + "$ref": "#/components/schemas/FmdHostCase" + }, + "uniqueItems": true + }, + "resources": { + "title": "IdOrdMap", + "x-rust-type": { + "crate": "iddqd", + "parameters": [ + { + "$ref": "#/components/schemas/FmdResource" + } + ], + "path": "iddqd::IdOrdMap", + "version": "*" + }, + "type": "array", + "items": { + "$ref": "#/components/schemas/FmdResource" + }, + "uniqueItems": true + } + }, + "required": [ + "cases", + "resources" + ] + }, + "FmdInventoryError": { + "description": "An error reported by sled-agent in place of an [`FmdInventory`].\n\n`kind` is a typed discriminator suitable for filtering / monitoring. 
`message` is a human-readable description (built via `Display`); it is informational only and should not be parsed.", + "type": "object", + "properties": { + "kind": { + "$ref": "#/components/schemas/FmdInventoryErrorKind" + }, + "message": { + "type": "string" + } + }, + "required": [ + "kind", + "message" + ] + }, + "FmdInventoryErrorKind": { + "description": "Classification of an [`FmdInventoryError`].\n\n`FmdError` is a catch-all for any FMD-side failure: the daemon was unreachable, a case/resource listing failed, or the platform doesn't have FMD at all. The accompanying message disambiguates these cases. `TooManyCases` and `TooManyResources` are first-class because exceeding those bounds is operationally distinct from a transient FMD failure.", + "oneOf": [ + { + "description": "Catch-all for FMD-side failures.", + "type": "string", + "enum": [ + "fmd_error" + ] + }, + { + "description": "Number of FMD cases exceeded [`FMD_MAX_CASES`].", + "type": "string", + "enum": [ + "too_many_cases" + ] + }, + { + "description": "Number of FMD resources exceeded [`FMD_MAX_RESOURCES`].", + "type": "string", + "enum": [ + "too_many_resources" + ] + } + ] + }, + "FmdResource": { + "description": "A resource affected by a diagnosed fault.", + "type": "object", + "properties": { + "case_id": { + "description": "UUID of the case that diagnosed this fault.", + "allOf": [ + { + "$ref": "#/components/schemas/FmdHostCaseUuid" + } + ] + }, + "faulty": { + "description": "Whether the resource is marked faulty.", + "type": "boolean" + }, + "fmri": { + "description": "Fault Management Resource Identifier (e.g. 
\"dev:////pci@af,0/pci1022,1483@3,5\").", + "type": "string" + }, + "invisible": { + "description": "Whether the resource is marked invisible.", + "type": "boolean" + }, + "unusable": { + "description": "Whether the resource is marked unusable.", + "type": "boolean" + }, + "uuid": { + "description": "Unique identifier for this resource entry.", + "allOf": [ + { + "$ref": "#/components/schemas/FmdResourceUuid" + } + ] + } + }, + "required": [ + "case_id", + "faulty", + "fmri", + "invisible", + "unusable", + "uuid" + ] + }, + "FmdResourceUuid": { + "x-rust-type": { + "crate": "omicron-uuid-kinds", + "path": "omicron_uuid_kinds::FmdResourceUuid", + "version": "*" + }, + "type": "string", + "format": "uuid" + }, "Generation": { "description": "Generation numbers stored in the database, used for optimistic concurrency control", "type": "integer", @@ -6028,6 +6211,45 @@ "file_source_resolver": { "$ref": "#/components/schemas/OmicronFileSourceResolverInventory" }, + "fmd": { + "x-rust-type": { + "crate": "std", + "parameters": [ + { + "$ref": "#/components/schemas/FmdInventory" + }, + { + "$ref": "#/components/schemas/FmdInventoryError" + } + ], + "path": "::std::result::Result", + "version": "*" + }, + "oneOf": [ + { + "type": "object", + "properties": { + "ok": { + "$ref": "#/components/schemas/FmdInventory" + } + }, + "required": [ + "ok" + ] + }, + { + "type": "object", + "properties": { + "err": { + "$ref": "#/components/schemas/FmdInventoryError" + } + }, + "required": [ + "err" + ] + } + ] + }, "last_reconciliation": { "nullable": true, "allOf": [ @@ -6101,6 +6323,7 @@ "datasets", "disks", "file_source_resolver", + "fmd", "reconciler_status", "reference_measurements", "reservoir_size", diff --git a/openapi/sled-agent/sled-agent-latest.json b/openapi/sled-agent/sled-agent-latest.json index b9bc5d0635c..5f70b30a1ee 120000 --- a/openapi/sled-agent/sled-agent-latest.json +++ b/openapi/sled-agent/sled-agent-latest.json @@ -1 +1 @@ -sled-agent-39.0.0-d75fec.json \ No 
newline at end of file +sled-agent-40.0.0-600e45.json \ No newline at end of file diff --git a/rpaths/src/lib.rs b/rpaths/src/lib.rs index 381b0e370e2..3f95f67e368 100644 --- a/rpaths/src/lib.rs +++ b/rpaths/src/lib.rs @@ -7,30 +7,40 @@ //! ## The least you need to know //! //! This build-time crate is used by several top-level Omicron crates to set -//! RPATH so that libpq can be found at runtime. This is necessary because these -//! crates depend on "diesel", which depends on "pq-sys", which links in "libpq". -//! But Cargo/Rust have no built-in way to set the RPATH so that libpq can -//! actually be found at runtime. (See below.) So we've developed the pattern -//! here instead. It works like this: +//! RPATH so that native libraries linked via *-sys crates can be found at +//! runtime. Currently we do this for two libraries: //! -//! 1. Any crate that depends on pq-sys, directly or not, needs to follow these -//! instructions. Generally, we depend on pq-sys _indirectly_, by virtue of -//! depending on Diesel. +//! - **libpq** (via pq-sys, pulled in by diesel) +//! - **libfmd_adm** (via fmd-adm-sys, pulled in by fmd-adm in sled-agent) +//! +//! Cargo/Rust have no built-in way to set the RPATH for a transitively-linked +//! native library. (See below.) So we've developed the pattern here instead. +//! It works like this: +//! +//! 1. Any crate that depends (directly or transitively) on a -sys crate from +//! the list above needs to follow these instructions. Often the dep is +//! indirect — pq-sys arrives via diesel, fmd-adm-sys via omicron-sled-agent. //! 2. Affected crates (e.g., omicron-nexus) have a build.rs that just calls //! `omicron_rpath::configure_default_omicron_rpaths()`. -//! 3. These crates must also add a dependency on "pq-sys", usually version "*". -//! (This dependency is unfortunate but necessary in order for us to get the -//! metadata emitted by pq-sys that tells it where it found libpq. Since we -//! 
don't directly use pq-sys in the crate, we don't care what version it is. -//! We specify "*" so that when Cargo dedups our dependency with the one in -//! Diesel, we pick up whatever would be picked up anyway, and we'll get its -//! metadata.) -//! 4. At the top level of Omicron (in the workspace Cargo.toml), we use a -//! patched version of pq-sys that emits metadata that's used by -//! `configure_default_omicron_rpaths()`. +//! 3. These crates must also add a *direct* dependency on the corresponding +//! -sys crate(s), usually version "*". This is unfortunate but necessary +//! so that Cargo exposes the `DEP_*_LIBDIRS` env var to our build.rs. +//! Since we don't actually use the -sys crate in the parent crate, we +//! don't care what version it is, and "*" lets Cargo dedup with whatever +//! the transitive dep already pulled in. Use a target-gated dep +//! (`[target.'cfg(target_os = "illumos")'.dependencies]`) when the +//! library only exists on illumos (e.g. fmd-adm-sys). +//! 4. The metadata that drives this comes from the -sys crate's build.rs: +//! - For pq-sys, we maintain a fork (see `[patch.crates-io.pq-sys]` +//! in the workspace Cargo.toml) that emits `cargo:LIBDIRS=...`. +//! - For fmd-adm-sys, the upstream crate emits the metadata directly, +//! so no patch is needed. //! -//! This crate is factored (over-engineered, really) so that we can extend this -//! pattern to other native libraries in the future. +//! `configure_default_omicron_rpaths()` scans for every `DEP_*_LIBDIRS` env +//! var in `RPATH_ENV_VARS`. Each crate's build.rs makes the same call — +//! only the env vars Cargo actually sets (corresponding to that crate's +//! direct deps) contribute RPATH entries, so callers don't have to know +//! which libraries they pull in. //! //! ## More details //! @@ -57,9 +67,10 @@ //! to include RPATH entries in the binary instead. //! //! As of 1.56, Cargo supports the "cargo:rustc-link-arg" instruction for use by -//! 
[Build Scripts][3] to pass arbitrary options to the linker. We use that here -//! to tell the linker to include the correct RPATH entry for our one native -//! dependency that's affected by this (libpq, exposed via the pq-sys package). +//! [Build Scripts][3] to pass arbitrary options to the linker. We use that +//! here to tell the linker to include the correct RPATH entries for the +//! native dependencies affected by this (currently libpq via pq-sys and +//! libfmd_adm via fmd-adm-sys). //! //! A subtle but critical point here is that the RPATH is knowable only by the //! system that's building the top-level executable binary. This mechanism can't @@ -94,10 +105,11 @@ /// Tells Cargo to pass linker arguments that specify the right RPATH for Omicron /// binaries -// This currently assumes that all Omicron binaries link to the same set of -// native libraries. As a result, we use a fixed list of libraries. In the -// future, if they depend on different combinations, we can accept different -// arguments here that specify exactly which ones are expected to be found. +// +// We scan a fixed set of `DEP_*_LIBDIRS` env vars (see `RPATH_ENV_VARS`). +// Only those that are actually set contribute to the RPATH — so each crate +// just calls this once, and only the libraries that crate actually depends +// on get configured. No per-caller customization needed. pub fn configure_default_omicron_rpaths() { internal::configure_default_omicron_rpaths(); // If no 'rerun-if-*' directives are emitted, cargo conservatively [1] @@ -131,6 +143,21 @@ mod internal { configure_rpaths_from_env_var(&mut rpaths, &env_var_name); } + // If none of the expected env vars were set, the caller opted into + // this plumbing but has no direct `*-sys` dep that would contribute. + // Only enforced on illumos: on Linux some `*-sys` deps (e.g. + // fmd-adm-sys) are target-gated to illumos, so a caller can + // legitimately contribute nothing on Linux. 
+ #[cfg(target_os = "illumos")] + assert!( + !rpaths.is_empty(), + "omicron-rpaths: configure_default_omicron_rpaths() was called \ + but none of {:?} were set. Add a direct dep on the \ + corresponding *-sys crate(s) to your Cargo.toml, or remove \ + the call from build.rs.", + RPATH_ENV_VARS, + ); + for r in rpaths { println!("{}", emit_rpath(&r)); } @@ -145,39 +172,27 @@ mod internal { /// variables may itself look like a path, not just a directory. That is, /// these are colon-separated lists of directories. /// - /// Currently, we only do this for libpq ("pq-sys" package), but this pattern - /// could be generalized for other native libraries. - pub static RPATH_ENV_VARS: &'static [&'static str] = &["DEP_PQ_LIBDIRS"]; + /// We scan all of these on every build.rs call. Only env vars that are + /// actually set contribute RPATH entries — a crate that doesn't depend on + /// (say) fmd-adm-sys simply won't have `DEP_FMD_ADM_LIBDIRS` set, and we + /// skip it silently. + pub static RPATH_ENV_VARS: &'static [&'static str] = + &["DEP_PQ_LIBDIRS", "DEP_FMD_ADM_LIBDIRS"]; /// Tells Cargo to pass linker arguments that specify RPATHs from the - /// environment variable `env_var_name` + /// environment variable `env_var_name`, if it is set. /// - /// Panics if the environment variable is not set or contains non-UTF8 data. - /// This might be surprising, since environment variables are optional in - /// most build-time mechanisms. We opt for strictness here because in fact - /// we _do_ expect these to always be set, and if they're not, it's most - /// likely that somebody has forgotten to include a required dependency. We - /// want to tell them that rather than silently produce unrunnable binaries. + /// If the env var is unset, this does nothing: the corresponding + /// dependency is not present in the current crate's dep tree. If the + /// crate *does* need that library at runtime, the missing dep will + /// surface as a build-time or runtime link failure later. 
pub fn configure_rpaths_from_env_var( rpaths: &mut Vec<String>, env_var_name: &OsStr, ) { - // If you see this message, that means that the build script for some - // Omicron crate is trying to configure RPATHs for a native library, but - // the environment variable that's supposed to contain the RPATH - // information for that library is unset. That most likely means that - // the crate you're building is lacking a direct dependency on the - // '*-sys' crate, or else that the '*-sys' crate's build script failed - // to set this metadata. - let env_var_value = - std::env::var_os(env_var_name).unwrap_or_else(|| { - panic!( - "omicron-rpaths: expected {:?} to be set in the \ - environment, but found it unset. (Is the current \ - crate missing a dependency on a *-sys crate?)", - env_var_name, - ) - }); + let Some(env_var_value) = std::env::var_os(env_var_name) else { + return; + }; configure_rpaths_from_path(rpaths, &env_var_value).unwrap_or_else( |error| { @@ -214,16 +229,17 @@ mod internal { use std::os::unix::ffi::OsStrExt; #[test] - #[should_panic = "omicron-rpaths: expected \"SHOULD_NOT_EXIST\" \ - to be set in the environment, but found it unset"] - fn test_configure_rpaths_from_bad_envvar() { + fn test_configure_rpaths_from_unset_envvar() { use super::configure_rpaths_from_env_var; + // Unset env vars are silently ignored: the dependency that + // would set them simply isn't in this crate's dep tree. 
let mut v = Vec::new(); configure_rpaths_from_env_var( &mut v, &OsString::from("SHOULD_NOT_EXIST"), ); + assert!(v.is_empty()); } #[test] diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index ddffa9ab618..0f73653213d 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -4,10 +4,14 @@ description = "Services for managing sled-local resources" version = "0.1.0" edition.workspace = true license = "MPL-2.0" +build = "build.rs" [lints] workspace = true +[build-dependencies] +omicron-rpaths.workspace = true + [dependencies] anyhow.workspace = true async-trait.workspace = true @@ -133,6 +137,10 @@ zip.workspace = true zone.workspace = true [target.'cfg(target_os = "illumos")'.dependencies] +fmd-adm.workspace = true +# Direct dep on fmd-adm-sys so this crate sees DEP_FMD_ADM_LIBDIRS in build.rs +# and can configure RPATH via omicron-rpaths. +fmd-adm-sys.workspace = true opte-ioctl.workspace = true [dev-dependencies] diff --git a/sled-agent/api/src/lib.rs b/sled-agent/api/src/lib.rs index e9bc63a6d99..0ee2567f5e1 100644 --- a/sled-agent/api/src/lib.rs +++ b/sled-agent/api/src/lib.rs @@ -21,7 +21,7 @@ use omicron_common::api::internal::{ }; use sled_agent_types_versions::{ latest, v1, v4, v6, v7, v9, v10, v11, v12, v14, v16, v17, v18, v20, v22, - v24, v25, v26, v28, v29, v30, v31, v33, v34, v39, + v24, v25, v26, v28, v29, v30, v31, v33, v34, v37, v39, }; use sled_diagnostics::SledDiagnosticsQueryOutput; use slog_error_chain::InlineErrorChain; @@ -38,6 +38,7 @@ api_versions!([ // | example for the next person. 
// v // (next_int, IDENT), + (40, ADD_FMD_TO_INVENTORY), (39, BOOTSTORE_SERVICE_NAT_GENERATION), (38, RENAME_PORT_FEC_SPEED_TO_LINK_FEC_SPEED), (37, MODIFY_SVC_ENABLED_NOT_ONLINE_STATE), @@ -1037,12 +1038,27 @@ pub trait SledAgentApi { #[endpoint { method = GET, path = "/inventory", - versions = VERSION_MODIFY_SVC_ENABLED_NOT_ONLINE_STATE.., + versions = VERSION_ADD_FMD_TO_INVENTORY.., }] async fn inventory( rqctx: RequestContext, ) -> Result, HttpError>; + /// Fetch basic information about this sled + #[endpoint { + operation_id = "inventory", + method = GET, + path = "/inventory", + versions = VERSION_MODIFY_SVC_ENABLED_NOT_ONLINE_STATE..VERSION_ADD_FMD_TO_INVENTORY, + }] + async fn inventory_v37( + rqctx: RequestContext, + ) -> Result, HttpError> { + Self::inventory(rqctx).await.map(|HttpResponseOk(inv)| { + HttpResponseOk(v37::inventory::Inventory::from(inv)) + }) + } + /// Fetch basic information about this sled #[endpoint { operation_id = "inventory", @@ -1053,7 +1069,7 @@ pub trait SledAgentApi { async fn inventory_v34( rqctx: RequestContext, ) -> Result, HttpError> { - Self::inventory(rqctx).await.map(|HttpResponseOk(inv)| { + Self::inventory_v37(rqctx).await.map(|HttpResponseOk(inv)| { HttpResponseOk(v34::inventory::Inventory::from(inv)) }) } diff --git a/sled-agent/build.rs b/sled-agent/build.rs new file mode 100644 index 00000000000..1ba9acd41c9 --- /dev/null +++ b/sled-agent/build.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// See omicron-rpaths for documentation. +// NOTE: This file MUST be kept in sync with the other build.rs files in this +// repository. 
+fn main() { + omicron_rpaths::configure_default_omicron_rpaths(); +} diff --git a/sled-agent/rack-setup/src/plan/service.rs b/sled-agent/rack-setup/src/plan/service.rs index 2ef6d79489a..48de2b1d3e9 100644 --- a/sled-agent/rack-setup/src/plan/service.rs +++ b/sled-agent/rack-setup/src/plan/service.rs @@ -1362,6 +1362,7 @@ mod tests { use oxnet::Ipv6Net; use sled_agent_types::early_networking::RackNetworkConfig; use sled_agent_types::inventory::ConfigReconcilerInventoryStatus; + use sled_agent_types::inventory::FmdInventory; use sled_agent_types::inventory::OmicronFileSourceResolverInventory; use sled_agent_types::inventory::SledCpuFamily; use sled_agent_types::inventory::SvcsEnabledNotOnlineResult; @@ -1534,6 +1535,7 @@ mod tests { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: Ok(FmdInventory::default()), }, is_scrimlet, ) diff --git a/sled-agent/rack-setup/src/service.rs b/sled-agent/rack-setup/src/service.rs index 1ed5e2fb32b..d4e18fefcd0 100644 --- a/sled-agent/rack-setup/src/service.rs +++ b/sled-agent/rack-setup/src/service.rs @@ -1896,9 +1896,10 @@ mod test { use sled_agent_types::{ early_networking::RackNetworkConfig, inventory::{ - Baseboard, ConfigReconcilerInventoryStatus, Inventory, - InventoryDisk, OmicronFileSourceResolverInventory, OmicronZoneType, - SledCpuFamily, SledRole, SvcsEnabledNotOnlineResult, + Baseboard, ConfigReconcilerInventoryStatus, FmdInventory, + Inventory, InventoryDisk, OmicronFileSourceResolverInventory, + OmicronZoneType, SledCpuFamily, SledRole, + SvcsEnabledNotOnlineResult, }, }; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -1959,6 +1960,7 @@ mod test { smf_services_enabled_not_online: SvcsEnabledNotOnlineResult::DataUnavailable, reference_measurements: IdOrdMap::new(), + fmd: Ok(FmdInventory::default()), }, true, ) diff --git a/sled-agent/src/fmd.rs b/sled-agent/src/fmd.rs new file mode 100644 index 00000000000..c7cd8e3a60d --- /dev/null +++ 
b/sled-agent/src/fmd.rs @@ -0,0 +1,322 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Collects fault information from the illumos Fault Management Daemon (FMD). + +use sled_agent_types::inventory::{FmdInventory, FmdInventoryError}; +use slog::Logger; + +#[cfg(target_os = "illumos")] +mod illumos { + use fmd_adm::{FmdAdm, InvisibleResources, NvList, NvValue}; + use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, GenericUuid}; + use sled_agent_types::inventory::{ + FMD_MAX_CASES, FMD_MAX_RESOURCES, FmdHostCase, FmdInventory, + FmdInventoryError, FmdInventoryErrorKind, FmdResource, + }; + use slog::Logger; + use slog::warn; + use slog_error_chain::InlineErrorChain; + + pub(super) fn nvvalue_to_json(value: &NvValue) -> serde_json::Value { + match value { + NvValue::Boolean => serde_json::Value::Bool(true), + NvValue::BooleanValue(b) => serde_json::Value::Bool(*b), + NvValue::Byte(n) => serde_json::json!(*n), + NvValue::Int8(n) => serde_json::json!(*n), + NvValue::UInt8(n) => serde_json::json!(*n), + NvValue::Int16(n) => serde_json::json!(*n), + NvValue::UInt16(n) => serde_json::json!(*n), + NvValue::Int32(n) => serde_json::json!(*n), + NvValue::UInt32(n) => serde_json::json!(*n), + NvValue::Int64(n) => serde_json::json!(*n), + NvValue::UInt64(n) => serde_json::json!(*n), + NvValue::Double(f) => serde_json::json!(*f), + NvValue::String(s) => serde_json::Value::String(s.clone()), + NvValue::Hrtime(n) => serde_json::json!(*n), + NvValue::NvList(nvl) => nvlist_to_json(nvl), + NvValue::BooleanArray(arr) => serde_json::json!(arr), + NvValue::ByteArray(arr) => serde_json::json!(arr), + NvValue::Int8Array(arr) => serde_json::json!(arr), + NvValue::UInt8Array(arr) => serde_json::json!(arr), + NvValue::Int16Array(arr) => serde_json::json!(arr), + NvValue::UInt16Array(arr) => serde_json::json!(arr), + 
NvValue::Int32Array(arr) => serde_json::json!(arr), + NvValue::UInt32Array(arr) => serde_json::json!(arr), + NvValue::Int64Array(arr) => serde_json::json!(arr), + NvValue::UInt64Array(arr) => serde_json::json!(arr), + NvValue::StringArray(arr) => serde_json::json!(arr), + NvValue::NvListArray(arr) => { + let items: Vec = + arr.iter().map(nvlist_to_json).collect(); + serde_json::Value::Array(items) + } + NvValue::Unknown { type_code } => { + serde_json::json!({ + "_unknown_type": format!("{type_code:?}") + }) + } + } + } + + pub(super) fn nvlist_to_json(nvl: &NvList) -> serde_json::Value { + let mut map = serde_json::Map::new(); + for (name, value) in nvl { + map.insert(name.to_string(), nvvalue_to_json(value)); + } + serde_json::Value::Object(map) + } + + pub(super) fn collect( + log: Logger, + ) -> Result { + let adm = match FmdAdm::open() { + Ok(adm) => adm, + Err(e) => { + let err = InlineErrorChain::new(&e); + warn!(log, "failed to open fmd"; &err); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: format!("failed to open fmd: {err}"), + }); + } + }; + + let raw_cases = match adm.cases(None) { + Ok(cases) => cases, + Err(e) => { + let err = InlineErrorChain::new(&e); + warn!(log, "failed to list fmd cases"; &err); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: format!("failed to list fmd cases: {err}"), + }); + } + }; + let case_count = raw_cases.len(); + if case_count as u64 > u64::from(FMD_MAX_CASES) { + warn!( + log, "too many fmd cases reported, refusing partial inventory"; + "count" => case_count, "limit" => FMD_MAX_CASES, + ); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::TooManyCases, + message: format!( + "too many fmd cases ({case_count} > limit {FMD_MAX_CASES})" + ), + }); + } + let cases: iddqd::IdOrdMap<_> = raw_cases + .into_iter() + .map(|c| { + let fmd_adm::CaseInfo { uuid, code, url, event } = c; + FmdHostCase { + uuid: 
FmdHostCaseUuid::from_untyped_uuid(uuid), + code, + url, + event: event.as_ref().map(nvlist_to_json), + } + }) + .collect(); + + let raw_resources = match adm.resources(InvisibleResources::Included) { + Ok(resources) => resources, + Err(e) => { + let err = InlineErrorChain::new(&e); + warn!(log, "failed to list fmd resources"; &err); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::FmdError, + message: format!("failed to list fmd resources: {err}"), + }); + } + }; + let resource_count = raw_resources.len(); + if resource_count as u64 > u64::from(FMD_MAX_RESOURCES) { + warn!( + log, + "too many fmd resources reported, refusing partial inventory"; + "count" => resource_count, "limit" => FMD_MAX_RESOURCES, + ); + return Err(FmdInventoryError { + kind: FmdInventoryErrorKind::TooManyResources, + message: format!( + "too many fmd resources \ + ({resource_count} > limit {FMD_MAX_RESOURCES})" + ), + }); + } + let resources: iddqd::IdOrdMap<_> = raw_resources + .into_iter() + .map(|r| { + let fmd_adm::ResourceInfo { + fmri, + uuid, + case, + faulty, + unusable, + invisible, + } = r; + FmdResource { + fmri, + uuid: FmdResourceUuid::from_untyped_uuid(uuid), + case_id: FmdHostCaseUuid::from_untyped_uuid(case), + faulty, + unusable, + invisible, + } + }) + .collect(); + + Ok(FmdInventory { cases, resources }) + } +} + +pub(crate) async fn collect_fmd_inventory( + log: &Logger, +) -> Result { + #[cfg(target_os = "illumos")] + { + // FMD queries go through door calls to fmd(1M) and can block, so run + // them on a blocking-friendly thread rather than stalling the runtime. + // The expect is safe: omicron compiles with `panic = "abort"`, so a + // panic inside the blocking task aborts the whole process and the + // `JoinHandle` should not be able to return `Err`. 
+ let log = log.clone(); + tokio::task::spawn_blocking(move || illumos::collect(log)) + .await + .expect("fmd collection task panicked") + } + #[cfg(not(target_os = "illumos"))] + { + let _ = log; + Err(FmdInventoryError { + kind: sled_agent_types::inventory::FmdInventoryErrorKind::FmdError, + message: "fmd not supported on this platform".to_string(), + }) + } +} + +#[cfg(test)] +#[cfg(target_os = "illumos")] +mod tests { + use super::illumos::nvvalue_to_json; + use fmd_adm::NvValue; + + #[test] + fn boolean_presence() { + assert_eq!(nvvalue_to_json(&NvValue::Boolean), serde_json::json!(true)); + } + + #[test] + fn boolean_value() { + assert_eq!( + nvvalue_to_json(&NvValue::BooleanValue(false)), + serde_json::json!(false), + ); + assert_eq!( + nvvalue_to_json(&NvValue::BooleanValue(true)), + serde_json::json!(true), + ); + } + + #[test] + fn integers() { + assert_eq!(nvvalue_to_json(&NvValue::Byte(42)), serde_json::json!(42)); + assert_eq!(nvvalue_to_json(&NvValue::Int8(-1)), serde_json::json!(-1)); + assert_eq!( + nvvalue_to_json(&NvValue::UInt8(255)), + serde_json::json!(255) + ); + assert_eq!( + nvvalue_to_json(&NvValue::Int16(-32000)), + serde_json::json!(-32000), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt16(65535)), + serde_json::json!(65535), + ); + assert_eq!( + nvvalue_to_json(&NvValue::Int32(-100_000)), + serde_json::json!(-100_000), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt32(4_000_000_000)), + serde_json::json!(4_000_000_000u64), + ); + assert_eq!( + nvvalue_to_json(&NvValue::Int64(i64::MIN)), + serde_json::json!(i64::MIN), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt64(u64::MAX)), + serde_json::json!(u64::MAX), + ); + } + + #[test] + fn double() { + assert_eq!( + nvvalue_to_json(&NvValue::Double(4.2069)), + serde_json::json!(4.2069), + ); + } + + #[test] + fn string() { + assert_eq!( + nvvalue_to_json(&NvValue::String("hello".to_string())), + serde_json::json!("hello"), + ); + } + + #[test] + fn hrtime() { + assert_eq!( + 
nvvalue_to_json(&NvValue::Hrtime(1_000_000_000)), + serde_json::json!(1_000_000_000i64), + ); + } + + #[test] + fn integer_arrays() { + assert_eq!( + nvvalue_to_json(&NvValue::Int32Array(vec![1, 2, 3])), + serde_json::json!([1, 2, 3]), + ); + assert_eq!( + nvvalue_to_json(&NvValue::UInt8Array(vec![0, 128, 255])), + serde_json::json!([0, 128, 255]), + ); + } + + #[test] + fn boolean_array() { + assert_eq!( + nvvalue_to_json(&NvValue::BooleanArray(vec![true, false, true])), + serde_json::json!([true, false, true]), + ); + } + + #[test] + fn string_array() { + assert_eq!( + nvvalue_to_json(&NvValue::StringArray(vec![ + "a".to_string(), + "b".to_string(), + ])), + serde_json::json!(["a", "b"]), + ); + } + + #[test] + fn unknown_type() { + // The type_code is a data_type_t from the illumos nvpair FFI. + // We just format it via Debug. + let val = NvValue::Unknown { type_code: 0 }; + let json = nvvalue_to_json(&val); + // Should be an object with a single "_unknown_type" key. + assert!(json.is_object()); + assert!(json.get("_unknown_type").unwrap().is_string()); + } +} diff --git a/sled-agent/src/lib.rs b/sled-agent/src/lib.rs index d6f7458aa8f..4db3cdfc4e1 100644 --- a/sled-agent/src/lib.rs +++ b/sled-agent/src/lib.rs @@ -20,6 +20,7 @@ mod backing_fs; pub mod bootstrap; pub mod config; mod ddm_reconciler; +mod fmd; pub(crate) mod hardware_monitor; mod http_entrypoints; mod instance; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index d92a837a403..a992f103d81 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -61,8 +61,8 @@ use sled_agent_types::instance::{ }; use sled_agent_types::inventory::{ ConfigReconcilerInventory, ConfigReconcilerInventoryResult, - ConfigReconcilerInventoryStatus, HostPhase2DesiredSlots, Inventory, - InventoryDataset, InventoryDisk, InventoryZpool, + ConfigReconcilerInventoryStatus, FmdInventory, HostPhase2DesiredSlots, + Inventory, InventoryDataset, InventoryDisk, 
InventoryZpool, OmicronFileSourceResolverInventory, OmicronSledConfig, OmicronZonesConfig, SingleMeasurementInventory, SledRole, ZpoolHealth, }; @@ -994,6 +994,7 @@ impl SledAgent { ), smf_services_enabled_not_online, reference_measurements, + fmd: Ok(FmdInventory::default()), }) } diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index 1d1717e39a0..e14491bf47f 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -1277,6 +1277,8 @@ impl SledAgent { let smf_services_enabled_not_online = self.inner.health_monitor.to_inventory(); + let fmd = crate::fmd::collect_fmd_inventory(&self.log).await; + let ReconcilerInventory { disks, zpools, @@ -1304,6 +1306,7 @@ impl SledAgent { file_source_resolver, smf_services_enabled_not_online, reference_measurements: self.inner.measurements.to_inventory(), + fmd, }) } diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs new file mode 100644 index 00000000000..9cb3034ecc5 --- /dev/null +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/inventory.rs @@ -0,0 +1,210 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. 
+ +use iddqd::{IdOrdItem, IdOrdMap, id_upcast}; +use omicron_common::api::external::ByteCount; +use omicron_common::snake_case_result; +use omicron_common::snake_case_result::SnakeCaseResult; +use omicron_uuid_kinds::{FmdHostCaseUuid, FmdResourceUuid, SledUuid}; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use sled_hardware_types::{Baseboard, SledCpuFamily}; +use std::net::SocketAddrV6; + +use crate::v1::inventory::InventoryDataset; +use crate::v1::inventory::InventoryDisk; +use crate::v1::inventory::SledRole; +use crate::v14::inventory::ConfigReconcilerInventoryStatus; +use crate::v14::inventory::OmicronFileSourceResolverInventory; +use crate::v14::inventory::OmicronSledConfig; +use crate::v16::inventory::ConfigReconcilerInventory; +use crate::v16::inventory::SingleMeasurementInventory; +use crate::v24::inventory::InventoryZpool; +use crate::v37; + +/// A diagnosed fault case from the illumos Fault Management Daemon on a sled. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub struct FmdHostCase { + /// Unique identifier for this case. + pub uuid: FmdHostCaseUuid, + /// Diagnostic code (e.g. "PCIEX-8000-DJ"). + pub code: String, + /// URL for human-readable information about this fault + /// (e.g. `http://illumos.org/msg/PCIEX-8000-DJ`). + pub url: String, + /// Full fault event payload as JSON, if present. Contains the + /// fault-list with classes, certainties, affected FMRIs, and other + /// diagnostic detail. + pub event: Option, +} + +impl IdOrdItem for FmdHostCase { + type Key<'a> = FmdHostCaseUuid; + + fn key(&self) -> Self::Key<'_> { + self.uuid + } + + id_upcast!(); +} + +/// A resource affected by a diagnosed fault. +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema)] +pub struct FmdResource { + /// Fault Management Resource Identifier + /// (e.g. "dev:////pci@af,0/pci1022,1483@3,5"). + pub fmri: String, + /// Unique identifier for this resource entry. 
+ pub uuid: FmdResourceUuid, + /// UUID of the case that diagnosed this fault. + pub case_id: FmdHostCaseUuid, + /// Whether the resource is marked faulty. + pub faulty: bool, + /// Whether the resource is marked unusable. + pub unusable: bool, + /// Whether the resource is marked invisible. + pub invisible: bool, +} + +impl IdOrdItem for FmdResource { + type Key<'a> = FmdResourceUuid; + + fn key(&self) -> Self::Key<'_> { + self.uuid + } + + id_upcast!(); +} + +/// Successfully collected FMD fault data. +#[derive( + Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] +pub struct FmdInventory { + pub cases: IdOrdMap, + pub resources: IdOrdMap, +} + +/// Maximum number of FMD cases sled-agent will report for a single sled. +/// Exceeding this returns [`FmdInventoryErrorKind::TooManyCases`] rather than +/// silently truncating: a count this high indicates a pathological state +/// operators should investigate directly via `fmadm`. +pub const FMD_MAX_CASES: u32 = 1000; + +/// Maximum number of FMD resources sled-agent will report for a single sled. +/// See [`FMD_MAX_CASES`] for rationale. +pub const FMD_MAX_RESOURCES: u32 = 1000; + +/// Classification of an [`FmdInventoryError`]. +/// +/// `FmdError` is a catch-all for any FMD-side failure: the daemon was +/// unreachable, a case/resource listing failed, or the platform doesn't have +/// FMD at all. The accompanying message disambiguates these cases. +/// `TooManyCases` and `TooManyResources` are first-class because exceeding +/// those bounds is operationally distinct from a transient FMD failure. +#[derive( + Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, JsonSchema, +)] +#[serde(rename_all = "snake_case")] +pub enum FmdInventoryErrorKind { + /// Catch-all for FMD-side failures. + FmdError, + /// Number of FMD cases exceeded [`FMD_MAX_CASES`]. + TooManyCases, + /// Number of FMD resources exceeded [`FMD_MAX_RESOURCES`]. 
+ TooManyResources, +} + +/// An error reported by sled-agent in place of an [`FmdInventory`]. +/// +/// `kind` is a typed discriminator suitable for filtering / monitoring. +/// `message` is a human-readable description (built via `Display`); it is +/// informational only and should not be parsed. +#[derive( + Clone, + Debug, + PartialEq, + Eq, + Deserialize, + Serialize, + JsonSchema, + thiserror::Error, +)] +#[error("{message}")] +pub struct FmdInventoryError { + pub kind: FmdInventoryErrorKind, + pub message: String, +} + +/// Identity and basic status information about this sled agent +#[derive(Clone, Debug, Deserialize, JsonSchema, Serialize)] +pub struct Inventory { + pub sled_id: SledUuid, + pub sled_agent_address: SocketAddrV6, + pub sled_role: SledRole, + pub baseboard: Baseboard, + pub usable_hardware_threads: u32, + pub usable_physical_ram: ByteCount, + pub cpu_family: SledCpuFamily, + pub reservoir_size: ByteCount, + pub disks: Vec, + pub zpools: Vec, + pub datasets: Vec, + pub ledgered_sled_config: Option, + pub reconciler_status: ConfigReconcilerInventoryStatus, + pub last_reconciliation: Option, + pub file_source_resolver: OmicronFileSourceResolverInventory, + pub smf_services_enabled_not_online: + v37::inventory::SvcsEnabledNotOnlineResult, + pub reference_measurements: IdOrdMap, + #[serde(with = "snake_case_result")] + #[schemars( + schema_with = "SnakeCaseResult::::json_schema" + )] + pub fmd: Result, +} + +impl From for v37::inventory::Inventory { + fn from(value: Inventory) -> Self { + let Inventory { + sled_id, + sled_agent_address, + sled_role, + baseboard, + usable_hardware_threads, + usable_physical_ram, + cpu_family, + reservoir_size, + disks, + zpools, + datasets, + ledgered_sled_config, + reconciler_status, + last_reconciliation, + file_source_resolver, + smf_services_enabled_not_online, + reference_measurements, + fmd: _, + } = value; + Self { + sled_id, + sled_agent_address, + sled_role, + baseboard, + usable_hardware_threads, + 
usable_physical_ram, + cpu_family, + reservoir_size, + disks, + zpools, + datasets, + ledgered_sled_config, + reconciler_status, + last_reconciliation, + file_source_resolver, + smf_services_enabled_not_online, + reference_measurements, + } + } +} diff --git a/sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs b/sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs new file mode 100644 index 00000000000..ef475fd6e32 --- /dev/null +++ b/sled-agent/types/versions/src/add_fmd_to_inventory/mod.rs @@ -0,0 +1,10 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Version `ADD_FMD_TO_INVENTORY` of the Sled Agent API. +//! +//! This version adds FMD (Fault Management Daemon) data to the sled inventory +//! response, exposing diagnosed faults and affected resources. + +pub mod inventory; diff --git a/sled-agent/types/versions/src/latest.rs b/sled-agent/types/versions/src/latest.rs index 70b6973e130..60591115103 100644 --- a/sled-agent/types/versions/src/latest.rs +++ b/sled-agent/types/versions/src/latest.rs @@ -178,12 +178,20 @@ pub mod inventory { pub use crate::v34::inventory::SvcState; pub use crate::v34::inventory::SvcsError; - pub use crate::v37::inventory::Inventory; pub use crate::v37::inventory::SvcEnabledNotOnline; pub use crate::v37::inventory::SvcEnabledNotOnlineState; pub use crate::v37::inventory::SvcsEnabledNotOnline; pub use crate::v37::inventory::SvcsEnabledNotOnlineResult; + pub use crate::v40::inventory::FMD_MAX_CASES; + pub use crate::v40::inventory::FMD_MAX_RESOURCES; + pub use crate::v40::inventory::FmdHostCase; + pub use crate::v40::inventory::FmdInventory; + pub use crate::v40::inventory::FmdInventoryError; + pub use crate::v40::inventory::FmdInventoryErrorKind; + pub use crate::v40::inventory::FmdResource; + pub use crate::v40::inventory::Inventory; + pub use 
crate::impls::inventory::ManifestBootInventoryDisplay; pub use crate::impls::inventory::ManifestInventoryDisplay; pub use crate::impls::inventory::ManifestNonBootInventoryDisplay; diff --git a/sled-agent/types/versions/src/lib.rs b/sled-agent/types/versions/src/lib.rs index 571237e96a6..27b112414ab 100644 --- a/sled-agent/types/versions/src/lib.rs +++ b/sled-agent/types/versions/src/lib.rs @@ -85,6 +85,8 @@ pub mod v37; pub mod v39; #[path = "add_nexus_lockstep_port_to_inventory/mod.rs"] pub mod v4; +#[path = "add_fmd_to_inventory/mod.rs"] +pub mod v40; #[path = "add_probe_put_endpoint/mod.rs"] pub mod v6; #[path = "multicast_support/mod.rs"] diff --git a/uuid-kinds/src/lib.rs b/uuid-kinds/src/lib.rs index 1e55b4ad0e1..4687e2d5710 100644 --- a/uuid-kinds/src/lib.rs +++ b/uuid-kinds/src/lib.rs @@ -60,6 +60,8 @@ impl_typed_uuid_kinds! { ExternalIp = {}, ExternalSubnet = {}, ExternalZpool = {}, + FmdHostCase = {}, + FmdResource = {}, Instance = {}, InternalZpool = {}, LoopbackAddress = {},