diff --git a/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb b/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb
index 50f4f3d7..7de551e1 100644
--- a/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb
+++ b/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb
@@ -19,23 +19,25 @@ class Handler < Chef::Handler
 
     class StartHandler < Handler
       def report
-        # This is informational only, and gives us a fraction more
-        # information in /var/log/chef/client.log and in the default
-        # attributes (until next run) for debugging purposes.
-        # However, it will only take effect after the handler has been
-        # installed in /etc/chef/client.rb *and* chef-client daemon
-        # has subsequently been restarted; the
-        # reload_chef_client_config hack doesn't work with
-        # start_handlers since it reloads the config too late, after
-        # the start handlers have already been triggered.
-        start_mode = record_maintenance_mode_before_this_chef_run
-        Chef::Log.info("Pacemaker maintenance mode currently %s" %
-                       [start_mode ? "on" : "off"])
-
-        if maintenance_mode_set_via_this_chef_run?
-          # Sanity check: this should never happen because we're using
-          # default attributes which get wiped for each chef-client run.
-          raise "BUG: Pacemaker maintenance mode was already set at the start of this run! (pid #$$)"
+        # Check we're not in maintenance mode. This could happen for two
+        # reasons:
+        #
+        # 1. A previous chef-client run failed, so we shouldn't
+        #    risk compounding problems by trying again until the
+        #    root cause is addressed.
+        #
+        # 2. Someone/something other than Chef set the node into
+        #    maintenance mode. That should be rare, but when it
+        #    happens, we shouldn't interfere.
+        #
+        # So in both cases, we should abort the run immediately with a
+        # helpful message.
+        if maintenance_mode?
+          raise \
+            "Pacemaker maintenance mode was already set on " \
+            "#{node.hostname}; aborting! Please diagnose why this was the " \
+            "case, fix the root cause, and then unset maintenance mode via " \
+            "HAWK or by running 'crm node ready' on the node."
         end
       end
     end
@@ -51,8 +53,8 @@ def report
             Chef::Log.info("Taking node out of Pacemaker maintenance mode")
             system("crm --wait node ready")
           else
-            # This shouldn't happen, and suggests that one of the recipes
-            # is interfering in a way it shouldn't.
+            # This shouldn't happen, and suggests that something is
+            # interfering in a way it shouldn't.
             raise "Something took node out of maintenance mode during run!"
           end
         else
diff --git a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb
index b0ea4f63..8ae34590 100644
--- a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb
+++ b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb
@@ -18,19 +18,52 @@ module CrowbarPacemaker
   # A mixin for Chef::Pacemaker::Handler subclasses, and also for the
   # Chef::Provider::PacemakerService LWRP.
   module MaintenanceModeHelpers
-    def maintenance_mode?
-      # See https://bugzilla.suse.com/show_bug.cgi?id=870696
-      !! (%x(crm_attribute -G -N #{node.hostname} -n maintenance -q) =~ /^on$/)
+    # Runs "cibadmin -Ql" and inspects its STDERR and exit status.
+    # Returns true if the cluster is up, false if it is down (or
+    # cibadmin is not installed), and nil if the command failed for
+    # any other reason, i.e. the cluster state is unknown.
+    def cluster_up?
+      # For once, we want 2>&1 to come before >/dev/null, not after!
+      # This is because we want to capture STDERR and ditch STDOUT.
+      cibadmin = `cibadmin -Ql 2>&1 >/dev/null`
+      case cibadmin
+      when /Connection refused/, /Transport endpoint is not connected/
+        Chef::Log.warn("Cluster is down")
+        return false
+      when /command not found/
+        Chef::Log.warn("cibadmin not found; was pacemaker deinstalled?")
+        return false
+      end
+
+      if !$?.success?
+        Chef::Log.warn("cibadmin -Ql failed when checking Pacemaker maintenance mode!")
+        Chef::Log.warn(cibadmin)
+        return nil # unknown
+      end
+
+      Chef::Log.debug("Cluster is up")
+      true
     end
 
-    def record_maintenance_mode_before_this_chef_run
-      # Via Chef::Pacemaker::StartHandler we track whether anything
-      # has put the node into Pacemaker maintenance mode prior to this
-      # chef-client run. This may come in handy during debugging.
-      #
-      # We use a default attribute so that it will get reset at the
-      # beginning of each chef-client run.
-      node.default[:pacemaker][:maintenance_mode][$$][:at_start] = maintenance_mode?
+    # Returns a true value if this node should be treated as being in
+    # Pacemaker maintenance mode: either crm_attribute reports "on",
+    # or the cluster state could not be determined.
+    def maintenance_mode?
+      case cluster_up?
+      when nil # unknown
+        Chef::Log.warn("Something wrong, so treating as if in maintenance " \
+                       "mode; will need manual intervention.")
+        return true
+      when false
+        # Cluster is not up, so let things proceed so that Chef can
+        # start it if appropriate.
+        Chef::Log.info("Cluster is down; not in maintenance mode")
+        return false
+      end
+
+      Chef::Log.debug("Checking maintenance mode status")
+      # See https://bugzilla.suse.com/show_bug.cgi?id=870696
+      `crm_attribute -G -N #{node.hostname} -n maintenance -q` =~ /^on$/
     end
 
     def set_maintenance_mode_via_this_chef_run
@@ -42,14 +75,14 @@ def set_maintenance_mode_via_this_chef_run
       #
       # We use a default attribute so that it will get reset at the
       # beginning of each chef-client run.
-      node.default[:pacemaker][:maintenance_mode][$$][:via_chef] = true
+      node.default[:pacemaker][:maintenance_mode][Process.pid][:via_chef] = true
     end
 
     def maintenance_mode_set_via_this_chef_run?
       # The "== true" is required because Chef::Node::Attribute does
       # auto-vivification on read (!), so the value will be initialized
       # to an empty Chef::Node::Attribute if not already set to true.
-      node.default[:pacemaker][:maintenance_mode][$$][:via_chef] == true
+      node.default[:pacemaker][:maintenance_mode][Process.pid][:via_chef] == true
     end
   end
 end
diff --git a/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb b/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb
index 67eb40a7..f1b85dc5 100644
--- a/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb
+++ b/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb
@@ -57,7 +57,7 @@
 if loaded
   Chef::Log.debug("Pacemaker maintenance handlers already installed")
 else
-  Chef::Log.info("Pacemaker maintenance handlers not installed; " +
+  Chef::Log.info("Pacemaker maintenance handlers not installed; " \
                  "scheduling Chef config reload")
   ruby_block "reload_chef_client_config" do
     block { Chef::Config.from_file("/etc/chef/client.rb") }