From 90e8a95c87ed6037f9ea32eae42fc548e630c65e Mon Sep 17 00:00:00 2001 From: Adam Spiers Date: Fri, 4 Sep 2015 19:49:20 +0100 Subject: [PATCH 1/3] immediately abort chef-client if maintenance mode is set The reasoning is explained in the comments. I'm not sure why I ever thought it was a good idea to allow a chef-client run to proceed if maintenance mode is set. It's not even as if that approach could ever restore an ill node to full health, because we were deliberately checking whether the maintenance mode was set prior to the chef-client run, and if so, leaving it set. This way, if a node is left in maintenance mode, it will be discovered sooner, resulting in the cloud operator being alerted to a degraded cluster sooner. --- .../default/pacemaker_maintenance_handlers.rb | 40 ++++++++++--------- .../libraries/maintenance_mode_helpers.rb | 10 ----- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb b/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb index 50f4f3d7..7de551e1 100644 --- a/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb +++ b/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb @@ -19,23 +19,25 @@ class Handler < Chef::Handler class StartHandler < Handler def report - # This is informational only, and gives us a fraction more - # information in /var/log/chef/client.log and in the default - # attributes (until next run) for debugging purposes. - # However, it will only take effect after the handler has been - # installed in /etc/chef/client.rb *and* chef-client daemon - # has subsequently been restarted; the - # reload_chef_client_config hack doesn't work with - # start_handlers since it reloads the config too late, after - # the start handlers have already been triggered. - start_mode = record_maintenance_mode_before_this_chef_run - Chef::Log.info("Pacemaker maintenance mode currently %s" % - [start_mode ? "on" : "off"]) - - if maintenance_mode_set_via_this_chef_run? - # Sanity check: this should never happen because we're using - # default attributes which get wiped for each chef-client run. - raise "BUG: Pacemaker maintenance mode was already set at the start of this run! (pid #$$)" + # Check we're not in maintenance mode. This could happen for two + # reasons: + # + # 1. A previous chef-client run failed, so we shouldn't + # risk compounding problems by trying again until the + # root cause is addressed. + # + # 2. Someone/something other than Chef set the node into + # maintenance mode. That should be rare, but when it + # happens, we shouldn't interfere. + # + # So in both cases, we should abort the run immediately with a + # helpful message. + if maintenance_mode? + raise \ + "Pacemaker maintenance mode was already set on " \ + "#{node.hostname}; aborting! Please diagnose why this was the " \ + "case, fix the root cause, and then unset maintenance mode via " \ + "HAWK or by running 'crm node ready' on the node." end end end @@ -51,8 +53,8 @@ def report Chef::Log.info("Taking node out of Pacemaker maintenance mode") system("crm --wait node ready") else - # This shouldn't happen, and suggests that one of the recipes - # is interfering in a way it shouldn't. + # This shouldn't happen, and suggests that something is + # interfering in a way it shouldn't. raise "Something took node out of maintenance mode during run!" end else diff --git a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb index b0ea4f63..d7b3a175 100644 --- a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb +++ b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb @@ -23,16 +23,6 @@ def maintenance_mode? !! (%x(crm_attribute -G -N #{node.hostname} -n maintenance -q) =~ /^on$/) end - def record_maintenance_mode_before_this_chef_run - # Via Chef::Pacemaker::StartHandler we track whether anything - # has put the node into Pacemaker maintenance mode prior to this - # chef-client run. This may come in handy during debugging. - # - # We use a default attribute so that it will get reset at the - # beginning of each chef-client run. - node.default[:pacemaker][:maintenance_mode][$$][:at_start] = maintenance_mode? - end - def set_maintenance_mode_via_this_chef_run # We track whether anything has put the node into Pacemaker # maintenance mode during this chef-client run, so we know From 090f565f2364315e940153ae8d5ebcab4dc29a08 Mon Sep 17 00:00:00 2001 From: Adam Spiers Date: Fri, 4 Sep 2015 20:12:04 +0100 Subject: [PATCH 2/3] fix some rubocop offences --- .../crowbar-pacemaker/libraries/maintenance_mode_helpers.rb | 6 +++--- .../cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb index d7b3a175..30e9f120 100644 --- a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb +++ b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb @@ -20,7 +20,7 @@ module CrowbarPacemaker module MaintenanceModeHelpers def maintenance_mode? # See https://bugzilla.suse.com/show_bug.cgi?id=870696 - !! (%x(crm_attribute -G -N #{node.hostname} -n maintenance -q) =~ /^on$/) + `crm_attribute -G -N #{node.hostname} -n maintenance -q` =~ /^on$/ end def set_maintenance_mode_via_this_chef_run @@ -32,14 +32,14 @@ def set_maintenance_mode_via_this_chef_run # # We use a default attribute so that it will get reset at the # beginning of each chef-client run. - node.default[:pacemaker][:maintenance_mode][$$][:via_chef] = true + node.default[:pacemaker][:maintenance_mode][$PID][:via_chef] = true end def maintenance_mode_set_via_this_chef_run? # The "== true" is required because Chef::Node::Attribute does # auto-vivification on read (!), so the value will be initialized # to an empty Chef::Node::Attribute if not already set to true. - node.default[:pacemaker][:maintenance_mode][$$][:via_chef] == true + node.default[:pacemaker][:maintenance_mode][$PID][:via_chef] == true end end end diff --git a/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb b/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb index 67eb40a7..f1b85dc5 100644 --- a/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb +++ b/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb @@ -57,7 +57,7 @@ if loaded Chef::Log.debug("Pacemaker maintenance handlers already installed") else - Chef::Log.info("Pacemaker maintenance handlers not installed; " + + Chef::Log.info("Pacemaker maintenance handlers not installed; " \ "scheduling Chef config reload") ruby_block "reload_chef_client_config" do block { Chef::Config.from_file("/etc/chef/client.rb") } From 43590c49a642cbbe15efe1a6d38b81d8396acbf5 Mon Sep 17 00:00:00 2001 From: Adam Spiers Date: Wed, 5 Nov 2014 00:42:30 +0100 Subject: [PATCH 3/3] fix #maintenance_mode? return value Make #maintenance_mode? return a sensible value even when Pacemaker is down or uninstalled. This is especially helpful when it is invoked by Chef's start_handler. Cherry-picked from crowbar/barclamp-pacemaker@7dff6c1. --- .../libraries/maintenance_mode_helpers.rb | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb index 30e9f120..8ae34590 100644 --- a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb +++ b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb @@ -18,7 +18,43 @@ module CrowbarPacemaker # A mixin for Chef::Pacemaker::Handler subclasses, and also for the # Chef::Provider::PacemakerService LWRP. module MaintenanceModeHelpers + def cluster_up? + # For once, we want 2>&1 to come before >/dev/null, not after! + # This is because we want to capture STDERR and ditch STDOUT. + cibadmin = `cibadmin -Ql 2>&1 >/dev/null` + case cibadmin + when /Connection refused/, /Transport endpoint is not connected/ + Chef::Log.warn("Cluster is down") + return false + when /command not found/ + Chef::Log.warn("cibadmin not found; was pacemaker deinstalled?") + return false + end + + if !$?.success? + Chef::Log.warn("cibadmin -Ql failed when checking Pacemaker maintenance mode!") + Chef::Log.warn(cibadmin) + return nil # unknown + end + + Chef::Log.debug("Cluster is up") + true + end + def maintenance_mode? + case cluster_up? + when nil # unknown + Chef::Log.warn("Something wrong, so treating as if in maintenance " + + "mode; will need manual intervention.") + return true + when false + # Cluster is not up, so let things proceed so that Chef can + # start it if appropriate. + Chef::Log.info("Cluster is down; not in maintenance mode") + return false + end + + Chef::Log.debug("Checking maintenance mode status") # See https://bugzilla.suse.com/show_bug.cgi?id=870696 `crm_attribute -G -N #{node.hostname} -n maintenance -q` =~ /^on$/ end