crowbar · aspiers · Sep 4, 2015 · Sep 4, 2015 · Nov 4, 2014 · houndci-bot
diff --git a/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb b/chef/cookbooks/crowbar-pacemaker/files/default/pacemaker_maintenance_handlers.rb
@@ -19,23 +19,25 @@ class Handler < Chef::Handler
 
     class StartHandler < Handler
       def report
-        # This is informational only, and gives us a fraction more
-        # information in /var/log/chef/client.log and in the default
-        # attributes (until next run) for debugging purposes.
-        # However, it will only take effect after the handler has been
-        # installed in /etc/chef/client.rb *and* chef-client daemon
-        # has subsequently been restarted; the
-        # reload_chef_client_config hack doesn't work with
-        # start_handlers since it reloads the config too late, after
-        # the start handlers have already been triggered.
-        start_mode = record_maintenance_mode_before_this_chef_run
-        Chef::Log.info("Pacemaker maintenance mode currently %s" %
-                       [start_mode ? "on" : "off"])
-
-        if maintenance_mode_set_via_this_chef_run?
-          # Sanity check: this should never happen because we're using
-          # default attributes which get wiped for each chef-client run.
-          raise "BUG: Pacemaker maintenance mode was already set at the start of this run! (pid #$$)"
+        # Check we're not in maintenance mode.  This could happen for two
+        # reasons:
+        #
+        #   1. A previous chef-client run failed, so we shouldn't
+        #      risk compounding problems by trying again until the
+        #      root cause is addressed.
+        #
+        #   2. Someone/something other than Chef set the node into
+        #      maintenance mode.  That should be rare, but when it
+        #      happens, we shouldn't interfere.
+        #
+        # So in both cases, we should abort the run immediately with a
+        # helpful message.
+        if maintenance_mode?
+          raise \
+            "Pacemaker maintenance mode was already set on " \
+            "#{node.hostname}; aborting! Please diagnose why this was the " \
+            "case, fix the root cause, and then unset maintenance mode via " \
+            "HAWK or by running 'crm node ready' on the node."
         end
       end
     end
@@ -51,8 +53,8 @@ def report
             Chef::Log.info("Taking node out of Pacemaker maintenance mode")
             system("crm --wait node ready")
           else
-            # This shouldn't happen, and suggests that one of the recipes
-            # is interfering in a way it shouldn't.
+            # This shouldn't happen, and suggests that something is
+            # interfering in a way it shouldn't.
             raise "Something took node out of maintenance mode during run!"
           end
         else

diff --git a/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb b/chef/cookbooks/crowbar-pacemaker/libraries/maintenance_mode_helpers.rb
@@ -18,19 +18,45 @@ module CrowbarPacemaker
   # A mixin for Chef::Pacemaker::Handler subclasses, and also for the
   # Chef::Provider::PacemakerService LWRP.
   module MaintenanceModeHelpers
-    def maintenance_mode?
-      # See https://bugzilla.suse.com/show_bug.cgi?id=870696
-      !! (%x(crm_attribute -G -N #{node.hostname} -n maintenance -q) =~ /^on$/)
+    def cluster_up?
+      # For once, we want 2>&1 to come before >/dev/null, not after!
+      # This is because we want to capture STDERR and ditch STDOUT.
+      cibadmin = `cibadmin -Ql 2>&1 >/dev/null`
+      case cibadmin
+      when /Connection refused/, /Transport endpoint is not connected/
+        Chef::Log.warn("Cluster is down")
+        return false
+      when /command not found/
+        Chef::Log.warn("cibadmin not found; was pacemaker deinstalled?")
+        return false
+      end
+
+      if !$?.success?
+        Chef::Log.warn("cibadmin -Ql failed when checking Pacemaker maintenance mode!")
+        Chef::Log.warn(cibadmin)
+        return nil # unknown
+      end
+
+      Chef::Log.debug("Cluster is up")
+      true
     end
 
-    def record_maintenance_mode_before_this_chef_run
-      # Via Chef::Pacemaker::StartHandler we track whether anything
-      # has put the node into Pacemaker maintenance mode prior to this
-      # chef-client run.  This may come in handy during debugging.
-      #
-      # We use a default attribute so that it will get reset at the
-      # beginning of each chef-client run.
-      node.default[:pacemaker][:maintenance_mode][$$][:at_start] = maintenance_mode?
+    def maintenance_mode?
+      case cluster_up?
+      when nil # unknown
+        Chef::Log.warn("Something wrong, so treating as if in maintenance " +
+          "mode; will need manual intervention.")
+        return true
+      when false
+        # Cluster is not up, so let things proceed so that Chef can
+        # start it if appropriate.
+        Chef::Log.info("Cluster is down; not in maintenance mode")
+        return false
+      end
+
+      Chef::Log.debug("Checking maintenance mode status")
+      # See https://bugzilla.suse.com/show_bug.cgi?id=870696
+      `crm_attribute -G -N #{node.hostname} -n maintenance -q` =~ /^on$/
     end
 
     def set_maintenance_mode_via_this_chef_run
@@ -42,14 +68,14 @@ def set_maintenance_mode_via_this_chef_run
       #
       # We use a default attribute so that it will get reset at the
       # beginning of each chef-client run.
-      node.default[:pacemaker][:maintenance_mode][$$][:via_chef] = true
+      node.default[:pacemaker][:maintenance_mode][$PID][:via_chef] = true
     end
 
     def maintenance_mode_set_via_this_chef_run?
       # The "== true" is required because Chef::Node::Attribute does
       # auto-vivification on read (!), so the value will be initialized
       # to an empty Chef::Node::Attribute if not already set to true.
-      node.default[:pacemaker][:maintenance_mode][$$][:via_chef] == true
+      node.default[:pacemaker][:maintenance_mode][$PID][:via_chef] == true
     end
   end
 end
diff --git a/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb b/chef/cookbooks/crowbar-pacemaker/recipes/maintenance-mode.rb
@@ -57,7 +57,7 @@
 if loaded
   Chef::Log.debug("Pacemaker maintenance handlers already installed")
 else
-  Chef::Log.info("Pacemaker maintenance handlers not installed; " +
+  Chef::Log.info("Pacemaker maintenance handlers not installed; " \
                  "scheduling Chef config reload")
   ruby_block "reload_chef_client_config" do
     block { Chef::Config.from_file("/etc/chef/client.rb") }