From c2fdb99c5aff4400f6a59326b477590b78cb0b4d Mon Sep 17 00:00:00 2001
From: Jon Cope <jcope@redhat.com>
Date: Tue, 7 Apr 2026 11:04:59 -0500
Subject: [PATCH 1/3] fix(kube-apiserver): Add fail-fast RBAC bootstrap hook
 deadlock detection

On MicroShift restart, the RBAC bootstrap hook can deadlock when etcd
contains existing data. The hook uses context.TODO() for API calls,
which has no timeout. When the loopback client hangs, this creates a
circular dependency where the hook waits for the API server while the
API server waits for the hook to complete.

This change adds a parallel deadlock detector that:
- Monitors /readyz/poststarthook/rbac/bootstrap-roles specifically
- Checks if etcd is healthy while the hook is stuck
- Detects deadlock in ~15 seconds instead of waiting 60 seconds
- Restarts microshift-etcd.scope to recover from the deadlock

This breaks the crash loop by detecting the condition early and taking
recovery action at the MicroShift level, without requiring changes to
vendored upstream Kubernetes code.

Related upstream issues: kubernetes/kubernetes#86715, #97119

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pkg/controllers/kube-apiserver.go | 135 +++++++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go
index 70cb89dcdc..d11b2afcaa 100644
--- a/pkg/controllers/kube-apiserver.go
+++ b/pkg/controllers/kube-apiserver.go
@@ -23,6 +23,7 @@ import (
 	"io"
 	"net"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -55,6 +56,12 @@ import (
 
 const (
 	kubeAPIStartupTimeout = 60
+	// rbacHookDeadlockTimeout is the time to wait for the RBAC bootstrap hook
+	// before declaring a deadlock. This is shorter than kubeAPIStartupTimeout
+	// to allow for faster recovery.
+	rbacHookDeadlockTimeout = 15
+	// rbacHookCheckInterval is how often to check the RBAC hook status
+	rbacHookCheckInterval = 2
 )
 
 var (
@@ -348,7 +355,13 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
 		return err
 	}
 
-	// run readiness check
+	// Channel to signal RBAC hook deadlock detection
+	rbacDeadlockDetected := make(chan struct{})
+
+	// Run RBAC hook deadlock detector
+	go s.detectRBACHookDeadlock(ctx, restClient, rbacDeadlockDetected)
+
+	// Run standard readiness check
 	go func() {
 		err := wait.PollUntilContextTimeout(ctx, time.Second, kubeAPIStartupTimeout*time.Second, true, func(ctx context.Context) (bool, error) {
 			var status int
@@ -420,7 +433,127 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
 		return err
 	case perr := <-panicChannel:
 		panic(perr)
+	case <-rbacDeadlockDetected:
+		klog.Error("RBAC bootstrap hook deadlock detected - restarting microshift-etcd.scope to recover")
+		if err := restartMicroshiftEtcdScope(); err != nil {
+			klog.Errorf("Failed to restart microshift-etcd.scope: %v", err)
+		}
+		return fmt.Errorf("RBAC bootstrap hook deadlock detected after %d seconds", rbacHookDeadlockTimeout)
+	}
+}
+
+// detectRBACHookDeadlock monitors the RBAC bootstrap hook status and detects deadlock conditions.
+// A deadlock is detected when:
+// 1. The RBAC hook is not completing (stuck in "not finished" state)
+// 2. etcd is healthy and responsive
+// This indicates the circular dependency where the hook waits for API server
+// while API server waits for the hook.
+func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient rest.Interface, deadlockDetected chan<- struct{}) {
+	// Wait a few seconds before starting detection to allow normal startup
+	select {
+	case <-ctx.Done():
+		return
+	case <-time.After(5 * time.Second):
 	}
+
+	checkCount := 0
+	maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay
+
+	for checkCount < maxChecks {
+		select {
+		case <-ctx.Done():
+			return
+		case <-time.After(rbacHookCheckInterval * time.Second):
+		}
+
+		checkCount++
+
+		// Check RBAC hook status
+		var status int
+		err := restClient.Get().AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").Do(ctx).StatusCode(&status).Error()
+
+		// If hook is ready, no deadlock
+		if err == nil && status == 200 {
+			klog.V(4).Info("RBAC bootstrap hook completed successfully")
+			return
+		}
+
+		// Hook not ready - check if etcd is healthy
+		etcdHealthy, etcdErr := isEtcdHealthy(ctx)
+		if etcdErr != nil {
+			klog.V(4).Infof("Could not check etcd health: %v", etcdErr)
+			continue
+		}
+
+		if etcdHealthy {
+			klog.Warningf("RBAC bootstrap hook not ready (check %d/%d), but etcd is healthy - potential deadlock",
+				checkCount, maxChecks)
+		} else {
+			// etcd not healthy - not a deadlock, just waiting for etcd
+			klog.V(4).Infof("RBAC hook waiting, etcd not yet healthy (check %d/%d)", checkCount, maxChecks)
+			// Reset counter since this isn't a deadlock condition
+			checkCount = 0
+		}
+	}
+
+	// Reached max checks with etcd healthy but hook not completing - deadlock detected
+	klog.Error("RBAC bootstrap hook deadlock confirmed: etcd healthy but hook not completing")
+	close(deadlockDetected)
+}
+
+// isEtcdHealthy checks if etcd is responsive by attempting to connect and get status.
+func isEtcdHealthy(ctx context.Context) (bool, error) {
+	certsDir := cryptomaterial.CertsDirectory(config.DataDir)
+	etcdAPIServerClientCertDir := cryptomaterial.EtcdAPIServerClientCertDir(certsDir)
+
+	tlsInfo := transport.TLSInfo{
+		CertFile:      cryptomaterial.ClientCertPath(etcdAPIServerClientCertDir),
+		KeyFile:       cryptomaterial.ClientKeyPath(etcdAPIServerClientCertDir),
+		TrustedCAFile: cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir)),
+	}
+	tlsConfig, err := tlsInfo.ClientConfig()
+	if err != nil {
+		return false, fmt.Errorf("failed to create TLS config: %w", err)
+	}
+
+	// Use a short timeout for health check
+	checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
+	defer cancel()
+
+	client, err := clientv3.New(clientv3.Config{
+		Endpoints:   []string{"https://localhost:2379"},
+		DialTimeout: 1 * time.Second,
+		TLS:         tlsConfig,
+		Context:     checkCtx,
+	})
+	if err != nil {
+		return false, fmt.Errorf("failed to create etcd client: %w", err)
+	}
+	defer func() { _ = client.Close() }()
+
+	_, err = client.Status(checkCtx, "localhost:2379")
+	if err != nil {
+		return false, nil // etcd not healthy, but not an error condition
+	}
+
+	return true, nil
+}
+
+// restartMicroshiftEtcdScope restarts the microshift-etcd.scope to recover from deadlock.
+// This forces a clean restart of etcd which can help break the circular dependency.
+func restartMicroshiftEtcdScope() error {
+	klog.Info("Stopping microshift-etcd.scope for recovery")
+
+	stopCmd := exec.Command("systemctl", "stop", "microshift-etcd.scope")
+	if out, err := stopCmd.CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to stop microshift-etcd.scope: %w, output: %s", err, string(out))
+	}
+
+	// Wait briefly for cleanup
+	time.Sleep(1 * time.Second)
+
+	klog.Info("microshift-etcd.scope stopped - MicroShift will restart")
+	return nil
 }
 
 func discoverEtcdServers(ctx context.Context, kubeconfigPath string) ([]string, error) {

From 6a02e6227fe7354c508e5d9e75caad970dcdfe68 Mon Sep 17 00:00:00 2001
From: Jon Cope <jcope@redhat.com>
Date: Tue, 7 Apr 2026 15:35:24 -0500
Subject: [PATCH 2/3] fix(kube-apiserver): Add wall-clock deadline to prevent
 flapping

Add rbacHookMaxWaitDuration (30s) as an absolute deadline that cannot
be reset by etcd health state changes. This prevents a flapping etcd
from extending the deadlock detection indefinitely.

The existing checkCount logic is preserved for detecting deadlock when
etcd is consistently healthy, but the wall-clock deadline provides a
hard upper bound regardless of etcd state transitions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pkg/controllers/kube-apiserver.go | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go
index d11b2afcaa..c0bcd4f47f 100644
--- a/pkg/controllers/kube-apiserver.go
+++ b/pkg/controllers/kube-apiserver.go
@@ -62,6 +62,10 @@ const (
 	rbacHookDeadlockTimeout = 15
 	// rbacHookCheckInterval is how often to check the RBAC hook status
 	rbacHookCheckInterval = 2
+	// rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook
+	// regardless of etcd health state changes. This prevents flapping from extending
+	// detection indefinitely.
+	rbacHookMaxWaitDuration = 30 * time.Second
 )
 
 var (
@@ -456,10 +460,18 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
 	case <-time.After(5 * time.Second):
 	}
 
+	// Track wall-clock deadline to prevent flapping from extending detection indefinitely
+	startTime := time.Now()
 	checkCount := 0
 	maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay
 
 	for checkCount < maxChecks {
+		// Check absolute deadline first - this cannot be reset by etcd state changes
+		if time.Since(startTime) >= rbacHookMaxWaitDuration {
+			klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration)
+			break
+		}
+
 		select {
 		case <-ctx.Done():
 			return
@@ -486,18 +498,20 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
 		}
 
 		if etcdHealthy {
-			klog.Warningf("RBAC bootstrap hook not ready (check %d/%d), but etcd is healthy - potential deadlock",
-				checkCount, maxChecks)
+			klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock",
+				checkCount, maxChecks, time.Since(startTime).Round(time.Second))
 		} else {
 			// etcd not healthy - not a deadlock, just waiting for etcd
 			klog.V(4).Infof("RBAC hook waiting, etcd not yet healthy (check %d/%d)", checkCount, maxChecks)
 			// Reset counter since this isn't a deadlock condition
+			// Note: wall-clock deadline (startTime) is NOT reset - flapping cannot extend indefinitely
 			checkCount = 0
 		}
 	}
 
 	// Reached max checks with etcd healthy but hook not completing - deadlock detected
-	klog.Error("RBAC bootstrap hook deadlock confirmed: etcd healthy but hook not completing")
+	klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing",
+		time.Since(startTime).Round(time.Second))
 	close(deadlockDetected)
 }
 

From 184e1cbc2f366a12d80f19f31555bb5e69e68647 Mon Sep 17 00:00:00 2001
From: Jon Cope <jcope@redhat.com>
Date: Thu, 16 Apr 2026 15:47:57 -0500
Subject: [PATCH 3/3] fix(kube-apiserver): Harden RBAC deadlock detection
 against false positives

- Only increment checkCount when deadlock predicate confirmed
  (RBAC not ready AND etcd healthy)
- Skip counting when RBAC probe or etcd health check errors
- On wall-clock timeout, only trigger recovery if checkCount >= maxChecks
- Add 1s timeout to RBAC probe to prevent hanging on unresponsive API
- Add 5s timeout to systemctl stop to prevent recovery path from stalling
- Extract rbacHookPollDelayStart constant for clarity

Prevents false positive deadlock detection when etcd flaps or probes
error, ensuring close(deadlockDetected) only fires after confirming
the deadlock condition the required number of times.

Related: kubernetes/kubernetes#86715, #97119

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 pkg/controllers/kube-apiserver.go | 58 ++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go
index c0bcd4f47f..54d6e8af9f 100644
--- a/pkg/controllers/kube-apiserver.go
+++ b/pkg/controllers/kube-apiserver.go
@@ -61,7 +61,8 @@ const (
 	// to allow for faster recovery.
 	rbacHookDeadlockTimeout = 15
 	// rbacHookCheckInterval is how often to check the RBAC hook status
-	rbacHookCheckInterval = 2
+	rbacHookCheckInterval  = 2
+	rbacHookPollDelayStart = 5 * time.Second // initial delay before the first RBAC hook probe
 	// rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook
 	// regardless of etcd health state changes. This prevents flapping from extending
 	// detection indefinitely.
@@ -452,24 +453,38 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
 // 2. etcd is healthy and responsive
 // This indicates the circular dependency where the hook waits for API server
 // while API server waits for the hook.
+//
+// Closed upstream Kubernetes issues:
+// https://github.com/kubernetes/kubernetes/issues/86715
+// https://github.com/kubernetes/kubernetes/issues/97119
 func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient rest.Interface, deadlockDetected chan<- struct{}) {
 	// Wait a few seconds before starting detection to allow normal startup
 	select {
 	case <-ctx.Done():
 		return
-	case <-time.After(5 * time.Second):
+	case <-time.After(rbacHookPollDelayStart):
 	}
 
+	checkCount := 0
+	maxChecks := int((rbacHookDeadlockTimeout*time.Second - rbacHookPollDelayStart) / (rbacHookCheckInterval * time.Second)) // Account for initial delay; second counts are scaled to Durations first
 	// Track wall-clock deadline to prevent flapping from extending detection indefinitely
 	startTime := time.Now()
-	checkCount := 0
-	maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay
 
-	for checkCount < maxChecks {
+	for {
 		// Check absolute deadline first - this cannot be reset by etcd state changes
 		if time.Since(startTime) >= rbacHookMaxWaitDuration {
 			klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration)
-			break
+			// Only trigger deadlock recovery if we've confirmed the predicate enough times
+			if checkCount >= maxChecks {
+				break // Fall through to close(deadlockDetected)
+			}
+			// Timeout but not confirmed deadlock - exit without triggering recovery
+			return
+		}
+
+		// Check if we've confirmed deadlock enough times
+		if checkCount >= maxChecks {
+			break // Fall through to close(deadlockDetected)
 		}
 
 		select {
@@ -478,11 +493,15 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
 		case <-time.After(rbacHookCheckInterval * time.Second):
 		}
 
-		checkCount++
-
 		// Check RBAC hook status
+		probeCtx, cancel := context.WithTimeout(ctx, time.Second)
 		var status int
-		err := restClient.Get().AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").Do(ctx).StatusCode(&status).Error()
+		err := restClient.Get().
+			AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").
+			Do(probeCtx).
+			StatusCode(&status).
+			Error()
+		cancel()
 
 		// If hook is ready, no deadlock
 		if err == nil && status == 200 {
@@ -490,14 +509,23 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
 			return
 		}
 
-		// Hook not ready - check if etcd is healthy
+		// If RBAC probe errored, skip this iteration (don't count toward deadlock)
+		if err != nil {
+			klog.V(4).Infof("RBAC probe error (not counting toward deadlock): %v", err)
+			continue
+		}
+
+		// Hook not ready (status != 200) - check if etcd is healthy
 		etcdHealthy, etcdErr := isEtcdHealthy(ctx)
 		if etcdErr != nil {
-			klog.V(4).Infof("Could not check etcd health: %v", etcdErr)
+			klog.V(4).Infof("Could not check etcd health (not counting toward deadlock): %v", etcdErr)
 			continue
 		}
 
 		if etcdHealthy {
+			// Only increment when BOTH conditions are met:
+			// RBAC probe returned not-ready AND etcd is healthy
+			checkCount++
 			klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock",
 				checkCount, maxChecks, time.Since(startTime).Round(time.Second))
 		} else {
@@ -509,7 +537,7 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
 		}
 	}
 
-	// Reached max checks with etcd healthy but hook not completing - deadlock detected
+	// Only reached when checkCount >= maxChecks (deadlock confirmed)
 	klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing",
 		time.Since(startTime).Round(time.Second))
 	close(deadlockDetected)
@@ -558,7 +586,11 @@ func isEtcdHealthy(ctx context.Context) (bool, error) {
 func restartMicroshiftEtcdScope() error {
 	klog.Info("Stopping microshift-etcd.scope for recovery")
 
-	stopCmd := exec.Command("systemctl", "stop", "microshift-etcd.scope")
+	// Bound the stop command with a timeout so a stalled systemd or D-Bus cannot hang the fail-fast recovery path and keep Run from returning
+	cmdCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	stopCmd := exec.CommandContext(cmdCtx, "systemctl", "stop", "microshift-etcd.scope")
 	if out, err := stopCmd.CombinedOutput(); err != nil {
 		return fmt.Errorf("failed to stop microshift-etcd.scope: %w, output: %s", err, string(out))
 	}