From c2fdb99c5aff4400f6a59326b477590b78cb0b4d Mon Sep 17 00:00:00 2001 From: Jon Cope Date: Tue, 7 Apr 2026 11:04:59 -0500 Subject: [PATCH 1/3] fix(kube-apiserver): Add fail-fast RBAC bootstrap hook deadlock detection On MicroShift restart, the RBAC bootstrap hook can deadlock when etcd contains existing data. The hook uses context.TODO() for API calls, which has no timeout. When the loopback client hangs, this creates a circular dependency where the hook waits for the API server while the API server waits for the hook to complete. This change adds a parallel deadlock detector that: - Monitors /readyz/poststarthook/rbac/bootstrap-roles specifically - Checks if etcd is healthy while the hook is stuck - Detects deadlock in ~15 seconds instead of waiting 60 seconds - Restarts microshift-etcd.scope to recover from the deadlock This breaks the crash loop by detecting the condition early and taking recovery action at the MicroShift level, without requiring changes to vendored upstream Kubernetes code. Related upstream issues: kubernetes/kubernetes#86715, #97119 Co-Authored-By: Claude Opus 4.5 --- pkg/controllers/kube-apiserver.go | 135 +++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index 70cb89dcdc..d11b2afcaa 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -23,6 +23,7 @@ import ( "io" "net" "os" + "os/exec" "path/filepath" "strconv" "strings" @@ -55,6 +56,12 @@ import ( const ( kubeAPIStartupTimeout = 60 + // rbacHookDeadlockTimeout is the time to wait for the RBAC bootstrap hook + // before declaring a deadlock. This is shorter than kubeAPIStartupTimeout + // to allow for faster recovery. 
+ rbacHookDeadlockTimeout = 15 + // rbacHookCheckInterval is how often to check the RBAC hook status + rbacHookCheckInterval = 2 ) var ( @@ -348,7 +355,13 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped return err } - // run readiness check + // Channel to signal RBAC hook deadlock detection + rbacDeadlockDetected := make(chan struct{}) + + // Run RBAC hook deadlock detector + go s.detectRBACHookDeadlock(ctx, restClient, rbacDeadlockDetected) + + // Run standard readiness check go func() { err := wait.PollUntilContextTimeout(ctx, time.Second, kubeAPIStartupTimeout*time.Second, true, func(ctx context.Context) (bool, error) { var status int @@ -420,7 +433,127 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped return err case perr := <-panicChannel: panic(perr) + case <-rbacDeadlockDetected: + klog.Error("RBAC bootstrap hook deadlock detected - restarting microshift-etcd.scope to recover") + if err := restartMicroshiftEtcdScope(); err != nil { + klog.Errorf("Failed to restart microshift-etcd.scope: %v", err) + } + return fmt.Errorf("RBAC bootstrap hook deadlock detected after %d seconds", rbacHookDeadlockTimeout) + } +} + +// detectRBACHookDeadlock monitors the RBAC bootstrap hook status and detects deadlock conditions. +// A deadlock is detected when: +// 1. The RBAC hook is not completing (stuck in "not finished" state) +// 2. etcd is healthy and responsive +// This indicates the circular dependency where the hook waits for API server +// while API server waits for the hook. 
+func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient rest.Interface, deadlockDetected chan<- struct{}) { + // Wait a few seconds before starting detection to allow normal startup + select { + case <-ctx.Done(): + return + case <-time.After(5 * time.Second): } + + checkCount := 0 + maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay + + for checkCount < maxChecks { + select { + case <-ctx.Done(): + return + case <-time.After(rbacHookCheckInterval * time.Second): + } + + checkCount++ + + // Check RBAC hook status + var status int + err := restClient.Get().AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").Do(ctx).StatusCode(&status).Error() + + // If hook is ready, no deadlock + if err == nil && status == 200 { + klog.V(4).Info("RBAC bootstrap hook completed successfully") + return + } + + // Hook not ready - check if etcd is healthy + etcdHealthy, etcdErr := isEtcdHealthy(ctx) + if etcdErr != nil { + klog.V(4).Infof("Could not check etcd health: %v", etcdErr) + continue + } + + if etcdHealthy { + klog.Warningf("RBAC bootstrap hook not ready (check %d/%d), but etcd is healthy - potential deadlock", + checkCount, maxChecks) + } else { + // etcd not healthy - not a deadlock, just waiting for etcd + klog.V(4).Infof("RBAC hook waiting, etcd not yet healthy (check %d/%d)", checkCount, maxChecks) + // Reset counter since this isn't a deadlock condition + checkCount = 0 + } + } + + // Reached max checks with etcd healthy but hook not completing - deadlock detected + klog.Error("RBAC bootstrap hook deadlock confirmed: etcd healthy but hook not completing") + close(deadlockDetected) +} + +// isEtcdHealthy checks if etcd is responsive by attempting to connect and get status. 
+func isEtcdHealthy(ctx context.Context) (bool, error) { + certsDir := cryptomaterial.CertsDirectory(config.DataDir) + etcdAPIServerClientCertDir := cryptomaterial.EtcdAPIServerClientCertDir(certsDir) + + tlsInfo := transport.TLSInfo{ + CertFile: cryptomaterial.ClientCertPath(etcdAPIServerClientCertDir), + KeyFile: cryptomaterial.ClientKeyPath(etcdAPIServerClientCertDir), + TrustedCAFile: cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir)), + } + tlsConfig, err := tlsInfo.ClientConfig() + if err != nil { + return false, fmt.Errorf("failed to create TLS config: %w", err) + } + + // Use a short timeout for health check + checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + + client, err := clientv3.New(clientv3.Config{ + Endpoints: []string{"https://localhost:2379"}, + DialTimeout: 1 * time.Second, + TLS: tlsConfig, + Context: checkCtx, + }) + if err != nil { + return false, fmt.Errorf("failed to create etcd client: %w", err) + } + defer func() { _ = client.Close() }() + + _, err = client.Status(checkCtx, "localhost:2379") + if err != nil { + return false, nil // etcd not healthy, but not an error condition + } + + return true, nil +} + +// restartMicroshiftEtcdScope stops the microshift-etcd.scope to recover from deadlock. +// Despite its name it only issues "systemctl stop"; presumably etcd is started again when MicroShift itself restarts. 
+func restartMicroshiftEtcdScope() error { + klog.Info("Stopping microshift-etcd.scope for recovery") + + stopCmd := exec.Command("systemctl", "stop", "microshift-etcd.scope") + if out, err := stopCmd.CombinedOutput(); err != nil { + return fmt.Errorf("failed to stop microshift-etcd.scope: %w, output: %s", err, string(out)) + } + + // Wait briefly for cleanup + time.Sleep(1 * time.Second) + + klog.Info("microshift-etcd.scope stopped - MicroShift will restart") + return nil } func discoverEtcdServers(ctx context.Context, kubeconfigPath string) ([]string, error) { From 6a02e6227fe7354c508e5d9e75caad970dcdfe68 Mon Sep 17 00:00:00 2001 From: Jon Cope Date: Tue, 7 Apr 2026 15:35:24 -0500 Subject: [PATCH 2/3] fix(kube-apiserver): Add wall-clock deadline to prevent flapping Add rbacHookMaxWaitDuration (30s) as an absolute deadline that cannot be reset by etcd health state changes. This prevents a flapping etcd from extending the deadlock detection indefinitely. The existing checkCount logic is preserved for detecting deadlock when etcd is consistently healthy, but the wall-clock deadline provides a hard upper bound regardless of etcd state transitions. Co-Authored-By: Claude Opus 4.5 --- pkg/controllers/kube-apiserver.go | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index d11b2afcaa..c0bcd4f47f 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -62,6 +62,10 @@ const ( rbacHookDeadlockTimeout = 15 // rbacHookCheckInterval is how often to check the RBAC hook status rbacHookCheckInterval = 2 + // rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook + // regardless of etcd health state changes. This prevents flapping from extending + // detection indefinitely. 
+ rbacHookMaxWaitDuration = 30 * time.Second ) var ( @@ -456,10 +460,18 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r case <-time.After(5 * time.Second): } + // Track wall-clock deadline to prevent flapping from extending detection indefinitely + startTime := time.Now() checkCount := 0 maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay for checkCount < maxChecks { + // Check absolute deadline first - this cannot be reset by etcd state changes + if time.Since(startTime) >= rbacHookMaxWaitDuration { + klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration) + break + } + select { case <-ctx.Done(): return @@ -486,18 +498,20 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r } if etcdHealthy { - klog.Warningf("RBAC bootstrap hook not ready (check %d/%d), but etcd is healthy - potential deadlock", - checkCount, maxChecks) + klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock", + checkCount, maxChecks, time.Since(startTime).Round(time.Second)) } else { // etcd not healthy - not a deadlock, just waiting for etcd klog.V(4).Infof("RBAC hook waiting, etcd not yet healthy (check %d/%d)", checkCount, maxChecks) // Reset counter since this isn't a deadlock condition + // Note: wall-clock deadline (startTime) is NOT reset - flapping cannot extend indefinitely checkCount = 0 } } // Reached max checks with etcd healthy but hook not completing - deadlock detected - klog.Error("RBAC bootstrap hook deadlock confirmed: etcd healthy but hook not completing") + klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing", + time.Since(startTime).Round(time.Second)) close(deadlockDetected) } From 184e1cbc2f366a12d80f19f31555bb5e69e68647 Mon Sep 17 00:00:00 2001 From: Jon Cope Date: Thu, 16 Apr 2026 15:47:57 -0500 Subject: [PATCH 3/3] 
fix(kube-apiserver): Harden RBAC deadlock detection against false positives - Only increment checkCount when deadlock predicate confirmed (RBAC not ready AND etcd healthy) - Skip counting when RBAC probe or etcd health check errors - On wall-clock timeout, only trigger recovery if checkCount >= maxChecks - Add 1s timeout to RBAC probe to prevent hanging on unresponsive API - Add 5s timeout to systemctl stop to prevent recovery path from stalling - Extract rbacHookPollDelayStart constant for clarity Prevents false positive deadlock detection when etcd flaps or probes error, ensuring close(deadlockDetected) only fires after confirming the deadlock condition the required number of times. Related: kubernetes/kubernetes#86715, #97119 Co-Authored-By: Claude Opus 4.5 --- pkg/controllers/kube-apiserver.go | 58 ++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index c0bcd4f47f..54d6e8af9f 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -61,7 +61,8 @@ const ( // to allow for faster recovery. rbacHookDeadlockTimeout = 15 // rbacHookCheckInterval is how often to check the RBAC hook status - rbacHookCheckInterval = 2 + rbacHookPollDelayStart = 5 * time.Second + rbacHookCheckInterval = 2 // rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook // regardless of etcd health state changes. This prevents flapping from extending // detection indefinitely. @@ -452,24 +453,38 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped // 2. etcd is healthy and responsive // This indicates the circular dependency where the hook waits for API server // while API server waits for the hook. 
+// +// Closed upstream Kubernetes issues: +// https://github.com/kubernetes/kubernetes/issues/86715 +// https://github.com/kubernetes/kubernetes/issues/97119 func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient rest.Interface, deadlockDetected chan<- struct{}) { // Wait a few seconds before starting detection to allow normal startup select { case <-ctx.Done(): return - case <-time.After(5 * time.Second): + case <-time.After(rbacHookPollDelayStart): } + checkCount := 0 + maxChecks := int((rbacHookDeadlockTimeout*time.Second - rbacHookPollDelayStart) / (rbacHookCheckInterval * time.Second)) // Convert to durations first; accounts for initial delay // Track wall-clock deadline to prevent flapping from extending detection indefinitely startTime := time.Now() - checkCount := 0 - maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay - for checkCount < maxChecks { + for { // Check absolute deadline first - this cannot be reset by etcd state changes if time.Since(startTime) >= rbacHookMaxWaitDuration { klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration) - break + // Only trigger deadlock recovery if we've confirmed the predicate enough times + if checkCount >= maxChecks { + break // Fall through to close(deadlockDetected) + } + // Timeout but not confirmed deadlock - exit without triggering recovery + return + } + + // Check if we've confirmed deadlock enough times + if checkCount >= maxChecks { + break // Fall through to close(deadlockDetected) } select { @@ -478,11 +493,15 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r case <-time.After(rbacHookCheckInterval * time.Second): } - checkCount++ - // Check RBAC hook status + probeCtx, cancel := context.WithTimeout(ctx, time.Second) var status int - err := restClient.Get().AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").Do(ctx).StatusCode(&status).Error() + err := restClient.Get(). 
+ AbsPath("/readyz/poststarthook/rbac/bootstrap-roles"). + Do(probeCtx). + StatusCode(&status). + Error() + cancel() // If hook is ready, no deadlock if err == nil && status == 200 { @@ -490,14 +509,23 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r return } - // Hook not ready - check if etcd is healthy + // If RBAC probe errored, skip this iteration (don't count toward deadlock) + if err != nil { + klog.V(4).Infof("RBAC probe error (not counting toward deadlock): %v", err) + continue + } + + // Hook not ready (status != 200) - check if etcd is healthy etcdHealthy, etcdErr := isEtcdHealthy(ctx) if etcdErr != nil { - klog.V(4).Infof("Could not check etcd health: %v", etcdErr) + klog.V(4).Infof("Could not check etcd health (not counting toward deadlock): %v", etcdErr) continue } if etcdHealthy { + // Only increment when BOTH conditions are met: + // RBAC probe returned not-ready AND etcd is healthy + checkCount++ klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock", checkCount, maxChecks, time.Since(startTime).Round(time.Second)) } else { @@ -509,7 +537,7 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r } } - // Reached max checks with etcd healthy but hook not completing - deadlock detected + // Only reached when checkCount >= maxChecks (deadlock confirmed) klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing", time.Since(startTime).Round(time.Second)) close(deadlockDetected) @@ -558,7 +586,11 @@ func isEtcdHealthy(ctx context.Context) (bool, error) { func restartMicroshiftEtcdScope() error { klog.Info("Stopping microshift-etcd.scope for recovery") - stopCmd := exec.Command("systemctl", "stop", "microshift-etcd.scope") + // Set a timeout in case systemd or DBus stalls and the fail-fast recovery path hangs and Run never returns + cmdCtx, cancel := context.WithTimeout(context.Background(), 
5*time.Second) + defer cancel() + + stopCmd := exec.CommandContext(cmdCtx, "systemctl", "stop", "microshift-etcd.scope") if out, err := stopCmd.CombinedOutput(); err != nil { return fmt.Errorf("failed to stop microshift-etcd.scope: %w, output: %s", err, string(out)) }