Skip to content

Commit 184e1cb

Browse files
copejon and claude committed
fix(kube-apiserver): Harden RBAC deadlock detection against false positives
- Only increment checkCount when deadlock predicate confirmed (RBAC not ready AND etcd healthy)
- Skip counting when RBAC probe or etcd health check errors
- On wall-clock timeout, only trigger recovery if checkCount >= maxChecks
- Add 1s timeout to RBAC probe to prevent hanging on unresponsive API
- Add 5s timeout to systemctl stop to prevent recovery path from stalling
- Extract rbacHookPollDelayStart constant for clarity

Prevents false positive deadlock detection when etcd flaps or probes error, ensuring close(deadlockDetected) only fires after confirming the deadlock condition the required number of times.

Related: kubernetes/kubernetes#86715, #97119

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 6a02e62 commit 184e1cb

File tree

1 file changed

+45
-13
lines changed

1 file changed

+45
-13
lines changed

pkg/controllers/kube-apiserver.go

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ const (
6161
// to allow for faster recovery.
6262
rbacHookDeadlockTimeout = 15
6363
// rbacHookCheckInterval is how often to check the RBAC hook status
64-
rbacHookCheckInterval = 2
64+
rbacHookPollDelayStart = 5 * time.Second
65+
rbacHookCheckInterval = 2
6566
// rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook
6667
// regardless of etcd health state changes. This prevents flapping from extending
6768
// detection indefinitely.
@@ -452,24 +453,38 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
452453
// 2. etcd is healthy and responsive
453454
// This indicates the circular dependency where the hook waits for API server
454455
// while API server waits for the hook.
456+
//
457+
// Closed upstream Kubernetes issues:
458+
// https://github.com/kubernetes/kubernetes/issues/86715
459+
// https://github.com/kubernetes/kubernetes/issues/97119
455460
func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient rest.Interface, deadlockDetected chan<- struct{}) {
456461
// Wait a few seconds before starting detection to allow normal startup
457462
select {
458463
case <-ctx.Done():
459464
return
460-
case <-time.After(5 * time.Second):
465+
case <-time.After(rbacHookPollDelayStart):
461466
}
462467

468+
checkCount := 0
469+
maxChecks := int((rbacHookDeadlockTimeout - rbacHookPollDelayStart) / rbacHookCheckInterval) // Account for initial delay
463470
// Track wall-clock deadline to prevent flapping from extending detection indefinitely
464471
startTime := time.Now()
465-
checkCount := 0
466-
maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay
467472

468-
for checkCount < maxChecks {
473+
for {
469474
// Check absolute deadline first - this cannot be reset by etcd state changes
470475
if time.Since(startTime) >= rbacHookMaxWaitDuration {
471476
klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration)
472-
break
477+
// Only trigger deadlock recovery if we've confirmed the predicate enough times
478+
if checkCount >= maxChecks {
479+
break // Fall through to close(deadlockDetected)
480+
}
481+
// Timeout but not confirmed deadlock - exit without triggering recovery
482+
return
483+
}
484+
485+
// Check if we've confirmed deadlock enough times
486+
if checkCount >= maxChecks {
487+
break // Fall through to close(deadlockDetected)
473488
}
474489

475490
select {
@@ -478,26 +493,39 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
478493
case <-time.After(rbacHookCheckInterval * time.Second):
479494
}
480495

481-
checkCount++
482-
483496
// Check RBAC hook status
497+
probeCtx, cancel := context.WithTimeout(ctx, time.Second)
484498
var status int
485-
err := restClient.Get().AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").Do(ctx).StatusCode(&status).Error()
499+
err := restClient.Get().
500+
AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").
501+
Do(probeCtx).
502+
StatusCode(&status).
503+
Error()
504+
cancel()
486505

487506
// If hook is ready, no deadlock
488507
if err == nil && status == 200 {
489508
klog.V(4).Info("RBAC bootstrap hook completed successfully")
490509
return
491510
}
492511

493-
// Hook not ready - check if etcd is healthy
512+
// If RBAC probe errored, skip this iteration (don't count toward deadlock)
513+
if err != nil {
514+
klog.V(4).Infof("RBAC probe error (not counting toward deadlock): %v", err)
515+
continue
516+
}
517+
518+
// Hook not ready (status != 200) - check if etcd is healthy
494519
etcdHealthy, etcdErr := isEtcdHealthy(ctx)
495520
if etcdErr != nil {
496-
klog.V(4).Infof("Could not check etcd health: %v", etcdErr)
521+
klog.V(4).Infof("Could not check etcd health (not counting toward deadlock): %v", etcdErr)
497522
continue
498523
}
499524

500525
if etcdHealthy {
526+
// Only increment when BOTH conditions are met:
527+
// RBAC probe returned not-ready AND etcd is healthy
528+
checkCount++
501529
klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock",
502530
checkCount, maxChecks, time.Since(startTime).Round(time.Second))
503531
} else {
@@ -509,7 +537,7 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
509537
}
510538
}
511539

512-
// Reached max checks with etcd healthy but hook not completing - deadlock detected
540+
// Only reached when checkCount >= maxChecks (deadlock confirmed)
513541
klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing",
514542
time.Since(startTime).Round(time.Second))
515543
close(deadlockDetected)
@@ -558,7 +586,11 @@ func isEtcdHealthy(ctx context.Context) (bool, error) {
558586
func restartMicroshiftEtcdScope() error {
559587
klog.Info("Stopping microshift-etcd.scope for recovery")
560588

561-
stopCmd := exec.Command("systemctl", "stop", "microshift-etcd.scope")
589+
// Set a timeout in case systemd or DBus stalls and the fail-fast recovery path hangs and Run never returns
590+
cmdCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
591+
defer cancel()
592+
593+
stopCmd := exec.CommandContext(cmdCtx, "systemctl", "stop", "microshift-etcd.scope")
562594
if out, err := stopCmd.CombinedOutput(); err != nil {
563595
return fmt.Errorf("failed to stop microshift-etcd.scope: %w, output: %s", err, string(out))
564596
}

0 commit comments

Comments
 (0)