@@ -61,7 +61,8 @@ const (
6161 // to allow for faster recovery.
6262 rbacHookDeadlockTimeout = 15
6363 // rbacHookCheckInterval is how often to check the RBAC hook status
64- rbacHookCheckInterval = 2
64+ rbacHookPollDelayStart = 5 * time .Second
65+ rbacHookCheckInterval = 2
6566 // rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook
6667 // regardless of etcd health state changes. This prevents flapping from extending
6768 // detection indefinitely.
@@ -452,24 +453,38 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
452453// 2. etcd is healthy and responsive
453454// This indicates the circular dependency where the hook waits for API server
454455// while API server waits for the hook.
456+ //
457+ // Closed upstream Kubernetes issues:
458+ // https://github.com/kubernetes/kubernetes/issues/86715
459+ // https://github.com/kubernetes/kubernetes/issues/97119
455460func (s * KubeAPIServer ) detectRBACHookDeadlock (ctx context.Context , restClient rest.Interface , deadlockDetected chan <- struct {}) {
456461 // Wait a few seconds before starting detection to allow normal startup
457462 select {
458463 case <- ctx .Done ():
459464 return
460- case <- time .After (5 * time . Second ):
465+ case <- time .After (rbacHookPollDelayStart ):
461466 }
462467
468+ checkCount := 0
469+ maxChecks := int ((rbacHookDeadlockTimeout - rbacHookPollDelayStart ) / rbacHookCheckInterval ) // Account for initial delay
463470 // Track wall-clock deadline to prevent flapping from extending detection indefinitely
464471 startTime := time .Now ()
465- checkCount := 0
466- maxChecks := (rbacHookDeadlockTimeout - 5 ) / rbacHookCheckInterval // Account for initial delay
467472
468- for checkCount < maxChecks {
473+ for {
469474 // Check absolute deadline first - this cannot be reset by etcd state changes
470475 if time .Since (startTime ) >= rbacHookMaxWaitDuration {
471476 klog .Errorf ("RBAC bootstrap hook exceeded maximum wait duration of %v" , rbacHookMaxWaitDuration )
472- break
477+ // Only trigger deadlock recovery if we've confirmed the predicate enough times
478+ if checkCount >= maxChecks {
479+ break // Fall through to close(deadlockDetected)
480+ }
481+ // Timeout but not confirmed deadlock - exit without triggering recovery
482+ return
483+ }
484+
485+ // Check if we've confirmed deadlock enough times
486+ if checkCount >= maxChecks {
487+ break // Fall through to close(deadlockDetected)
473488 }
474489
475490 select {
@@ -478,26 +493,39 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
478493 case <- time .After (rbacHookCheckInterval * time .Second ):
479494 }
480495
481- checkCount ++
482-
483496 // Check RBAC hook status
497+ probeCtx , cancel := context .WithTimeout (ctx , time .Second )
484498 var status int
485- err := restClient .Get ().AbsPath ("/readyz/poststarthook/rbac/bootstrap-roles" ).Do (ctx ).StatusCode (& status ).Error ()
499+ err := restClient .Get ().
500+ AbsPath ("/readyz/poststarthook/rbac/bootstrap-roles" ).
501+ Do (probeCtx ).
502+ StatusCode (& status ).
503+ Error ()
504+ cancel ()
486505
487506 // If hook is ready, no deadlock
488507 if err == nil && status == 200 {
489508 klog .V (4 ).Info ("RBAC bootstrap hook completed successfully" )
490509 return
491510 }
492511
493- // Hook not ready - check if etcd is healthy
512+ // If RBAC probe errored, skip this iteration (don't count toward deadlock)
513+ if err != nil {
514+ klog .V (4 ).Infof ("RBAC probe error (not counting toward deadlock): %v" , err )
515+ continue
516+ }
517+
518+ // Hook not ready (status != 200) - check if etcd is healthy
494519 etcdHealthy , etcdErr := isEtcdHealthy (ctx )
495520 if etcdErr != nil {
496- klog .V (4 ).Infof ("Could not check etcd health: %v" , etcdErr )
521+ klog .V (4 ).Infof ("Could not check etcd health (not counting toward deadlock) : %v" , etcdErr )
497522 continue
498523 }
499524
500525 if etcdHealthy {
526+ // Only increment when BOTH conditions are met:
527+ // RBAC probe returned not-ready AND etcd is healthy
528+ checkCount ++
501529 klog .Warningf ("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock" ,
502530 checkCount , maxChecks , time .Since (startTime ).Round (time .Second ))
503531 } else {
@@ -509,7 +537,7 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
509537 }
510538 }
511539
512- // Reached max checks with etcd healthy but hook not completing - deadlock detected
540+ // Only reached when checkCount >= maxChecks (deadlock confirmed)
513541 klog .Errorf ("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing" ,
514542 time .Since (startTime ).Round (time .Second ))
515543 close (deadlockDetected )
@@ -558,7 +586,11 @@ func isEtcdHealthy(ctx context.Context) (bool, error) {
558586func restartMicroshiftEtcdScope () error {
559587 klog .Info ("Stopping microshift-etcd.scope for recovery" )
560588
561- stopCmd := exec .Command ("systemctl" , "stop" , "microshift-etcd.scope" )
589+ // Set a timeout so that a stalled systemd/DBus call cannot hang the fail-fast recovery path and prevent Run from ever returning
590+ cmdCtx , cancel := context .WithTimeout (context .Background (), 5 * time .Second )
591+ defer cancel ()
592+
593+ stopCmd := exec .CommandContext (cmdCtx , "systemctl" , "stop" , "microshift-etcd.scope" )
562594 if out , err := stopCmd .CombinedOutput (); err != nil {
563595 return fmt .Errorf ("failed to stop microshift-etcd.scope: %w, output: %s" , err , string (out ))
564596 }
0 commit comments