@@ -23,6 +23,7 @@ import (
2323 "io"
2424 "net"
2525 "os"
26+ "os/exec"
2627 "path/filepath"
2728 "strconv"
2829 "strings"
@@ -55,6 +56,12 @@ import (
5556
5657const (
5758 kubeAPIStartupTimeout = 60
59+ // rbacHookDeadlockTimeout is the time, in seconds, to wait for the RBAC
60+ // bootstrap hook before declaring a deadlock. It is shorter than
61+ // kubeAPIStartupTimeout (also in seconds) to allow for faster recovery.
62+ rbacHookDeadlockTimeout = 15
63+ // rbacHookCheckInterval is how often, in seconds, the RBAC hook status is checked.
64+ rbacHookCheckInterval = 2
5865)
5966
6067var (
@@ -348,7 +355,13 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
348355 return err
349356 }
350357
351- // run readiness check
358+ // Channel to signal RBAC hook deadlock detection
359+ rbacDeadlockDetected := make (chan struct {})
360+
361+ // Run RBAC hook deadlock detector
362+ go s .detectRBACHookDeadlock (ctx , restClient , rbacDeadlockDetected )
363+
364+ // Run standard readiness check
352365 go func () {
353366 err := wait .PollUntilContextTimeout (ctx , time .Second , kubeAPIStartupTimeout * time .Second , true , func (ctx context.Context ) (bool , error ) {
354367 var status int
@@ -420,7 +433,127 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
420433 return err
421434 case perr := <- panicChannel :
422435 panic (perr )
436+ case <- rbacDeadlockDetected :
437+ klog .Error ("RBAC bootstrap hook deadlock detected - restarting microshift-etcd.scope to recover" )
438+ if err := restartMicroshiftEtcdScope (); err != nil {
439+ klog .Errorf ("Failed to restart microshift-etcd.scope: %v" , err )
440+ }
441+ return fmt .Errorf ("RBAC bootstrap hook deadlock detected after %d seconds" , rbacHookDeadlockTimeout )
442+ }
443+ }
444+
445+ // detectRBACHookDeadlock monitors the RBAC bootstrap hook status and detects deadlock conditions.
446+ // A deadlock is detected when:
447+ // 1. The RBAC hook is not completing (stuck in "not finished" state)
448+ // 2. etcd is healthy and responsive
449+ // This indicates the circular dependency where the hook waits for API server
450+ // while API server waits for the hook.
451+ func (s * KubeAPIServer ) detectRBACHookDeadlock (ctx context.Context , restClient rest.Interface , deadlockDetected chan <- struct {}) {
452+ // Wait a few seconds before starting detection to allow normal startup
453+ select {
454+ case <- ctx .Done ():
455+ return
456+ case <- time .After (5 * time .Second ):
423457 }
458+
459+ checkCount := 0
460+ maxChecks := (rbacHookDeadlockTimeout - 5 ) / rbacHookCheckInterval // Account for initial delay
461+
462+ for checkCount < maxChecks {
463+ select {
464+ case <- ctx .Done ():
465+ return
466+ case <- time .After (rbacHookCheckInterval * time .Second ):
467+ }
468+
469+ checkCount ++
470+
471+ // Check RBAC hook status
472+ var status int
473+ err := restClient .Get ().AbsPath ("/readyz/poststarthook/rbac/bootstrap-roles" ).Do (ctx ).StatusCode (& status ).Error ()
474+
475+ // If hook is ready, no deadlock
476+ if err == nil && status == 200 {
477+ klog .V (4 ).Info ("RBAC bootstrap hook completed successfully" )
478+ return
479+ }
480+
481+ // Hook not ready - check if etcd is healthy
482+ etcdHealthy , etcdErr := isEtcdHealthy (ctx )
483+ if etcdErr != nil {
484+ klog .V (4 ).Infof ("Could not check etcd health: %v" , etcdErr )
485+ continue
486+ }
487+
488+ if etcdHealthy {
489+ klog .Warningf ("RBAC bootstrap hook not ready (check %d/%d), but etcd is healthy - potential deadlock" ,
490+ checkCount , maxChecks )
491+ } else {
492+ // etcd not healthy - not a deadlock, just waiting for etcd
493+ klog .V (4 ).Infof ("RBAC hook waiting, etcd not yet healthy (check %d/%d)" , checkCount , maxChecks )
494+ // Reset counter since this isn't a deadlock condition
495+ checkCount = 0
496+ }
497+ }
498+
499+ // Reached max checks with etcd healthy but hook not completing - deadlock detected
500+ klog .Error ("RBAC bootstrap hook deadlock confirmed: etcd healthy but hook not completing" )
501+ close (deadlockDetected )
502+ }
503+
504+ // isEtcdHealthy checks if etcd is responsive by attempting to connect and get status.
505+ func isEtcdHealthy (ctx context.Context ) (bool , error ) {
506+ certsDir := cryptomaterial .CertsDirectory (config .DataDir )
507+ etcdAPIServerClientCertDir := cryptomaterial .EtcdAPIServerClientCertDir (certsDir )
508+
509+ tlsInfo := transport.TLSInfo {
510+ CertFile : cryptomaterial .ClientCertPath (etcdAPIServerClientCertDir ),
511+ KeyFile : cryptomaterial .ClientKeyPath (etcdAPIServerClientCertDir ),
512+ TrustedCAFile : cryptomaterial .CACertPath (cryptomaterial .EtcdSignerDir (certsDir )),
513+ }
514+ tlsConfig , err := tlsInfo .ClientConfig ()
515+ if err != nil {
516+ return false , fmt .Errorf ("failed to create TLS config: %w" , err )
517+ }
518+
519+ // Use a short timeout for health check
520+ checkCtx , cancel := context .WithTimeout (ctx , 2 * time .Second )
521+ defer cancel ()
522+
523+ client , err := clientv3 .New (clientv3.Config {
524+ Endpoints : []string {"https://localhost:2379" },
525+ DialTimeout : 1 * time .Second ,
526+ TLS : tlsConfig ,
527+ Context : checkCtx ,
528+ })
529+ if err != nil {
530+ return false , fmt .Errorf ("failed to create etcd client: %w" , err )
531+ }
532+ defer func () { _ = client .Close () }()
533+
534+ _ , err = client .Status (checkCtx , "localhost:2379" )
535+ if err != nil {
536+ return false , nil // etcd not healthy, but not an error condition
537+ }
538+
539+ return true , nil
540+ }
541+
542+ // restartMicroshiftEtcdScope restarts the microshift-etcd.scope to recover from deadlock.
543+ // This forces a clean restart of etcd which can help break the circular dependency.
544+ func restartMicroshiftEtcdScope () error {
545+ klog .Info ("Stopping microshift-etcd.scope for recovery" )
546+
547+ stopCmd := exec .Command ("systemctl" , "stop" , "microshift-etcd.scope" )
548+ if out , err := stopCmd .CombinedOutput (); err != nil {
549+ return fmt .Errorf ("failed to stop microshift-etcd.scope: %w, output: %s" , err , string (out ))
550+ }
551+
552+ // Wait briefly for cleanup
553+ time .Sleep (1 * time .Second )
554+
555+ klog .Info ("microshift-etcd.scope stopped - MicroShift will restart" )
556+ return nil
424557}
425558
426559func discoverEtcdServers (ctx context.Context , kubeconfigPath string ) ([]string , error ) {
0 commit comments