Skip to content

Commit 6de022e

Browse files
copejon and claude committed
fix(kube-apiserver): Add wall-clock deadline to prevent etcd flapping from extending deadlock detection
Add rbacHookMaxWaitDuration (30s) as an absolute deadline that cannot be reset by etcd health state changes. This prevents a flapping etcd from extending the deadlock detection indefinitely.

The existing checkCount logic is preserved for detecting deadlock when etcd is consistently healthy, but the wall-clock deadline provides a hard upper bound regardless of etcd state transitions.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent db029bd commit 6de022e

File tree

1 file changed

+17
-3
lines changed

1 file changed

+17
-3
lines changed

pkg/controllers/kube-apiserver.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ const (
6262
rbacHookDeadlockTimeout = 15
6363
// rbacHookCheckInterval is how often to check the RBAC hook status
6464
rbacHookCheckInterval = 2
65+
// rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook
66+
// regardless of etcd health state changes. This prevents flapping from extending
67+
// detection indefinitely.
68+
rbacHookMaxWaitDuration = 30 * time.Second
6569
)
6670

6771
var (
@@ -456,10 +460,18 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
456460
case <-time.After(5 * time.Second):
457461
}
458462

463+
// Track wall-clock deadline to prevent flapping from extending detection indefinitely
464+
startTime := time.Now()
459465
checkCount := 0
460466
maxChecks := (rbacHookDeadlockTimeout - 5) / rbacHookCheckInterval // Account for initial delay
461467

462468
for checkCount < maxChecks {
469+
// Check absolute deadline first - this cannot be reset by etcd state changes
470+
if time.Since(startTime) >= rbacHookMaxWaitDuration {
471+
klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration)
472+
break
473+
}
474+
463475
select {
464476
case <-ctx.Done():
465477
return
@@ -486,18 +498,20 @@ func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient r
486498
}
487499

488500
if etcdHealthy {
489-
klog.Warningf("RBAC bootstrap hook not ready (check %d/%d), but etcd is healthy - potential deadlock",
490-
checkCount, maxChecks)
501+
klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock",
502+
checkCount, maxChecks, time.Since(startTime).Round(time.Second))
491503
} else {
492504
// etcd not healthy - not a deadlock, just waiting for etcd
493505
klog.V(4).Infof("RBAC hook waiting, etcd not yet healthy (check %d/%d)", checkCount, maxChecks)
494506
// Reset counter since this isn't a deadlock condition
507+
// Note: wall-clock deadline (startTime) is NOT reset - flapping cannot extend indefinitely
495508
checkCount = 0
496509
}
497510
}
498511

499512
// Loop exited: either max checks were reached with etcd healthy, or the wall-clock deadline expired - in both cases the hook is treated as deadlocked
500-
klog.Error("RBAC bootstrap hook deadlock confirmed: etcd healthy but hook not completing")
513+
klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing",
514+
time.Since(startTime).Round(time.Second))
501515
close(deadlockDetected)
502516
}
503517

0 commit comments

Comments
 (0)