Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
"k8s.io/client-go/rest"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

type legacyMonitorTests struct {
Expand Down Expand Up @@ -90,15 +91,20 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
junits = append(junits, testOperatorOSUpdateStartedEventRecorded(finalIntervals, w.adminRESTConfig)...)

isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
topology, err := getControlPlaneTopology(w.adminRESTConfig)
if err != nil {
e2e.Logf("failed to get control plane topology: %v", err)
}

if isUpgrade {
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig, topology)...)
level, err := getUpgradeLevel(w.adminRESTConfig)
if err != nil || level == unknownUpgradeLevel {
return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
}
junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, w.adminRESTConfig)...)
junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, topology)...)
} else {
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, topology)...)
}

return junits, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (

// exceptionCallback consumes a suspicious condition and returns an
// exception string if does not think the condition should be fatal.
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval) string

type upgradeWindowHolder struct {
startInterval *monitorapi.Interval
Expand All @@ -46,14 +46,8 @@ func checkAuthenticationAvailableExceptions(condition *configv1.ClusterOperatorS
return false
}

func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
topology, err := getControlPlaneTopology(clientConfig)
if err != nil {
logrus.Warnf("Error checking for ControlPlaneTopology configuration (unable to make topology exceptions): %v", err)
}
isSingleNode := topology == configv1.SingleReplicaTopologyMode

except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) string {
func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval) string {
if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status)
Expand All @@ -64,30 +58,6 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
}
}

if isSingleNode {
switch operator {
case "dns":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
strings.Contains(condition.Message, `DNS "default" is unavailable.`) {
return "dns operator is allowed to have Available=False due to serial taint tests on single node"
}
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue &&
strings.Contains(condition.Message, `DNS default is degraded`) {
return "dns operator is allowed to have Degraded=True due to serial taint tests on single node"
}
case "openshift-apiserver":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
strings.Contains(condition.Message, `connect: connection refused`) {
return "openshift apiserver operator is allowed to have Available=False due kube-apiserver force rollout test on single node"
}
case "csi-snapshot-controller":
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
strings.Contains(condition.Message, `Waiting for Deployment`) {
return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on single node"
}
}
}

// For the non-upgrade case, if any operator has Available=False, fail the test.
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
if operator == "authentication" {
Expand Down Expand Up @@ -141,7 +111,7 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
return "We are not worried about other operator condition blips for stable-system tests yet."
}

return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false)
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, false, topology)
}

func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) {
Expand Down Expand Up @@ -260,26 +230,17 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool {
return false
}

func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
upgradeWindows := getUpgradeWindows(events)
topology, err := getControlPlaneTopology(clientConfig)
if err != nil {
logrus.Warnf("Error checking for ControlPlaneTopology configuration on upgrade (unable to make topology exceptions): %v", err)
}

isSingleNode := topology == configv1.SingleReplicaTopologyMode
isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
upgradeFailed := hasUpgradeFailedEvent(events)

except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string {
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval) string {
// When an upgrade was recorded as failed, we will not care about the operator state transitions
if upgradeFailed {
return "upgrade failed, not recording unexpected operator transitions as failure"
}
// SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade
if isSingleNode {
return "single node is allowed to be unavailable/degraded during upgrades"
}

if condition.Status == configv1.ConditionTrue {
if condition.Type == configv1.OperatorAvailable {
Expand Down Expand Up @@ -430,9 +391,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
}
case "kube-apiserver":
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
if isSingleNode && condition.Reason == "NodeInstaller_InstallerPodFailed" {
return "https://issues.redhat.com/browse/OCPBUGS-38678"
}
return "https://issues.redhat.com/browse/OCPBUGS-38661"
}
case "kube-controller-manager":
Expand All @@ -455,7 +413,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
return ""
}

return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true)
return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, true, topology)
}

func isVSphere(config *rest.Config) (bool, error) {
Expand Down Expand Up @@ -506,7 +464,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config)
return 0, fmt.Errorf("Error fetching replicas")
}

func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade bool) []*junitapi.JUnitTestCase {
func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, upgrade bool, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
ret := []*junitapi.JUnitTestCase{}

var start, stop time.Time
Expand All @@ -527,14 +485,22 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType)
operatorEvents := eventsByOperator[operatorName]
if topology == configv1.SingleReplicaTopologyMode {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
SkipMessage: &junitapi.SkipMessage{
Message: "Test skipped on a single-node cluster",
},
})
continue
}
if len(operatorEvents) == 0 {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
Duration: duration,
})
continue
}

excepted := []string{}
fatal := []string{}

Expand Down Expand Up @@ -570,7 +536,7 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
if len(concurrentE2E) > 0 {
failure = fmt.Sprintf("%s\n%d tests failed during this blip (%v to %v): %v", failure, len(concurrentE2E), eventInterval.From, eventInterval.To, strings.Join(concurrentE2E, "\n"))
}
exception := except(operatorName, condition, eventInterval, clientConfig)
exception := except(operatorName, condition, eventInterval)
if exception == "" {
fatal = append(fatal, failure)
} else {
Expand Down Expand Up @@ -619,22 +585,12 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
return ret
}

func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, topology configv1.TopologyMode) []*junitapi.JUnitTestCase {
var ret []*junitapi.JUnitTestCase
upgradeWindows := getUpgradeWindows(events)
multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1

isTwoNode := false
isSingleNode := false
if clientConfig != nil {
topology, err := getControlPlaneTopology(clientConfig)
if err != nil {
logrus.Warnf("Error checking for ControlPlaneTopology configuration for MCO co-progressing monitor (unable to apply topology exceptions): %v", err)
} else {
isTwoNode = topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
isSingleNode = topology == configv1.SingleReplicaTopologyMode
}
}
isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode

var machineConfigProgressingStart time.Time
var eventsInUpgradeWindows monitorapi.Intervals
Expand Down Expand Up @@ -715,6 +671,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
mcTestCase.SkipMessage = &junitapi.SkipMessage{
Message: "Test skipped in a patch-level upgrade test",
}
} else if topology == configv1.SingleReplicaTopologyMode {
mcTestCase.SkipMessage = &junitapi.SkipMessage{
Message: "Test skipped on a single-node cluster",
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
} else if t, ok := coProgressingStart[operatorName]; !ok || t.IsZero() {
output := fmt.Sprintf("clusteroperator/%s was never Progressing=True during the upgrade window from %s to %s", operatorName, start.Format(time.RFC3339), stop.Format(time.RFC3339))
exception = except(operatorName, "")
Expand Down Expand Up @@ -792,22 +752,6 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
if reason == "_ManagedDeploymentsAvailable" {
return "https://issues.redhat.com/browse/OCPBUGS-62633"
}
case "olm":
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that the exceptions about SNO coming from #31172 have been removed.

/cc @tmshort

Copy link
Copy Markdown
Member Author

@hongkailiu hongkailiu May 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have examples of co/operator-lifecycle-manager-packageserver failing on SNO. It is aligned with https://redhat.atlassian.net/browse/OCPBUGS-63672 is fixed except on SNO https://redhat.atlassian.net/browse/OCPBUGS-67210.

Is there any example for co/olm failing on SNO? If yes, that does not explain the high pass rate in Sippy.

I was trying to understand this in Slack but got distracted.


The following is for my own memory, not relevant to the above question.

OCPBUGS-63672 -> (by this comment) OCPBUGS-67210 is confusing because the condition to check went from Progressing to Available.
The reason is that I copied from the output from wrong test case in this comment because the linked job such as this one in my previous comment contains two flaky cases about clusteroperator/operator-lifecycle-manager-packageserver. One for Available and the other for Progressing.

OCPBUGS-67210 should be about Progressing as well.

// CatalogdDeploymentCatalogdControllerManager_Deploying
// OperatorcontrollerDeploymentOperatorControllerControllerManager_Deploying
// On HA, cluster-olm-operator PR #202 (2 replicas + PDB) prevents this.
// On SNO there is only one replica and the node reboot restarts all pods simultaneously.
if strings.HasSuffix(reason, "ControllerManager_Deploying") && isSingleNode {
return "https://issues.redhat.com/browse/OCPBUGS-62635"
}
case "operator-lifecycle-manager-packageserver":
// On HA, isAPIServiceBackendDisrupted() in operator-framework-olm detects terminating
// pods and returns RetryableError to prevent the CSV phase from changing to Failed.
// On SNO the OS-level node reboot kills all pods simultaneously so no terminating pod
// is ever observed; the detection does not fire and Progressing=True is still set.
if reason == "" && isSingleNode {
return "https://issues.redhat.com/browse/OCPBUGS-63672"
}
case "openshift-apiserver":
if isTwoNode && reason == "OperatorConfig_NewGeneration" {
return "openshift-apiserver operator may reconcile openshiftapiserveroperatorconfigs (OperatorConfig_NewGeneration) during DualReplica upgrades while machine-config is progressing"
Expand All @@ -821,6 +765,15 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName)
operatorEvents := eventsByOperator[operatorName]
if topology == configv1.SingleReplicaTopologyMode {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
SkipMessage: &junitapi.SkipMessage{
Message: "Test skipped on a single-node cluster",
},
})
continue
}
if len(operatorEvents) == 0 {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
Expand Down
18 changes: 16 additions & 2 deletions test/extended/machines/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
o "github.com/onsi/gomega"
configv1 "github.com/openshift/api/config/v1"
configclient "github.com/openshift/client-go/config/clientset/versioned"
bmhelper "github.com/openshift/origin/test/extended/baremetal"
configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
"github.com/stretchr/objx"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -24,6 +24,9 @@ import (
"k8s.io/client-go/scale"
e2e "k8s.io/kubernetes/test/e2e/framework"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"

bmhelper "github.com/openshift/origin/test/extended/baremetal"
exutil "github.com/openshift/origin/test/extended/util"
)

const (
Expand Down Expand Up @@ -283,7 +286,18 @@ var _ = g.Describe("[sig-cluster-lifecycle][Feature:Machines][Serial] Managed cl
violations = append(violations, operator)
}
}
o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)

cfg, err := e2e.LoadConfig()
o.Expect(err).NotTo(o.HaveOccurred())
configV1Client, err := configv1client.NewForConfig(cfg)
o.Expect(err).NotTo(o.HaveOccurred())
topo, err := exutil.GetControlPlaneTopologyFromConfigClient(configV1Client)
if err != nil {
e2e.Logf("failed to get control plane topology: %v", err)
}
if topo != nil && *topo != configv1.SingleReplicaTopologyMode {
o.Expect(violations).To(o.BeEmpty(), "those cluster operators left Progressing=False while cluster was scaling: %v", violations)
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
})

// The 30m timeout is essentially required by the baremetal platform environment,
Expand Down