-
Notifications
You must be signed in to change notification settings - Fork 4.8k
NO-JIRA: Skip CO condition tests on SNO #30775
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,7 @@ import ( | |
|
|
||
| // exceptionCallback consumes a suspicious condition and returns an | ||
| // exception string if does not think the condition should be fatal. | ||
| type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string | ||
| type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval) string | ||
|
|
||
| type upgradeWindowHolder struct { | ||
| startInterval *monitorapi.Interval | ||
|
|
@@ -46,14 +46,8 @@ func checkAuthenticationAvailableExceptions(condition *configv1.ClusterOperatorS | |
| return false | ||
| } | ||
|
|
||
| func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase { | ||
| topology, err := getControlPlaneTopology(clientConfig) | ||
| if err != nil { | ||
| logrus.Warnf("Error checking for ControlPlaneTopology configuration (unable to make topology exceptions): %v", err) | ||
| } | ||
| isSingleNode := topology == configv1.SingleReplicaTopologyMode | ||
|
|
||
| except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval, clientConfig *rest.Config) string { | ||
| func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, topology configv1.TopologyMode) []*junitapi.JUnitTestCase { | ||
| except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, _ monitorapi.Interval) string { | ||
| if condition.Status == configv1.ConditionTrue { | ||
| if condition.Type == configv1.OperatorAvailable { | ||
| return fmt.Sprintf("%s=%s is the happy case", condition.Type, condition.Status) | ||
|
|
@@ -64,30 +58,6 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien | |
| } | ||
| } | ||
|
|
||
| if isSingleNode { | ||
| switch operator { | ||
| case "dns": | ||
| if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && | ||
| strings.Contains(condition.Message, `DNS "default" is unavailable.`) { | ||
| return "dns operator is allowed to have Available=False due to serial taint tests on single node" | ||
| } | ||
| if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue && | ||
| strings.Contains(condition.Message, `DNS default is degraded`) { | ||
| return "dns operator is allowed to have Degraded=True due to serial taint tests on single node" | ||
| } | ||
| case "openshift-apiserver": | ||
| if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && | ||
| strings.Contains(condition.Message, `connect: connection refused`) { | ||
| return "openshift apiserver operator is allowed to have Available=False due kube-apiserver force rollout test on single node" | ||
| } | ||
| case "csi-snapshot-controller": | ||
| if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && | ||
| strings.Contains(condition.Message, `Waiting for Deployment`) { | ||
| return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on single node" | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // For the non-upgrade case, if any operator has Available=False, fail the test. | ||
| if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse { | ||
| if operator == "authentication" { | ||
|
|
@@ -141,7 +111,7 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien | |
| return "We are not worried about other operator condition blips for stable-system tests yet." | ||
| } | ||
|
|
||
| return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, false) | ||
| return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, false, topology) | ||
| } | ||
|
|
||
| func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, error) { | ||
|
|
@@ -260,26 +230,17 @@ func hasUpgradeFailedEvent(eventList monitorapi.Intervals) bool { | |
| return false | ||
| } | ||
|
|
||
| func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase { | ||
| func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config, topology configv1.TopologyMode) []*junitapi.JUnitTestCase { | ||
| upgradeWindows := getUpgradeWindows(events) | ||
| topology, err := getControlPlaneTopology(clientConfig) | ||
| if err != nil { | ||
| logrus.Warnf("Error checking for ControlPlaneTopology configuration on upgrade (unable to make topology exceptions): %v", err) | ||
| } | ||
|
|
||
| isSingleNode := topology == configv1.SingleReplicaTopologyMode | ||
| isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode | ||
| upgradeFailed := hasUpgradeFailedEvent(events) | ||
|
|
||
| except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) string { | ||
| except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval) string { | ||
| // When an upgrade was recorded as failed, we will not care about the operator state transitions | ||
| if upgradeFailed { | ||
| return "upgrade failed, not recording unexpected operator transitions as failure" | ||
| } | ||
| // SingleNode is expected to go Available=False and Degraded=True for most / all operators during upgrade | ||
| if isSingleNode { | ||
| return "single node is allowed to be unavailable/degraded during upgrades" | ||
| } | ||
|
|
||
| if condition.Status == configv1.ConditionTrue { | ||
| if condition.Type == configv1.OperatorAvailable { | ||
|
|
@@ -430,9 +391,6 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf | |
| } | ||
| case "kube-apiserver": | ||
| if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue { | ||
| if isSingleNode && condition.Reason == "NodeInstaller_InstallerPodFailed" { | ||
| return "https://issues.redhat.com/browse/OCPBUGS-38678" | ||
| } | ||
| return "https://issues.redhat.com/browse/OCPBUGS-38661" | ||
| } | ||
| case "kube-controller-manager": | ||
|
|
@@ -455,7 +413,7 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf | |
| return "" | ||
| } | ||
|
|
||
| return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, clientConfig, true) | ||
| return testOperatorStateTransitions(events, []configv1.ClusterStatusConditionType{configv1.OperatorAvailable, configv1.OperatorDegraded}, except, true, topology) | ||
| } | ||
|
|
||
| func isVSphere(config *rest.Config) (bool, error) { | ||
|
|
@@ -506,7 +464,7 @@ func checkReplicas(namespace string, operator string, clientConfig *rest.Config) | |
| return 0, fmt.Errorf("Error fetching replicas") | ||
| } | ||
|
|
||
| func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, clientConfig *rest.Config, upgrade bool) []*junitapi.JUnitTestCase { | ||
| func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []configv1.ClusterStatusConditionType, except exceptionCallback, upgrade bool, topology configv1.TopologyMode) []*junitapi.JUnitTestCase { | ||
| ret := []*junitapi.JUnitTestCase{} | ||
|
|
||
| var start, stop time.Time | ||
|
|
@@ -527,14 +485,22 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] | |
| bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName) | ||
| testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should not change condition/%v", bzComponent, operatorName, conditionType) | ||
| operatorEvents := eventsByOperator[operatorName] | ||
| if topology == configv1.SingleReplicaTopologyMode { | ||
| ret = append(ret, &junitapi.JUnitTestCase{ | ||
| Name: testName, | ||
| SkipMessage: &junitapi.SkipMessage{ | ||
| Message: "Test skipped on a single-node cluster", | ||
| }, | ||
| }) | ||
| continue | ||
| } | ||
| if len(operatorEvents) == 0 { | ||
| ret = append(ret, &junitapi.JUnitTestCase{ | ||
| Name: testName, | ||
| Duration: duration, | ||
| }) | ||
| continue | ||
| } | ||
|
|
||
| excepted := []string{} | ||
| fatal := []string{} | ||
|
|
||
|
|
@@ -570,7 +536,7 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] | |
| if len(concurrentE2E) > 0 { | ||
| failure = fmt.Sprintf("%s\n%d tests failed during this blip (%v to %v): %v", failure, len(concurrentE2E), eventInterval.From, eventInterval.To, strings.Join(concurrentE2E, "\n")) | ||
| } | ||
| exception := except(operatorName, condition, eventInterval, clientConfig) | ||
| exception := except(operatorName, condition, eventInterval) | ||
| if exception == "" { | ||
| fatal = append(fatal, failure) | ||
| } else { | ||
|
|
@@ -619,22 +585,12 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] | |
| return ret | ||
| } | ||
|
|
||
| func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, clientConfig *rest.Config) []*junitapi.JUnitTestCase { | ||
| func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, topology configv1.TopologyMode) []*junitapi.JUnitTestCase { | ||
| var ret []*junitapi.JUnitTestCase | ||
| upgradeWindows := getUpgradeWindows(events) | ||
| multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1 | ||
|
|
||
| isTwoNode := false | ||
| isSingleNode := false | ||
| if clientConfig != nil { | ||
| topology, err := getControlPlaneTopology(clientConfig) | ||
| if err != nil { | ||
| logrus.Warnf("Error checking for ControlPlaneTopology configuration for MCO co-progressing monitor (unable to apply topology exceptions): %v", err) | ||
| } else { | ||
| isTwoNode = topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode | ||
| isSingleNode = topology == configv1.SingleReplicaTopologyMode | ||
| } | ||
| } | ||
| isTwoNode := topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode | ||
|
|
||
| var machineConfigProgressingStart time.Time | ||
| var eventsInUpgradeWindows monitorapi.Intervals | ||
|
|
@@ -715,6 +671,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, | |
| mcTestCase.SkipMessage = &junitapi.SkipMessage{ | ||
| Message: "Test skipped in a patch-level upgrade test", | ||
| } | ||
| } else if topology == configv1.SingleReplicaTopologyMode { | ||
| mcTestCase.SkipMessage = &junitapi.SkipMessage{ | ||
| Message: "Test skipped on a single-node cluster", | ||
| } | ||
| } else if t, ok := coProgressingStart[operatorName]; !ok || t.IsZero() { | ||
| output := fmt.Sprintf("clusteroperator/%s was never Progressing=True during the upgrade window from %s to %s", operatorName, start.Format(time.RFC3339), stop.Format(time.RFC3339)) | ||
| exception = except(operatorName, "") | ||
|
|
@@ -792,22 +752,6 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, | |
| if reason == "_ManagedDeploymentsAvailable" { | ||
| return "https://issues.redhat.com/browse/OCPBUGS-62633" | ||
| } | ||
| case "olm": | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have examples of Is there any example for I was trying to understand this in Slack but got distracted. The following is for my own memory, not relevant to the above question. OCPBUGS-63672 -> (by this comment) OCPBUGS-67210 is confusing because the condition to check went from Progressing to Available. OCPBUGS-67210 should be about Progressing as well. |
||
| // CatalogdDeploymentCatalogdControllerManager_Deploying | ||
| // OperatorcontrollerDeploymentOperatorControllerControllerManager_Deploying | ||
| // On HA, cluster-olm-operator PR #202 (2 replicas + PDB) prevents this. | ||
| // On SNO there is only one replica and the node reboot restarts all pods simultaneously. | ||
| if strings.HasSuffix(reason, "ControllerManager_Deploying") && isSingleNode { | ||
| return "https://issues.redhat.com/browse/OCPBUGS-62635" | ||
| } | ||
| case "operator-lifecycle-manager-packageserver": | ||
| // On HA, isAPIServiceBackendDisrupted() in operator-framework-olm detects terminating | ||
| // pods and returns RetryableError to prevent the CSV phase from changing to Failed. | ||
| // On SNO the OS-level node reboot kills all pods simultaneously so no terminating pod | ||
| // is ever observed; the detection does not fire and Progressing=True is still set. | ||
| if reason == "" && isSingleNode { | ||
| return "https://issues.redhat.com/browse/OCPBUGS-63672" | ||
| } | ||
| case "openshift-apiserver": | ||
| if isTwoNode && reason == "OperatorConfig_NewGeneration" { | ||
| return "openshift-apiserver operator may reconcile openshiftapiserveroperatorconfigs (OperatorConfig_NewGeneration) during DualReplica upgrades while machine-config is progressing" | ||
|
|
@@ -821,6 +765,15 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, | |
| bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName) | ||
| testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName) | ||
| operatorEvents := eventsByOperator[operatorName] | ||
| if topology == configv1.SingleReplicaTopologyMode { | ||
| ret = append(ret, &junitapi.JUnitTestCase{ | ||
| Name: testName, | ||
| SkipMessage: &junitapi.SkipMessage{ | ||
| Message: "Test skipped on a single-node cluster", | ||
| }, | ||
| }) | ||
| continue | ||
| } | ||
| if len(operatorEvents) == 0 { | ||
| ret = append(ret, &junitapi.JUnitTestCase{ | ||
| Name: testName, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.