-
Notifications
You must be signed in to change notification settings - Fork 28
feat(kcl/k8s): add ValidateWorkloadHealth step #1234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
6e376ae
4e81ba1
026d0a2
64d4e5a
c9fa836
a4d23f2
a32db54
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,179 @@ | ||
| import azure_pipelines.ap.steps | ||
| import lib.steps.azure | ||
|
|
||
| # ValidateWorkloadHealth validates that a Kubernetes workload (Deployment, | ||
| # StatefulSet, DaemonSet, Job, or Pod) is healthy. It first checks the | ||
| # resource/pod status, then scans the full container logs for error patterns. | ||
| # Requires kubectl on the agent and a kubeconfig already in place (e.g. via | ||
| # GetCredentials). | ||
| # | ||
| # For Jobs, healthy means succeeded >= completions (all done) OR active>0 | ||
| # without a Failed condition (still in progress). Anything else fails. | ||
| # | ||
| # kind="*" is a wildcard: enumerate every Deployment/StatefulSet/DaemonSet/Job | ||
| # in `namespace` and check each, then check every pod. Use it for namespaces | ||
| # you own; avoid for system namespaces. | ||
| ValidateWorkloadHealth = lambda \ | ||
| serviceConnection: str, \ | ||
| kind: "deployment" | "statefulset" | "daemonset" | "job" | "pod" | "*", \ | ||
| name: str = "", \ | ||
| namespace: str = "default", \ | ||
| errorPatterns: str = "panic|fatal", \ | ||
| displayName: str = "", \ | ||
| continueOnError: bool = Undefined \ | ||
| -> steps.Step { | ||
| assert (kind == "*" and name == "") or (kind != "*" and name != ""), \ | ||
| "name must be empty when kind=='*' and non-empty otherwise (got kind='${kind}', name='${name}')" | ||
| assert "'" not in errorPatterns, \ | ||
| "errorPatterns must not contain single quotes (got ${errorPatterns})" | ||
|
|
||
| defaultTitle = "Validate all workloads in ${namespace}" if kind == "*" else "Validate ${kind}/${name}" | ||
| title = displayName if displayName else defaultTitle | ||
| desc = "all workloads in ${namespace}" if kind == "*" else "${kind} ${namespace}/${name}" | ||
| script = """ | ||
| set +e | ||
|
|
||
| failures=0 | ||
| fail() { echo "FAIL: $*"; failures=$((failures + 1)); } | ||
|
|
||
| # check_controllers KIND DESIRED_PATH READY_PATH [NAME] | ||
| # If NAME is given, check just that one; otherwise check every instance of KIND. | ||
| check_controllers() { | ||
|
wonderyl marked this conversation as resolved.
|
||
| local kind=$1 desired_path=$2 ready_path=$3 name=$4 | ||
| local n desired ready names | ||
| if [ -n "$name" ]; then | ||
| names="$name" | ||
| else | ||
| names=$(kubectl -n "${namespace}" get "$kind" -o jsonpath='{.items[*].metadata.name}') | ||
| fi | ||
| for n in $names; do | ||
| desired=$(kubectl -n "${namespace}" get "$kind" "$n" -o jsonpath="$desired_path") | ||
| ready=$(kubectl -n "${namespace}" get "$kind" "$n" -o jsonpath="$ready_path") | ||
| [ -z "$ready" ] && ready=0 | ||
| [ -z "$desired" ] && desired=0 | ||
| echo "$kind ${namespace}/$n: ready=$ready desired=$desired" | ||
| if [ "$ready" != "$desired" ] || [ "$desired" = "0" ]; then | ||
| fail "$kind ${namespace}/$n not fully ready ($ready/$desired)" | ||
| fi | ||
| done | ||
| } | ||
|
|
||
| # check_jobs [NAME] | ||
| # Healthy = succeeded >= completions (done) OR active>0 without Failed condition | ||
| # (still running). Otherwise (Failed=True, or stalled with active=0 and not | ||
| # enough successes) fail. | ||
| check_jobs() { | ||
| local name=$1 | ||
| local n names completions succeeded active failed_cond | ||
| if [ -n "$name" ]; then | ||
| names="$name" | ||
| else | ||
| names=$(kubectl -n "${namespace}" get jobs -o jsonpath='{.items[*].metadata.name}') | ||
| fi | ||
| for n in $names; do | ||
| completions=$(kubectl -n "${namespace}" get job "$n" -o jsonpath='{.spec.completions}') | ||
| succeeded=$(kubectl -n "${namespace}" get job "$n" -o jsonpath='{.status.succeeded}') | ||
| active=$(kubectl -n "${namespace}" get job "$n" -o jsonpath='{.status.active}') | ||
| failed_cond=$(kubectl -n "${namespace}" get job "$n" \\ | ||
| -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}') | ||
| [ -z "$completions" ] && completions=1 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid silent failures: not being able to get completions and other fields should be treated as a failure.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done — added an existence probe (kubectl get ... -o name) at the top of check_controllers and check_jobs. If the resource can't be read (missing / API error / permission denied), it now fails loudly and skips that item instead of silently defaulting fields to 0.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think the existence of the workload directly implies that these fields exist. I was thinking more of the case where a field like completions doesn't exist: just print an error and exit. The result is not usable when you fall back to a default anyway. |
||
| [ -z "$succeeded" ] && succeeded=0 | ||
| [ -z "$active" ] && active=0 | ||
| echo "job ${namespace}/$n: succeeded=$succeeded/$completions active=$active failed=$failed_cond" | ||
| if [ "$failed_cond" = "True" ]; then | ||
| fail "job ${namespace}/$n failed" | ||
| elif [ "$succeeded" -lt "$completions" ] && [ "$active" = "0" ]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So if I want 3 completions, 2 succeeded and 1 is active, it will count as succeed?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this step is meant to be usable in two scenarios: (1) right after a Job is created, to confirm it has come up and is making progress (running + no failed pods); (2) after waiting for the Job to finish, to confirm everything succeeded. As long as the Job has either succeeded or is still running (with no Failed condition), it is considered healthy.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then what happens when you want to check scenario 2, but the Job actually meets the scenario 1 criteria? It's cleaner to just split them into two functions and make each function's name clear: if you check for succeeded or active, name it checkJobSucceedOrActive. Health means different things in different scenarios.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These fields are marked as omitempty in the JSON, so it’s not straightforward to trigger a clear (loud) failure. Also, the job validation logic has been moved to another file and will be included in the next PR. |
||
| fail "job ${namespace}/$n stalled (succeeded=$succeeded/$completions, no active pods)" | ||
| fi | ||
| done | ||
| } | ||
|
|
||
| # check_pod POD | ||
| # Checks a single pod in the target namespace: its phase, then (for Running pods) the pod-level Ready condition and per-container readiness, and finally its logs for the caller-supplied error patterns. | ||
| # Every problem is recorded through fail; the function returns early on a status failure and never exits the script itself. | ||
| check_pod() { | ||
| local pod=$1 | ||
| echo "::: checking pod ${namespace}/$pod" | ||
| local phase ready_cond not_ready log_output matches | ||
| # stderr is discarded here, so a failed lookup typically leaves phase empty and lands in the catch-all branch below. | ||
| phase=$(kubectl -n "${namespace}" get pod "$pod" -o jsonpath='{.status.phase}' 2>/dev/null) | ||
| case "$phase" in | ||
| Succeeded) | ||
| # Job pod that finished cleanly; skip readiness checks, still scan logs. | ||
| ;; | ||
| Running) | ||
| # The pod-level Ready condition must be True; otherwise record the failure and skip the remaining checks for this pod. | ||
| ready_cond=$(kubectl -n "${namespace}" get pod "$pod" \\ | ||
| -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') | ||
| if [ "$ready_cond" != "True" ]; then | ||
| fail "pod ${namespace}/$pod Ready condition=$ready_cond (expected True)" | ||
| return | ||
| fi | ||
| # Collect the names of containers whose ready flag is false; any name found is a failure. | ||
| not_ready=$(kubectl -n "${namespace}" get pod "$pod" \\ | ||
| -o jsonpath='{range .status.containerStatuses[?(@.ready==false)]}{.name} {end}') | ||
| if [ -n "$not_ready" ]; then | ||
| fail "pod ${namespace}/$pod has not-ready containers: $not_ready" | ||
| return | ||
| fi | ||
| ;; | ||
| *) | ||
| # Any other phase (including an empty one) is a failure; the logs are not scanned in this case. | ||
| fail "pod ${namespace}/$pod phase=$phase (expected Running or Succeeded)" | ||
| return | ||
| ;; | ||
| esac | ||
| # Fetch logs from every container; stderr is merged into the captured output, so kubectl error text is scanned as well. | ||
| log_output=$(kubectl -n "${namespace}" logs "$pod" --all-containers=true --prefix=true 2>&1) | ||
| # grep flags: -w whole-word match, -E extended regex, -i case-insensitive. | ||
| matches=$(printf '%s\\n' "$log_output" | grep -wEi '${errorPatterns}') | ||
| if [ -n "$matches" ]; then | ||
| echo "--- error lines in ${namespace}/$pod ---" | ||
| printf '%s\\n' "$matches" | ||
| echo "--- end ---" | ||
| fail "pod ${namespace}/$pod logs contain error pattern" | ||
| fi | ||
| } | ||
|
|
||
| # 1. Resolve target controllers and pod set. | ||
| case "${kind}" in | ||
|
wonderyl marked this conversation as resolved.
|
||
| "*") | ||
| check_controllers deployment '{.spec.replicas}' '{.status.readyReplicas}' | ||
| check_controllers statefulset '{.spec.replicas}' '{.status.readyReplicas}' | ||
| check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}' | ||
| check_jobs "" | ||
| pods=$(kubectl -n "${namespace}" get pods -o jsonpath='{.items[*].metadata.name}') | ||
| ;; | ||
| pod) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just thinking out loud, would anyone actually check for the health of a particular pod?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good question — generally we don’t focus on the health of a specific pod unless it represents a critical or singleton component. However, there are edge cases—e.g., pods deployed via custom binaries or standalone images—where they’re not tightly managed by controllers, and in those cases checking individual pod health may still be necessary.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this is only a hypothetical use case, I suggest removing it for now and adding it back when someone really has a use case for it. |
||
| pods="${name}" | ||
| ;; | ||
| deployment|statefulset|daemonset|job) | ||
| case "${kind}" in | ||
| daemonset) | ||
| check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}' "${name}" ;; | ||
| deployment|statefulset) | ||
| check_controllers "${kind}" '{.spec.replicas}' '{.status.readyReplicas}' "${name}" ;; | ||
| job) | ||
| check_jobs "${name}" ;; | ||
| esac | ||
| # Resolve the controller's pods via its matchLabels. | ||
| selector=$(kubectl -n "${namespace}" get "${kind}" "${name}" \\ | ||
| -o go-template='{{range $k,$v := .spec.selector.matchLabels}}{{$k}}={{$v}},{{end}}' \\ | ||
| | sed 's/,$//') | ||
| if [ -z "$selector" ]; then | ||
| fail "${kind} ${namespace}/${name} has no matchLabels selector (matchExpressions not supported)" | ||
| pods="" | ||
| else | ||
| pods=$(kubectl -n "${namespace}" get pods -l "$selector" -o jsonpath='{.items[*].metadata.name}') | ||
| fi | ||
| ;; | ||
| esac | ||
|
|
||
| if [ -z "$pods" ]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. $pods is assigned in multiple places and used here. Having a variable that spans tens of lines of code makes it harder to read. If you use the approach I suggested in https://github.com/Azure/telescope/pull/1234/changes#r3488891510, you can avoid this. |
||
| fail "no pods found for ${desc}" | ||
| fi | ||
|
|
||
| # 2. For each pod, check status and scan full logs. | ||
| for pod in $pods; do | ||
| check_pod "$pod" | ||
| done | ||
|
|
||
| if [ "$failures" -gt 0 ]; then | ||
| echo "Validation failed with $failures error(s) for ${desc}" | ||
| exit 1 | ||
| fi | ||
| echo "Validation passed for ${desc}" | ||
| """ | ||
| azure.AzCli(serviceConnection, title, script, continueOnError=continueOnError) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When you read the log and try to find the exact failure, you will search for "FAIL:", but this string is too simple; other error strings may match the same pattern. I suggest adding a few more words to make it less likely to collide with others, e.g. "FAILED WORKLOAD CHECK:".