Skip to content
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 179 additions & 0 deletions kcl/lib/steps/k8s/validate_workload.k
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import azure_pipelines.ap.steps
import lib.steps.azure

# ValidateWorkloadHealth validates that a Kubernetes workload (Deployment,
# StatefulSet, DaemonSet, Job, or Pod) is healthy. It first checks the
# resource/pod status, then scans the full container logs for error patterns.
# Requires kubectl on the agent and a kubeconfig already in place (e.g. via
# GetCredentials).
#
# For Jobs, healthy means succeeded >= completions (all done) OR active>0
# without a Failed condition (still in progress). Anything else fails.
#
# kind="*" is a wildcard: enumerate every Deployment/StatefulSet/DaemonSet/Job
# in `namespace` and check each, then check every pod. Use it for namespaces
# you own; avoid for system namespaces.
ValidateWorkloadHealth = lambda \
serviceConnection: str, \
kind: "deployment" | "statefulset" | "daemonset" | "job" | "pod" | "*", \
name: str = "", \
namespace: str = "default", \
errorPatterns: str = "panic|fatal", \
displayName: str = "", \
continueOnError: bool = Undefined \
-> steps.Step {
assert (kind == "*" and name == "") or (kind != "*" and name != ""), \
"name must be empty when kind=='*' and non-empty otherwise (got kind='${kind}', name='${name}')"
assert "'" not in errorPatterns, \
"errorPatterns must not contain single quotes (got ${errorPatterns})"

defaultTitle = "Validate all workloads in ${namespace}" if kind == "*" else "Validate ${kind}/${name}"
title = displayName if displayName else defaultTitle
desc = "all workloads in ${namespace}" if kind == "*" else "${kind} ${namespace}/${name}"
script = """
set +e

failures=0
# fail MESSAGE...
# Prints the message words, joined by spaces, after a "FAIL: " prefix and adds
# one to the script-wide failures counter.
fail() {
  echo "FAIL: $*"
  failures=$((failures + 1))
}

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When you read the log and try to find the exact failure, you will search for "FAIL:", but this string is too simple; other error strings may have the same pattern. I suggest adding a few more words to make it less likely to collide with others, e.g. "FAILED WORKLOAD CHECK:".


# check_controllers KIND DESIRED_PATH READY_PATH [NAME]
# Compares the ready count with the desired count for controllers of KIND in
# the target namespace and records a failure (via fail) for every controller
# that is not fully ready.
#   KIND          resource kind handed to kubectl get
#   DESIRED_PATH  jsonpath expression that yields the desired count
#   READY_PATH    jsonpath expression that yields the ready count
#   NAME          optional; if given, check just that one, otherwise check
#                 every instance of KIND
# A kubectl call that exits non-zero is recorded as a failure rather than
# being treated as an empty result, so API or permission errors cannot make
# the check pass silently.
check_controllers() {
  local kind=$1 desired_path=$2 ready_path=$3 name=$4
  local n desired ready names
  if [ -n "$name" ]; then
    names="$name"
  elif ! names=$(kubectl -n "${namespace}" get "$kind" -o jsonpath='{.items[*].metadata.name}'); then
    fail "could not list $kind resources in ${namespace}"
    return
  fi
  # Left unquoted on purpose: the space-separated name list is split into words.
  for n in $names; do
    if ! desired=$(kubectl -n "${namespace}" get "$kind" "$n" -o jsonpath="$desired_path"); then
      fail "$kind ${namespace}/$n could not be read (desired count)"
      continue
    fi
    if ! ready=$(kubectl -n "${namespace}" get "$kind" "$n" -o jsonpath="$ready_path"); then
      fail "$kind ${namespace}/$n could not be read (ready count)"
      continue
    fi
    # A successful read that prints nothing means the field is absent; count it as 0.
    [ -z "$ready" ] && ready=0
    [ -z "$desired" ] && desired=0
    echo "$kind ${namespace}/$n: ready=$ready desired=$desired"
    # A desired count of 0 is reported as a failure as well as a ready/desired mismatch.
    if [ "$ready" != "$desired" ] || [ "$desired" = "0" ]; then
      fail "$kind ${namespace}/$n not fully ready ($ready/$desired)"
    fi
  done
}

# check_jobs [NAME]
# Healthy = succeeded >= completions (done) OR active>0 without Failed condition
# (still running). Otherwise (Failed=True, or stalled with active=0 and not
# enough successes) fail.
# If NAME is given, only that job is inspected; otherwise every job name
# returned by the list call for the target namespace is inspected.
check_jobs() {
local name=$1
local n names completions succeeded active failed_cond
if [ -n "$name" ]; then
names="$name"
else
# The exit status of this list call is not checked: if it prints nothing,
# names is empty and the loop below runs zero times.
names=$(kubectl -n "${namespace}" get jobs -o jsonpath='{.items[*].metadata.name}')
fi
# Left unquoted on purpose: the space-separated name list is split into words.
for n in $names; do
# Each field is read with its own kubectl call; none of the exit statuses is checked.
completions=$(kubectl -n "${namespace}" get job "$n" -o jsonpath='{.spec.completions}')
succeeded=$(kubectl -n "${namespace}" get job "$n" -o jsonpath='{.status.succeeded}')
active=$(kubectl -n "${namespace}" get job "$n" -o jsonpath='{.status.active}')
# Status of the condition whose type is "Failed"; empty when no such condition is printed.
failed_cond=$(kubectl -n "${namespace}" get job "$n" \\
-o jsonpath='{.status.conditions[?(@.type=="Failed")].status}')
# An empty completions value is treated as 1.
[ -z "$completions" ] && completions=1

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid silent failures: not being able to get completions and other fields should be considered a failure.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — added an existence probe (kubectl get ... -o name) at the top of check_controllers and check_jobs. If the resource can't be read (missing / API error / permission denied), it now fails loudly and skips that item instead of silently defaulting fields to 0.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the existence of the workload directly implies that these fields exist. I was thinking more of the case where a field like completions doesn't exist: just print an error and exit. The result is not usable when you fall back to a default anyway.

[ -z "$succeeded" ] && succeeded=0
[ -z "$active" ] && active=0
echo "job ${namespace}/$n: succeeded=$succeeded/$completions active=$active failed=$failed_cond"
if [ "$failed_cond" = "True" ]; then
fail "job ${namespace}/$n failed"
elif [ "$succeeded" -lt "$completions" ] && [ "$active" = "0" ]; then

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if I want 3 completions, 2 succeeded and 1 is active, it will count as succeed?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this step is meant to be usable in two scenarios: (1) right after a Job is created, to confirm it has come up and is making progress (running + no failed pods); (2) after waiting for the Job to finish, to confirm everything succeeded. As long as the Job has either succeeded or is still running (with no Failed condition), it is treated as healthy.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then what happens when you want to check scenario 2, but the Job actually meets the scenario 1 criteria? It's cleaner to split them into two functions and make the function names clear: if you check succeeded and active, name it checkJobSucceedOrActive. Health means different things in different scenarios.
This also prompts me to ask: do you need a generic check-workload-health function at all, since you need to use the checks separately? One is for checking cl2 drivers, which are jobs; the other is for checking kube-system workloads, which don't have jobs at all.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These fields are marked as omitempty in the JSON, so it’s not straightforward to trigger a clear (loud) failure. Also, the job validation logic has been moved to another file and will be included in the next PR.

fail "job ${namespace}/$n stalled (succeeded=$succeeded/$completions, no active pods)"
fi
done
}

# check_pod POD
# Validates a single pod in the target namespace and records every problem
# found via fail:
#   1. The pod phase must be Running or Succeeded.
#   2. A Running pod must have Ready condition True and no not-ready containers.
#   3. The logs of all containers must be readable and must not contain a line
#      matching the configured error patterns (whole-word, case-insensitive,
#      extended regex).
# Returns early, without scanning logs, when the status checks fail.
check_pod() {
  local pod=$1
  echo "::: checking pod ${namespace}/$pod"
  local phase ready_cond not_ready log_output log_status matches
  # stderr is discarded here; a failed read shows up as an empty phase below.
  phase=$(kubectl -n "${namespace}" get pod "$pod" -o jsonpath='{.status.phase}' 2>/dev/null)
  case "$phase" in
    "")
      # No phase at all means the pod could not be read; say so explicitly
      # instead of reporting an empty phase value.
      fail "pod ${namespace}/$pod could not be read (no phase returned)"
      return
      ;;
    Succeeded)
      # Job pod that finished cleanly; skip readiness checks, still scan logs.
      ;;
    Running)
      ready_cond=$(kubectl -n "${namespace}" get pod "$pod" \\
        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}')
      if [ "$ready_cond" != "True" ]; then
        fail "pod ${namespace}/$pod Ready condition=$ready_cond (expected True)"
        return
      fi
      not_ready=$(kubectl -n "${namespace}" get pod "$pod" \\
        -o jsonpath='{range .status.containerStatuses[?(@.ready==false)]}{.name} {end}')
      if [ -n "$not_ready" ]; then
        fail "pod ${namespace}/$pod has not-ready containers: $not_ready"
        return
      fi
      ;;
    *)
      fail "pod ${namespace}/$pod phase=$phase (expected Running or Succeeded)"
      return
      ;;
  esac
  # stderr is merged into the captured text, so the exit status is the only
  # reliable sign that the logs could not be fetched; record that as a failure
  # instead of scanning an error message as if it were log content.
  log_output=$(kubectl -n "${namespace}" logs "$pod" --all-containers=true --prefix=true 2>&1)
  log_status=$?
  if [ "$log_status" -ne 0 ]; then
    fail "pod ${namespace}/$pod logs could not be read (kubectl exit $log_status)"
  fi
  # NOTE(review): --prefix=true prepends the log source to every line, so a
  # pattern that matches the pod or container name would match every line;
  # confirm this is acceptable for the patterns in use.
  # -e keeps a pattern that starts with "-" from being parsed as a grep option.
  matches=$(printf '%s\\n' "$log_output" | grep -wEi -e '${errorPatterns}')
  if [ -n "$matches" ]; then
    echo "--- error lines in ${namespace}/$pod ---"
    printf '%s\\n' "$matches"
    echo "--- end ---"
    fail "pod ${namespace}/$pod logs contain error pattern"
  fi
}

# 1. Resolve target controllers and pod set.
case "${kind}" in
Comment thread
wonderyl marked this conversation as resolved.
"*")
check_controllers deployment '{.spec.replicas}' '{.status.readyReplicas}'
check_controllers statefulset '{.spec.replicas}' '{.status.readyReplicas}'
check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}'
check_jobs ""
pods=$(kubectl -n "${namespace}" get pods -o jsonpath='{.items[*].metadata.name}')
;;
pod)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just thinking out loud, would anyone actually check for the health of a particular pod?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question — generally we don’t focus on the health of a specific pod unless it represents a critical or singleton component. However, there are edge cases — e.g., pods deployed via custom binaries or standalone images — where pods are not tightly managed by controllers, and in those cases checking individual pod health may still be necessary.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is only a hypothetical use case, I suggest removing it for now and adding it back when someone really has a use case for it.

pods="${name}"
;;
deployment|statefulset|daemonset|job)
case "${kind}" in
daemonset)
check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}' "${name}" ;;
deployment|statefulset)
check_controllers "${kind}" '{.spec.replicas}' '{.status.readyReplicas}' "${name}" ;;
job)
check_jobs "${name}" ;;
esac
# Resolve the controller's pods via its matchLabels.
selector=$(kubectl -n "${namespace}" get "${kind}" "${name}" \\
-o go-template='{{range $k,$v := .spec.selector.matchLabels}}{{$k}}={{$v}},{{end}}' \\
| sed 's/,$//')
if [ -z "$selector" ]; then
fail "${kind} ${namespace}/${name} has no matchLabels selector (matchExpressions not supported)"
pods=""
else
pods=$(kubectl -n "${namespace}" get pods -l "$selector" -o jsonpath='{.items[*].metadata.name}')
fi
;;
esac

if [ -z "$pods" ]; then

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

$pods is assigned in multiple places and used here. Having a local variable that spans tens of lines of code makes it harder to read. If you use the approach I suggested in https://github.com/Azure/telescope/pull/1234/changes#r3488891510, you can avoid this.

fail "no pods found for ${desc}"
fi

# 2. Run the status check and the full log scan on every resolved pod.
for pod in $pods; do check_pod "$pod"; done

# Report the overall verdict; any recorded failure makes the script exit 1.
if ! [ "$failures" -gt 0 ]; then
  echo "Validation passed for ${desc}"
else
  echo "Validation failed with $failures error(s) for ${desc}"
  exit 1
fi
"""
azure.AzCli(serviceConnection, title, script, continueOnError=continueOnError)
}