Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions kcl/lib/steps/k8s/validate_workload.k
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import azure_pipelines.ap.steps
import lib.steps.azure

# ValidateWorkloadHealth validates that a Kubernetes controller workload
# (Deployment, StatefulSet, or DaemonSet) is healthy. It first checks the
# resource status, then scans the full container logs of its pods for error
# patterns. Requires kubectl on the agent and a kubeconfig already in place
# (e.g. via GetCredentials).
#
# Job is intentionally not handled here -- use ValidateJobRunning /
# ValidateJobCompleted in validate_job.k instead.
#
# kind="*" is a wildcard: enumerate every Deployment/StatefulSet/DaemonSet in
# `namespace` and check each, then check every pod. Use it for namespaces you
# own; avoid for system namespaces.
ValidateWorkloadHealth = lambda \
serviceConnection: str, \
kind: "deployment" | "statefulset" | "daemonset" | "*", \
name: str = "", \
namespace: str = "default", \
errorPatterns: str = "panic|fatal", \
displayName: str = "", \
continueOnError: bool = Undefined \
-> steps.Step {
# kind and name must agree: the wildcard kind "*" takes no name, while a
# concrete kind requires one.
assert (kind == "*" and name == "") or (kind != "*" and name != ""), \
"name must be empty when kind=='*' and non-empty otherwise (got kind='${kind}', name='${name}')"
# errorPatterns is placed inside a single-quoted grep argument in the script
# below, so a single quote in it would end that quoting early.
assert "'" not in errorPatterns, \
"errorPatterns must not contain single quotes (got ${errorPatterns})"

# Step title: an explicit displayName wins; otherwise it is derived from the
# target (whole namespace for the wildcard, kind/name for a single workload).
defaultTitle = "Validate all workloads in ${namespace}" if kind == "*" else "Validate ${kind}/${name}"
title = displayName if displayName else defaultTitle
# Human-readable description of the target, used in the script's messages.
desc = "all workloads in ${namespace}" if kind == "*" else "${kind} ${namespace}/${name}"
script = """
# Do not abort on the first failing command: failures are counted as they
# occur and reported together at the end of the script.
set +e

# Number of failed checks recorded so far.
failures=0

# fail MESSAGE...
# Print MESSAGE behind the failure prefix and count one more failed check.
fail() {
  echo "FAILED WORKLOAD CHECK: $*"
  failures=$((failures + 1))
}

# check_controllers KIND DESIRED_PATH READY_PATH [NAME]
# Compare the ready count with the desired count for controllers of KIND.
#   KIND          resource kind handed to kubectl (e.g. deployment)
#   DESIRED_PATH  jsonpath expression that yields the desired count
#   READY_PATH    jsonpath expression that yields the ready count
#   NAME          optional; check only this object. When omitted, every
#                 instance of KIND in the namespace is checked.
# Every problem is recorded through fail; the function itself never aborts.
check_controllers() {
  local kind=$1 desired_path=$2 ready_path=$3 name=$4
  local n desired ready names
  if [ -n "$name" ]; then
    names="$name"
  elif ! names=$(kubectl -n "${namespace}" get "$kind" -o jsonpath='{.items[*].metadata.name}'); then
    # A failed listing must not look like an empty namespace: without this
    # guard the loop below would be skipped and the check would pass.
    fail "$kind in ${namespace}: cannot list resources (kubectl error)"
    return
  fi
  for n in $names; do
    # Probe existence first so a missing object / API error fails loudly
    # instead of silently defaulting desired/ready to 0.
    if ! kubectl -n "${namespace}" get "$kind" "$n" -o name >/dev/null 2>&1; then
      fail "$kind ${namespace}/$n: cannot read resource (missing or kubectl error)"
      continue
    fi
    desired=$(kubectl -n "${namespace}" get "$kind" "$n" -o jsonpath="$desired_path")
    ready=$(kubectl -n "${namespace}" get "$kind" "$n" -o jsonpath="$ready_path")
    # An empty jsonpath result is treated as a count of zero.
    [ -z "$ready" ] && ready=0
    [ -z "$desired" ] && desired=0
    echo "$kind ${namespace}/$n: ready=$ready desired=$desired"
    # A desired count of zero is also reported as a failure.
    if [ "$ready" != "$desired" ] || [ "$desired" = "0" ]; then
      fail "$kind ${namespace}/$n not fully ready ($ready/$desired)"
    fi
  done
}

# check_pod POD
# Validate a single pod, then scan its logs for the configured error patterns.
#   - phase Succeeded: readiness is not checked; logs are still scanned.
#   - phase Running:   the Ready condition must be True and no container may
#                      report ready=false.
#   - any other phase (including an unreadable pod): recorded as a failure.
# Every problem is recorded through fail.
check_pod() {
  local pod=$1
  echo "::: checking pod ${namespace}/$pod"
  local phase ready_cond not_ready log_output matches
  local ready_query='{.status.conditions[?(@.type=="Ready")].status}'
  local unready_query='{range .status.containerStatuses[?(@.ready==false)]}{.name} {end}'
  phase=$(kubectl -n "${namespace}" get pod "$pod" -o jsonpath='{.status.phase}' 2>/dev/null)
  case "$phase" in
    Succeeded)
      # Job pod that finished cleanly; skip readiness checks, still scan logs.
      ;;
    Running)
      ready_cond=$(kubectl -n "${namespace}" get pod "$pod" -o jsonpath="$ready_query")
      if [ "$ready_cond" != "True" ]; then
        fail "pod ${namespace}/$pod Ready condition=$ready_cond (expected True)"
        return
      fi
      not_ready=$(kubectl -n "${namespace}" get pod "$pod" -o jsonpath="$unready_query")
      if [ -n "$not_ready" ]; then
        fail "pod ${namespace}/$pod has not-ready containers: $not_ready"
        return
      fi
      ;;
    *)
      fail "pod ${namespace}/$pod phase=$phase (expected Running or Succeeded)"
      return
      ;;
  esac
  # If the logs cannot be fetched the pod was not scanned at all, so report
  # that instead of treating the kubectl error text as clean log output.
  if ! log_output=$(kubectl -n "${namespace}" logs "$pod" --all-containers=true --prefix=true 2>&1); then
    printf '%s\\n' "$log_output"
    fail "pod ${namespace}/$pod: cannot read logs (kubectl error)"
    return
  fi
  # -e keeps a pattern that starts with a dash from being parsed as an option.
  matches=$(printf '%s\\n' "$log_output" | grep -wEi -e '${errorPatterns}')
  if [ -n "$matches" ]; then
    echo "--- error lines in ${namespace}/$pod ---"
    printf '%s\\n' "$matches"
    echo "--- end ---"
    fail "pod ${namespace}/$pod logs contain error pattern"
  fi
}

# check_pods POD_LIST
# Run check_pod on every entry of a whitespace-separated list of pod names.
# An empty list is itself recorded as a failure.
check_pods() {
  local pod_list=$1
  local current
  if [ -n "$pod_list" ]; then
    # Unquoted on purpose: the list is split on whitespace into pod names.
    for current in $pod_list; do
      check_pod "$current"
    done
    return
  fi
  fail "no pods found for ${desc}"
}

# check_controller_pods KIND NAME
# Build a label selector from the controller's spec.selector.matchLabels and
# scan every pod in the namespace that matches it. A controller without
# matchLabels is recorded as a failure.
check_controller_pods() {
  local kind=$1 name=$2
  local selector pod_names
  # Render the labels as k=v, pairs; sed then drops the trailing comma.
  selector=$(
    kubectl -n "${namespace}" get "$kind" "$name" -o go-template='{{range $k,$v := .spec.selector.matchLabels}}{{$k}}={{$v}},{{end}}' |
      sed 's/,$//'
  )
  if [ -n "$selector" ]; then
    pod_names=$(kubectl -n "${namespace}" get pods -l "$selector" -o jsonpath='{.items[*].metadata.name}')
    check_pods "$pod_names"
    return
  fi
  fail "$kind ${namespace}/$name has no matchLabels selector (matchExpressions not supported)"
}

case "${kind}" in

@wonderyl wonderyl Jun 29, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be clearer to write it the following way. You used a double `case` to reuse the same piece of code, but you can achieve the same with a function.

case "${kind}" in
  "*")
    check_controllers deployment  '{.spec.replicas}'                 '{.status.readyReplicas}'
    check_controllers statefulset '{.spec.replicas}'                 '{.status.readyReplicas}'
    check_controllers daemonset   '{.status.desiredNumberScheduled}' '{.status.numberReady}'
    check_jobs ""
    pods=$(kubectl -n "${namespace}" get pods -o jsonpath='{.items[*].metadata.name}')
    ;;
  pod)
    check_pods "${name}"
    ;;
  deployment|statefulset)
    check_controllers "${kind}" '{.spec.replicas}' '{.status.readyReplicas}' "${name}" 
    check_controller_pods "${kind}" "${name}"
    ;;
  daemonset)
    check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}' "${name}"
    check_controller_pods "${kind}" "${name}"
    ;;
  job)
    check_jobs "${name}" 
    check_controller_pods "${kind}" "${name}"
    ;;
esac

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done — refactored as suggested. Each case branch now calls check_pods / check_controller_pods directly, and $pods no longer crosses branches.

"*")
# Wildcard: check every Deployment, StatefulSet and DaemonSet in the
# namespace, then every pod in the namespace.
check_controllers deployment '{.spec.replicas}' '{.status.readyReplicas}'
check_controllers statefulset '{.spec.replicas}' '{.status.readyReplicas}'
check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}'
check_pods "$(kubectl -n "${namespace}" get pods -o jsonpath='{.items[*].metadata.name}')"
;;
deployment|statefulset)
# Single replica-based controller: desired count comes from spec.replicas,
# ready count from status.readyReplicas; then scan the pods it selects.
check_controllers "${kind}" '{.spec.replicas}' '{.status.readyReplicas}' "${name}"
check_controller_pods "${kind}" "${name}"
;;
daemonset)
# Single DaemonSet: counts come from status.desiredNumberScheduled and
# status.numberReady; then scan the pods it selects.
check_controllers daemonset '{.status.desiredNumberScheduled}' '{.status.numberReady}' "${name}"
check_controller_pods "${kind}" "${name}"
;;
esac

# Final verdict: exit non-zero when at least one check recorded a failure,
# otherwise report success for the validated target.
if [ "$failures" -gt 0 ]; then
echo "Validation failed with $failures error(s) for ${desc}"
exit 1
fi
echo "Validation passed for ${desc}"
"""
azure.AzCli(serviceConnection, title, script, continueOnError=continueOnError)
}