diff --git a/.claude/skills/generate-yaml/SKILL.md b/.claude/skills/generate-yaml/SKILL.md index 06c7f9c2d4..c18ef6071c 100644 --- a/.claude/skills/generate-yaml/SKILL.md +++ b/.claude/skills/generate-yaml/SKILL.md @@ -5,6 +5,24 @@ description: Generate pipeline YAML from KCL source # Generate pipeline YAML from KCL source +## Validate scenarios first + +Before generating, run the cross-scenario validation gate. It catches the one +thing KCL's compiler cannot see: **path-string references** between scenarios +(one scenario's `cl2Manifest` / `kwokNodeManifest` pointing into another +scenario's directory) that form a loop. KCL already rejects circular *imports* +on its own, so this gate deliberately does not re-check those. + +```bash +python3 scripts/validate_scenarios.py +``` + +On success it prints `validate_scenarios: OK ...` and exits 0. If it fails +(exit 1), it prints the offending cycle (`a -> b -> a`); fix the path-string +references before continuing. + +## Generate + Given a pipeline KCL file (e.g. `path/to/pipeline.k`), determine its directory and run: ```bash @@ -18,6 +36,17 @@ The output YAML file is written alongside the KCL source file in the same direct kcl run kcl/example_pipeline/pipeline.k -S output -o kcl/example_pipeline/pipeline.yaml ``` +### If `kcl run` reports a circular import + +KCL fails fast with `error[E1001] RecursiveLoad` / `circular reference between +modules ...` when scenarios import each other in a loop. To reframe that raw +diagnostic in scenario terms for the author, pass the captured stderr through: + +```python +from scripts.validate_scenarios import format_kcl_cycle_error +print(format_kcl_cycle_error(stderr)) # None if stderr is not a RecursiveLoad +``` + ## Split if oversized Azure DevOps enforces a 2 MB limit on a single pipeline YAML file. 
After generating the YAML, check its size: diff --git a/kcl/example_pipeline/pipeline.k b/kcl/example_pipeline/pipeline.k index 1f760a30d1..ef71be8e76 100644 --- a/kcl/example_pipeline/pipeline.k +++ b/kcl/example_pipeline/pipeline.k @@ -1,22 +1,11 @@ -import azure_pipelines.ap -import azure_pipelines.ap.jobs.job -import lib.const +import lib.scenario import lib.steps.azure -import lib.steps.common -import lib.steps.k8s -import lib.util -SUBSCRIPTION_ID = const.DEFAULT_SUBSCRIPTION_ID -RESOURCE_GROUP = "$(RUN_ID)" LOCATION = "westus2" CLUSTER = "stg-H2-yaolei" -CL2_POOL = "cl2pool" -CL2_TAINT_PREFIX = "cl2pool" -CL2_NAMESPACE = "clusterloader2" -KWOK_NODE_COUNT = 100 # SkipAKSCluster disables Gatekeeper on ephemeral clusters -requestBody = util.escapeStr(""" +requestBody = """ { "location": "${LOCATION}", "identity": { "type": "SystemAssigned" }, @@ -57,111 +46,31 @@ requestBody = util.escapeStr(""" } } } - }""") - -createClusterScript = """ -az cloud update --endpoint-resource-manager https://eastus2euap.management.azure.com/ -az rest \\ - --method put \\ - --uri "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP}/providers/Microsoft.ContainerService/managedClusters/${CLUSTER}?api-version=2026-01-02-preview" \\ - --body "${requestBody}" -""" + }""" cl2ResultJson = """{ "cl2_status": "\${STATUS}", "region": "${LOCATION}" }""" -output = ap.Pipeline { +output = scenario.build(scenario.Cl2Benchmark { name = "Example Pipeline" + location = LOCATION + cluster = CLUSTER + requestBody = requestBody - trigger = ["v2"] - pool = const.DEFAULT_POOL - - jobs = [ - job.Job { - job = "benchmarking" - displayName = "Benchmarking" - timeoutInMinutes = 1440 + nodePool = azure.NodePool { + name = "cl2pool" + sku = "Standard_D8S_v4" + count = 4 + taintPrefix = "cl2pool" + } - steps = [ - common.SetRunId(), - common.InstallPythonDependencies(), - azure.Login( - const.DEFAULT_SERVICE_CONNECTION, - SUBSCRIPTION_ID, - LOCATION - ), - azure.CreateResourceGroup( - 
const.DEFAULT_SERVICE_CONNECTION, - RESOURCE_GROUP, - LOCATION, - SUBSCRIPTION_ID - ), - azure.AzCli( - const.DEFAULT_SERVICE_CONNECTION, - "Create cluster", - createClusterScript), - azure.WaitForClusterSucceeded( - const.DEFAULT_SERVICE_CONNECTION, - CLUSTER, - RESOURCE_GROUP, - SUBSCRIPTION_ID), - azure.CreateNodePool( - const.DEFAULT_SERVICE_CONNECTION, - CLUSTER, - RESOURCE_GROUP, - SUBSCRIPTION_ID, - azure.NodePool { - name = CL2_POOL, - sku = "Standard_D8S_v4", - count = 4, - taintPrefix = CL2_TAINT_PREFIX }), - azure.GetCredentials( - const.DEFAULT_SERVICE_CONNECTION, - CLUSTER, - RESOURCE_GROUP, - SUBSCRIPTION_ID), - *k8s.CreateKwokNodes( - KWOK_NODE_COUNT, - params = { - "node-manifest-path": "$(Pipeline.Workspace)/s/kcl/example_pipeline/kwok-node.yaml" - }), - k8s.RunClusterLoader2( - const.DEFAULT_SERVICE_CONNECTION, - CL2_NAMESPACE, - manifest = "kcl/example_pipeline/cl2.yaml"), - k8s.PrintCl2PodLogs( - const.DEFAULT_SERVICE_CONNECTION, - CL2_NAMESPACE), - azure.AzCli( - const.DEFAULT_SERVICE_CONNECTION, - "Collect cl2 pods status", - """ -TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") -STATUS="succeeded" + kwokNodeCount = 100 + kwokNodeManifest = "$(Pipeline.Workspace)/s/kcl/example_pipeline/kwok-node.yaml" -# Check if any cl2 pod failed -FAILED=$(kubectl get pods --namespace="${CL2_NAMESPACE}" -l job-name=cl2 -o jsonpath='{range .items[*]}{.status.phase}{"\\n"}{end}' | grep -c "^Failed$" || true) -if [ "$FAILED" -gt 0 ]; then - STATUS="failed" -fi + cl2Namespace = "clusterloader2" + cl2Manifest = "kcl/example_pipeline/cl2.yaml" -cat > /tmp/run-result.json << EOF -${util.formatResult(cl2ResultJson)} -EOF -cat /tmp/run-result.json -"""), - common.UploadResult( - const.DEFAULT_SERVICE_CONNECTION, - const.DEFAULT_STORAGE_ACCOUNT, - const.DEFAULT_STORAGE_SUBSCRIPTION_ID, - const.DEFAULT_STORAGE_CONTAINER), - azure.DeleteResourceGroup( - const.DEFAULT_SERVICE_CONNECTION, - RESOURCE_GROUP, - SUBSCRIPTION_ID) - ] - } - ] -} + resultJson = cl2ResultJson +}) diff 
--git a/kcl/example_pipeline/pipeline.yaml b/kcl/example_pipeline/pipeline.yaml index 7beb30fd63..ad061b8a33 100644 --- a/kcl/example_pipeline/pipeline.yaml +++ b/kcl/example_pipeline/pipeline.yaml @@ -39,10 +39,14 @@ jobs: azureSubscription: Azure-for-Telescope-internal scriptType: bash scriptLocation: inlineScript - inlineScript: | + inlineScript: |+ set -exo pipefail - az group create --name "$(RUN_ID)" --location "westus2" --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" + az group create \ + --name "$(RUN_ID)" \ + --location "westus2" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + displayName: Create resource group in westus2 (b8ceb4e5-f05b-4562-a9f5-14acb1f24219) - task: AzureCLI@2 inputs: diff --git a/kcl/kata_benchmark/cl2.k b/kcl/kata_benchmark/cl2.k new file mode 100644 index 0000000000..a828ca8989 --- /dev/null +++ b/kcl/kata_benchmark/cl2.k @@ -0,0 +1,17 @@ +import lib.scenario + +# cl2.k renders the ClusterLoader2 driver manifest for the Kata benchmark. +# +# Enabling AKS Kata Confidential Containers for the measured workload is a +# single field: `runtimeClassName`. The archetype's manifest builder templates +# it into the override ConfigMap as CL2_RUNTIME_CLASS_NAME, which the in-cluster +# CL2 image stamps onto every workload pod it generates. Everything else inherits +# the shared defaults, so this file stays a couple of lines. 
+# +# Generate with: +# kcl run kcl/kata_benchmark/cl2.k -S manifests -o kcl/kata_benchmark/cl2.yaml +manifests = scenario.buildCl2Manifest(scenario.Cl2Manifest { + override = scenario.Cl2Override { + runtimeClassName = "kata-mshv-vm-isolation" + } +}) diff --git a/kcl/kata_benchmark/cl2.yaml b/kcl/kata_benchmark/cl2.yaml new file mode 100755 index 0000000000..024df51077 --- /dev/null +++ b/kcl/kata_benchmark/cl2.yaml @@ -0,0 +1,92 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: cl2-override + namespace: clusterloader2 +data: + override.yaml: |- + NODES: 30 + PODS_PER_NODE: 10 + BIG_GROUP_SIZE: 8 + MEDIUM_GROUP_SIZE: 8 + SMALL_GROUP_SIZE: 4 + SMALL_STATEFUL_SETS_PER_NAMESPACE: 0 + MEDIUM_STATEFUL_SETS_PER_NAMESPACE: 0 + CL2_RATE_LIMIT_POD_CREATION: false + CL2_ENABLE_PVS: false + CL2_ENABLE_CLUSTER_OOMS_TRACKER: false + CL2_DEFAULT_QPS: 5000 + CL2_RUN_ON_ARM_NODES: true # This is a hack to allow Cl2 to run on Kwok nodes + CL2_RUNTIME_CLASS_NAME: kata-mshv-vm-isolation +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: cl2 + namespace: clusterloader2 +spec: + completions: 4 + parallelism: 4 + backoffLimit: 0 + template: + spec: + containers: + - name: cl2 + image: ghcr.io/azure/clusterloader2:v20260220 + args: + - '--provider=aks' + - '--run-from-cluster=true' + - '--v=2' + - '--testoverrides=/override/override.yaml' + - '--testconfig=testing/load/config.yaml' + - '--k8s-clients-number=500' + resources: + requests: + cpu: '6' + memory: '24Gi' + volumeMounts: + - mountPath: /override + name: cl2-override + nodeSelector: + agentpool: cl2pool + restartPolicy: Never + serviceAccountName: cl2 + tolerations: + - effect: NoSchedule + key: cl2pool + operator: Exists + volumes: + - name: cl2-override + configMap: + name: cl2-override +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: cl2 + namespace: clusterloader2 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cl2 +rules: +- apiGroups: + - '*' + resources: + - 
'*' + verbs: + - '*' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cl2 +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cl2 +subjects: +- kind: ServiceAccount + name: cl2 + namespace: clusterloader2 diff --git a/kcl/kata_benchmark/kwok-node.yaml b/kcl/kata_benchmark/kwok-node.yaml new file mode 100644 index 0000000000..3ec36bf79e --- /dev/null +++ b/kcl/kata_benchmark/kwok-node.yaml @@ -0,0 +1,54 @@ +apiVersion: v1 +kind: Node +metadata: + name: {{node_name}} + annotations: + node.alpha.kubernetes.io/ttl: "0" + kwok.x-k8s.io/node: fake + labels: + beta.kubernetes.io/arch: amd64 + beta.kubernetes.io/os: linux + kubernetes.io/arch: amd64 + kubernetes.io/hostname: {{node_name}} + kubernetes.io/os: linux + kubernetes.io/role: agent + node-role.kubernetes.io/agent: "" + type: kwok +spec: + providerID: "kwok://{{node_name}}" + unschedulable: false + taints: # Avoid scheduling actual running pods to fake Node + - effect: NoSchedule + key: kubernetes.io/arch + value: arm64 # This is a hack to allow Cl2 pods to run on Kwok nodes. 
+status: + addresses: + - type: InternalIP + address: {{node_ip}} + allocatable: + cpu: {{node_cpu}} + memory: {{node_memory}} + pods: {{node_pods}} + nvidia.com/gpu: {{node_gpu}} + capacity: + cpu: {{node_cpu}} + memory: {{node_memory}} + pods: {{node_pods}} + nvidia.com/gpu: {{node_gpu}} + conditions: + - type: "Ready" + status: "True" + reason: "KubeletReady" + message: "kubelet is posting ready status" + nodeInfo: + architecture: amd64 + bootID: "" + containerRuntimeVersion: "" + kernelVersion: "" + kubeProxyVersion: fake + kubeletVersion: fake + machineID: "" + operatingSystem: linux + osImage: "" + systemUUID: "" + phase: Running diff --git a/kcl/kata_benchmark/pipeline.k b/kcl/kata_benchmark/pipeline.k new file mode 100644 index 0000000000..31743822ab --- /dev/null +++ b/kcl/kata_benchmark/pipeline.k @@ -0,0 +1,91 @@ +import lib.scenario +import lib.steps.azure + +# Kata Benchmark reuses the Cl2Benchmark archetype. It is intentionally a near +# copy of example_pipeline to show the cost of enabling AKS Kata Confidential +# Containers: only the Kata-relevant fields differ. +# +# 1. nodePool.runtime = "kata" -> CreateNodePool adds +# `--workload-runtime KataMshvAccVmIsolation` +# 2. nodePool.sku bumped to a nested-virtualization-capable SKU +# (Standard_D8S_v5), required by Kata's Hyper-V isolation +# 3. cl2Manifest points at the Kata cl2.yaml (generated by cl2.k), whose +# override sets CL2_RUNTIME_CLASS_NAME = kata-mshv-vm-isolation +# +# Everything else (cluster body, KWOK padding, result schema) is identical to +# example_pipeline. 
+ +LOCATION = "westus2" +CLUSTER = "stg-H2-yaolei" + +# SkipAKSCluster disables Gatekeeper on ephemeral clusters +requestBody = """ +{ + "location": "${LOCATION}", + "identity": { "type": "SystemAssigned" }, + "sku": { + "name": "Base", + "tier": "Standard" + }, + "tags": { + "SkipAKSCluster": "true", + "SkipASMAzSecPackAutoConfig": "true", + "SkipLinuxAzSecPack": "true" + }, + "properties": { + "controlPlaneScalingProfile": {"scalingSize": "H2"}, + "kubernetesVersion": "1.33.0", + "dnsPrefix": "${CLUSTER}-dns", + "agentPoolProfiles": [ + { + "name": "systempool", + "mode": "System", + "count": 30, + "vmSize": "Standard_D8S_v4", + "osType": "Linux", + "maxPods": 250 + } + ], + "networkProfile": { + "networkPlugin": "azure", + "networkPluginMode": "overlay", + "podCidr": "10.64.0.0/10", + "serviceCidr": "10.0.0.0/16", + "dnsServiceIP": "10.0.0.10", + "outboundType": "managedNATGateway", + "natGatewayProfile": { + "managedOutboundIPProfile": { + "count": 10 + } + } + } + } + }""" + +cl2ResultJson = """{ + "cl2_status": "\${STATUS}", + "region": "${LOCATION}" + }""" + +output = scenario.build(scenario.Cl2Benchmark { + name = "Kata Benchmark" + location = LOCATION + cluster = CLUSTER + requestBody = requestBody + + nodePool = azure.NodePool { + name = "cl2pool" + sku = "Standard_D8S_v5" + count = 4 + taintPrefix = "cl2pool" + runtime = "kata" + } + + kwokNodeCount = 100 + kwokNodeManifest = "$(Pipeline.Workspace)/s/kcl/kata_benchmark/kwok-node.yaml" + + cl2Namespace = "clusterloader2" + cl2Manifest = "kcl/kata_benchmark/cl2.yaml" + + resultJson = cl2ResultJson +}) diff --git a/kcl/kata_benchmark/pipeline.yaml b/kcl/kata_benchmark/pipeline.yaml new file mode 100755 index 0000000000..a99c8e7861 --- /dev/null +++ b/kcl/kata_benchmark/pipeline.yaml @@ -0,0 +1,283 @@ +name: Kata Benchmark +pool: AKS-Telescope-Airlock +trigger: +- v2 +jobs: +- job: benchmarking + displayName: Benchmarking + timeoutInMinutes: 1440 + steps: + - bash: |2 + + set -eo pipefail + + 
job_id="$(System.JobId)" + RUN_ID=$(Build.BuildId)-${job_id:0:8} + echo "Run ID: $RUN_ID" + + echo "##vso[task.setvariable variable=RUN_ID]$RUN_ID" + displayName: Set Run ID + - bash: |- + set -exo pipefail + pip3 install --upgrade "pip<24" + pip3 install -r $(Pipeline.Workspace)/s/modules/python/requirements.txt + displayName: Install Python dependencies + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az account set --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" + az config set defaults.location="westus2" + az account show + displayName: Login to Azure + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: |+ + set -exo pipefail + + az group create \ + --name "$(RUN_ID)" \ + --location "westus2" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + + displayName: Create resource group in westus2 (b8ceb4e5-f05b-4562-a9f5-14acb1f24219) + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az cloud update --endpoint-resource-manager https://eastus2euap.management.azure.com/ + az rest \ + --method put \ + --uri "/subscriptions/b8ceb4e5-f05b-4562-a9f5-14acb1f24219/resourceGroups/$(RUN_ID)/providers/Microsoft.ContainerService/managedClusters/stg-H2-yaolei?api-version=2026-01-02-preview" \ + --body " + { + \"location\": \"westus2\", + \"identity\": { \"type\": \"SystemAssigned\" }, + \"sku\": { + \"name\": \"Base\", + \"tier\": \"Standard\" + }, + \"tags\": { + \"SkipAKSCluster\": \"true\", + \"SkipASMAzSecPackAutoConfig\": \"true\", + \"SkipLinuxAzSecPack\": \"true\" + }, + \"properties\": { + \"controlPlaneScalingProfile\": {\"scalingSize\": \"H2\"}, + \"kubernetesVersion\": \"1.33.0\", + \"dnsPrefix\": 
\"stg-H2-yaolei-dns\", + \"agentPoolProfiles\": [ + { + \"name\": \"systempool\", + \"mode\": \"System\", + \"count\": 30, + \"vmSize\": \"Standard_D8S_v4\", + \"osType\": \"Linux\", + \"maxPods\": 250 + } + ], + \"networkProfile\": { + \"networkPlugin\": \"azure\", + \"networkPluginMode\": \"overlay\", + \"podCidr\": \"10.64.0.0/10\", + \"serviceCidr\": \"10.0.0.0/16\", + \"dnsServiceIP\": \"10.0.0.10\", + \"outboundType\": \"managedNATGateway\", + \"natGatewayProfile\": { + \"managedOutboundIPProfile\": { + \"count\": 10 + } + } + } + } + }" + displayName: Create cluster + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + while true; do + STATE=$(az aks show \ + --name "stg-H2-yaolei" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --query "provisioningState" \ + --output tsv) + echo "Cluster provisioning state: $STATE" + if [ "$STATE" = "Succeeded" ]; then + echo "Cluster is ready." + break + elif [ "$STATE" = "Failed" ] || [ "$STATE" = "Canceled" ]; then + echo "Cluster failed with state: $STATE." + exit 1 + else + echo "Provisioning state: $STATE. 
Retry in 30 seconds" + fi + sleep 30 + done + displayName: Wait for cluster to succeed + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az aks nodepool add \ + --cluster-name "stg-H2-yaolei" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --name "cl2pool" \ + --node-count 4 \ + --node-vm-size "Standard_D8S_v5" \ + --mode User \ + --node-taints cl2pool=true:NoSchedule \ + --workload-runtime KataMshvAccVmIsolation + displayName: Create node pool cl2pool + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az aks get-credentials \ + --name "stg-H2-yaolei" \ + --resource-group "$(RUN_ID)" \ + --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" \ + --overwrite-existing + displayName: Get credentials for stg-H2-yaolei + - script: |2- + + set -exo pipefail + export PYTHONPATH=$PYTHONPATH:$(pwd) + python3 kwok/kwok.py --action create --node-count 100 --node-manifest-path $(Pipeline.Workspace)/s/kcl/kata_benchmark/kwok-node.yaml + workingDirectory: modules/python + displayName: Create KWOK Nodes + - script: |2- + + set -exo pipefail + export PYTHONPATH=$PYTHONPATH:$(pwd) + python3 kwok/kwok.py --action validate --node-count 100 --node-manifest-path $(Pipeline.Workspace)/s/kcl/kata_benchmark/kwok-node.yaml + workingDirectory: modules/python + displayName: Validate KWOK Nodes + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + set +eo pipefail + kubectl create namespace "clusterloader2" || true + kubectl apply -f $(Pipeline.Workspace)/s/kcl/kata_benchmark/cl2.yaml + # Wait + consecutive_errors=0 + while true; do + # Under consistent API server errors, we would want to 
exit early instead of waiting for the entire timeout duration. + phases="$(kubectl get pods --namespace="clusterloader2" -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/tmp/kubectl_err)" + kubectl_exit_code=$? + if [ $kubectl_exit_code -ne 0 ]; then + consecutive_errors=$((consecutive_errors + 1)) + echo "kubectl get pods failed (exit_code=$kubectl_exit_code, consecutive=$consecutive_errors/20):" + cat /tmp/kubectl_err + if [ "$consecutive_errors" -ge "20" ]; then + echo "API server unresponsive for $consecutive_errors consecutive polls; aborting." + exit 1 + fi + sleep 30 + continue + fi + consecutive_errors=0 + + total=0 + terminal=0 + + if [ -n "$phases" ]; then + while IFS= read -r phase; do + if [ -z "$phase" ]; then + continue + fi + + total=$((total + 1)) + if [ "$phase" = "Succeeded" ] || [ "$phase" = "Failed" ]; then + terminal=$((terminal + 1)) + fi + done <<< "$phases" + fi + + if [ "$total" -gt 0 ] && [ "$terminal" -eq "$total" ]; then + echo "All cl2 pods reached terminal state (Succeeded or Failed)" + kubectl get pods --namespace="clusterloader2" -l job-name=cl2 -o wide + + break + fi + + sleep 30 + done + displayName: Run cluster loader 2 + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + for pod in $(kubectl get pods --namespace="clusterloader2" -l job-name=cl2 -o jsonpath='{.items[*].metadata.name}'); do + echo "===== Logs for pod: $pod =====" + kubectl logs --namespace="clusterloader2" "$pod" || true + done + displayName: Print cl2 pod logs + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: "set -exo pipefail\n\nTIMESTAMP=$(date -u +\"%Y-%m-%dT%H:%M:%SZ\")\nSTATUS=\"succeeded\"\n\n# Check if any cl2 pod failed\nFAILED=$(kubectl get pods --namespace=\"clusterloader2\" -l job-name=cl2 -o jsonpath='{range 
.items[*]}{.status.phase}{\"\\n\"}{end}' | grep -c \"^Failed$\" || true)\nif [ \"$FAILED\" -gt 0 ]; then\n STATUS=\"failed\"\nfi\n\ncat > /tmp/run-result.json << EOF\n\n{\n \"timestamp\": \"${TIMESTAMP}\",\n \"run_id\": \"$(RUN_ID)\",\n \"run_url\": \"$(System.TeamFoundationCollectionUri)$(System.TeamProject)/_build/results?buildId=$(Build.BuildId)\",\n \"pipeline\": \"$(Build.DefinitionName)\",\n \"result\": {\n \"cl2_status\": \"${STATUS}\", \n \"region\": \"westus2\"\n }\n}\nEOF\ncat /tmp/run-result.json\n" + displayName: Collect cl2 pods status + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az storage blob upload \ + --account-name "telescopev2" \ + --subscription "137f0351-8235-42a6-ac7a-6b46be2d21c7" \ + --container-name "aks" \ + --name "$(RUN_ID)/run-result.json" \ + --file /tmp/run-result.json \ + --auth-mode key + displayName: Upload result to storage account + - task: AzureCLI@2 + inputs: + azureSubscription: Azure-for-Telescope-internal + scriptType: bash + scriptLocation: inlineScript + inlineScript: | + set -exo pipefail + + az group delete --name "$(RUN_ID)" --subscription "b8ceb4e5-f05b-4562-a9f5-14acb1f24219" --yes + condition: always() + displayName: Delete resource group (b8ceb4e5-f05b-4562-a9f5-14acb1f24219) diff --git a/kcl/lib/scenario/cl2_benchmark.k b/kcl/lib/scenario/cl2_benchmark.k new file mode 100644 index 0000000000..71e2c621e5 --- /dev/null +++ b/kcl/lib/scenario/cl2_benchmark.k @@ -0,0 +1,163 @@ +import azure_pipelines.ap +import azure_pipelines.ap.jobs.job +import lib.const +import lib.steps.azure +import lib.steps.common +import lib.steps.k8s +import lib.util + +# Cl2Benchmark is the "ClusterLoader2 on AKS" scenario archetype. 
+# +# It captures the small set of scenario-specific knobs (cluster identity, +# cluster requestBody, node pool, optional KWOK nodes, CL2 manifest, result +# JSON) and the archetype `build` lambda fills in the shared spine that every +# CL2 benchmark repeats: run-id -> python deps -> login -> resource group -> +# create cluster -> wait -> node pool -> credentials -> [kwok] -> CL2 -> +# logs -> collect result -> upload -> delete resource group. +# +# Shared defaults are sourced from lib/const so a new scenario only overrides +# what is genuinely different. `requestBody` (the ARM cluster body) is a +# pass-through escape hatch: authors supply raw JSON and the archetype escapes +# it for inline embedding. +# +# The trailing `check` block rejects KWOK settings that would otherwise compile +# into a broken pipeline step and only fail once the pipeline is running. +schema Cl2Benchmark: + # Pipeline-level + name: str + trigger: [str] = ["v2"] + pool: str = const.DEFAULT_POOL + jobName: str = "benchmarking" + jobDisplayName: str = "Benchmarking" + timeoutInMinutes: int = 1440 + + # Cluster identity + serviceConnection: str = const.DEFAULT_SERVICE_CONNECTION + subscription: str = const.DEFAULT_SUBSCRIPTION_ID + resourceGroup: str = "$(RUN_ID)" + location: str + cluster: str + + # Cluster creation + requestBody: str # raw ARM managedCluster body (escaped by the archetype) + clusterApiVersion: str = "2026-01-02-preview" + useEuapEndpoint: bool = True + + # Workload node pool (the pool CL2 schedules onto) + nodePool: azure.NodePool + + # Optional KWOK fake nodes (0 = skip) + kwokNodeCount: int = 0 + kwokNodeManifest: str = "" + + # ClusterLoader2 + cl2Namespace: str + cl2Manifest: str + + # Result reporting: inner result JSON merged into the Telescope ADX schema + resultJson: str + + # Result storage (defaults to the shared Telescope account) + storageAccount: str = const.DEFAULT_STORAGE_ACCOUNT + storageSubscription: str = const.DEFAULT_STORAGE_SUBSCRIPTION_ID + storageContainer: str = const.DEFAULT_STORAGE_CONTAINER + + # A negative count used to skip the KWOK steps silently, and a positive count + # with the default empty manifest produced an empty node-manifest-path + # argument; both are now compile-time errors. + check: + kwokNodeCount >= 0, "kwokNodeCount must be >= 0 (0 skips the KWOK steps)" + kwokNodeManifest != "" if kwokNodeCount > 0, "kwokNodeManifest is required when kwokNodeCount > 0" + +# build assembles a Cl2Benchmark config into a full azure_pipelines Pipeline. 
+build = lambda cfg: Cl2Benchmark -> ap.Pipeline { + # requestBody is spliced into the double-quoted `--body "..."` shell argument + # below, so it goes through util.escapeStr first. + escaped = util.escapeStr(cfg.requestBody) + + # Optional leading line that repoints `az` at the eastus2euap resource-manager + # endpoint; empty string when useEuapEndpoint is False. + euap = "az cloud update --endpoint-resource-manager https://eastus2euap.management.azure.com/\n" if cfg.useEuapEndpoint else "" + + createClusterScript = "\n" + euap + """az rest \\ + --method put \\ + --uri "/subscriptions/${cfg.subscription}/resourceGroups/${cfg.resourceGroup}/providers/Microsoft.ContainerService/managedClusters/${cfg.cluster}?api-version=${cfg.clusterApiVersion}" \\ + --body "${escaped}" +""" + + # Shell for the "Collect cl2 pods status" step: STATUS becomes "failed" when + # any job-name=cl2 pod reports phase Failed, then the output of + # util.formatResult(cfg.resultJson) is written to /tmp/run-result.json. + collectScript = """ +TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") +STATUS="succeeded" + +# Check if any cl2 pod failed +FAILED=$(kubectl get pods --namespace="${cfg.cl2Namespace}" -l job-name=cl2 -o jsonpath='{range .items[*]}{.status.phase}{"\\n"}{end}' | grep -c "^Failed$" || true) +if [ "$FAILED" -gt 0 ]; then + STATUS="failed" +fi + +cat > /tmp/run-result.json << EOF +${util.formatResult(cfg.resultJson)} +EOF +cat /tmp/run-result.json +""" + + # CreateKwokNodes is only called for a positive count; otherwise an empty + # list is concatenated into the step list and contributes nothing. + kwokSteps = k8s.CreateKwokNodes( + cfg.kwokNodeCount, + params = {"node-manifest-path": cfg.kwokNodeManifest} + ) if cfg.kwokNodeCount > 0 else [] + + ap.Pipeline { + name = cfg.name + trigger = cfg.trigger + pool = cfg.pool + + jobs = [ + job.Job { + job = cfg.jobName + displayName = cfg.jobDisplayName + timeoutInMinutes = cfg.timeoutInMinutes + + steps = [ + common.SetRunId(), + common.InstallPythonDependencies(), + azure.Login( + cfg.serviceConnection, + cfg.subscription, + cfg.location), + azure.CreateResourceGroup( + cfg.serviceConnection, + cfg.resourceGroup, + cfg.location, + cfg.subscription), + azure.AzCli( + cfg.serviceConnection, + "Create cluster", + createClusterScript), + azure.WaitForClusterSucceeded( + cfg.serviceConnection, + cfg.cluster, + cfg.resourceGroup, + cfg.subscription), + azure.CreateNodePool( + cfg.serviceConnection, + cfg.cluster, + cfg.resourceGroup, + cfg.subscription, + cfg.nodePool), + azure.GetCredentials( + cfg.serviceConnection, + cfg.cluster, + cfg.resourceGroup, + 
cfg.subscription), + ] + kwokSteps + [ + k8s.RunClusterLoader2( + cfg.serviceConnection, + cfg.cl2Namespace, + manifest = cfg.cl2Manifest), + k8s.PrintCl2PodLogs( + cfg.serviceConnection, + cfg.cl2Namespace), + azure.AzCli( + cfg.serviceConnection, + "Collect cl2 pods status", + collectScript), + common.UploadResult( + cfg.serviceConnection, + cfg.storageAccount, + cfg.storageSubscription, + cfg.storageContainer), + azure.DeleteResourceGroup( + cfg.serviceConnection, + cfg.resourceGroup, + cfg.subscription), + ] + } + ] + } +} diff --git a/kcl/lib/scenario/cl2_manifest.k b/kcl/lib/scenario/cl2_manifest.k new file mode 100644 index 0000000000..a7de4e824a --- /dev/null +++ b/kcl/lib/scenario/cl2_manifest.k @@ -0,0 +1,193 @@ +# cl2_manifest.k renders the ClusterLoader2 "driver" manifest from KCL. +# +# The CL2 driver is the in-cluster Job that runs the ClusterLoader2 binary; it +# reads its tunables from a mounted `override.yaml` ConfigMap and generates the +# measured workload pods internally. Historically this manifest was a static +# multi-document YAML file copied per scenario. Modeling it as a typed module +# means a scenario author tunes load via named fields instead of editing raw +# YAML, and enabling a feature like AKS Kata Confidential Containers is a single +# field (`runtimeClassName`) rather than a hand-edited string. +# +# `buildCl2Manifest` returns the manifest as a list of Kubernetes objects. When +# selected with `kcl run cl2.k -S manifests`, KCL serializes the list as a +# multi-document (`---` separated) YAML stream that `kubectl apply -f` accepts +# unchanged, so RunClusterLoader2 needs no changes. + +# _boolStr renders a KCL bool as a lowercase YAML boolean literal ("true" / +# "false") for embedding inside the override.yaml text block. (Plain string +# interpolation of a KCL bool would yield "True"/"False", which CL2 rejects.) 
+_boolStr = lambda v: bool -> str { + "true" if v else "false" +} + +# Cl2Override holds the knobs CL2 reads from override.yaml. Defaults mirror the +# previous static manifest so an unconfigured Cl2Manifest reproduces today's +# behavior. +# +# buildCl2Manifest renders each field as one `KEY: value` line of override.yaml +# (for example nodes -> NODES, defaultQps -> CL2_DEFAULT_QPS). +schema Cl2Override: + nodes: int = 30 + podsPerNode: int = 10 + bigGroupSize: int = 8 + mediumGroupSize: int = 8 + smallGroupSize: int = 4 + smallStatefulSetsPerNamespace: int = 0 + mediumStatefulSetsPerNamespace: int = 0 + rateLimitPodCreation: bool = False + enablePvs: bool = False + enableClusterOomsTracker: bool = False + defaultQps: int = 5000 + # runOnArmNodes is a hack that lets CL2 schedule onto KWOK fake nodes. + runOnArmNodes: bool = True + # runtimeClassName, when set, makes the measured workload pods run under the + # named RuntimeClass. Set "kata-mshv-vm-isolation" to benchmark AKS Kata + # Confidential Containers. Unset -> the cluster default runtime (runc). + # NOTE(review): with runOnArmNodes the measured pods presumably land on KWOK + # fake nodes; confirm pods using this RuntimeClass can schedule there (a + # RuntimeClass may carry its own scheduling constraints) and that the run + # actually exercises the Kata runtime. + runtimeClassName?: str + +# Cl2Manifest holds everything needed to render the CL2 driver manifest. +schema Cl2Manifest: + namespace: str = "clusterloader2" + # nodePoolName is the pool CL2 is pinned to; it drives both the nodeSelector + # (agentpool=) and the matching NoSchedule toleration key. + # NOTE(review): the toleration key only matches the pool taint when + # NodePool.taintPrefix equals this name (both are "cl2pool" in the current + # scenarios) - confirm before using a different taintPrefix. + nodePoolName: str = "cl2pool" + image: str = "ghcr.io/azure/clusterloader2:v20260220" + # completions / parallelism are copied onto the Job spec. + completions: int = 4 + parallelism: int = 4 + # k8sClientsNumber feeds the --k8s-clients-number argument. + k8sClientsNumber: int = 500 + # cpuRequest / memoryRequest are the cl2 container's resource requests. + cpuRequest: str = "6" + memoryRequest: str = "24Gi" + # override holds the override.yaml knobs (see Cl2Override). + override: Cl2Override = Cl2Override {} + +# buildCl2Manifest renders a Cl2Manifest into the list of Kubernetes objects +# that make up the CL2 driver: an override ConfigMap, the CL2 Job, and the +# ServiceAccount/ClusterRole/ClusterRoleBinding it runs as. 
+buildCl2Manifest = lambda m: Cl2Manifest -> [{str:any}] { + o = m.override + + # One "KEY: value" line per knob. The optional runtimeClassName entry + # evaluates to "" when unset and is removed by the filter below, so no + # CL2_RUNTIME_CLASS_NAME line is emitted in that case. + overrideLines = [ + "NODES: ${o.nodes}", + "PODS_PER_NODE: ${o.podsPerNode}", + "BIG_GROUP_SIZE: ${o.bigGroupSize}", + "MEDIUM_GROUP_SIZE: ${o.mediumGroupSize}", + "SMALL_GROUP_SIZE: ${o.smallGroupSize}", + "SMALL_STATEFUL_SETS_PER_NAMESPACE: ${o.smallStatefulSetsPerNamespace}", + "MEDIUM_STATEFUL_SETS_PER_NAMESPACE: ${o.mediumStatefulSetsPerNamespace}", + "CL2_RATE_LIMIT_POD_CREATION: ${_boolStr(o.rateLimitPodCreation)}", + "CL2_ENABLE_PVS: ${_boolStr(o.enablePvs)}", + "CL2_ENABLE_CLUSTER_OOMS_TRACKER: ${_boolStr(o.enableClusterOomsTracker)}", + "CL2_DEFAULT_QPS: ${o.defaultQps}", + "CL2_RUN_ON_ARM_NODES: ${_boolStr(o.runOnArmNodes)} # This is a hack to allow Cl2 to run on Kwok nodes", + "CL2_RUNTIME_CLASS_NAME: ${o.runtimeClassName}" if o.runtimeClassName else "", + ] + overrideYaml = "\n".join([line for line in overrideLines if line]) + + # Objects are listed in the order they appear in the rendered manifest: + # ConfigMap, Job, ServiceAccount, ClusterRole, ClusterRoleBinding. + [ + { + apiVersion = "v1" + kind = "ConfigMap" + metadata = { + name = "cl2-override" + namespace = m.namespace + } + data = { + "override.yaml" = overrideYaml + } + } + { + apiVersion = "batch/v1" + kind = "Job" + metadata = { + name = "cl2" + namespace = m.namespace + } + spec = { + completions = m.completions + parallelism = m.parallelism + backoffLimit = 0 + template.spec = { + containers = [ + { + name = "cl2" + image = m.image + args = [ + "--provider=aks" + "--run-from-cluster=true" + "--v=2" + "--testoverrides=/override/override.yaml" + "--testconfig=testing/load/config.yaml" + "--k8s-clients-number=${m.k8sClientsNumber}" + ] + resources.requests = { + cpu = m.cpuRequest + memory = m.memoryRequest + } + volumeMounts = [ + { + mountPath = "/override" + name = "cl2-override" + } + ] + } + ] + # Pin the driver to the dedicated pool and tolerate its NoSchedule + # taint; both values come from nodePoolName. + nodeSelector = { + agentpool = m.nodePoolName + } + restartPolicy = "Never" + serviceAccountName = "cl2" + tolerations = [ + { + effect = "NoSchedule" + key = m.nodePoolName + operator = "Exists" + } + ] + volumes = [ + { + name = "cl2-override" + 
configMap.name = "cl2-override" + } + ] + } + } + } + { + apiVersion = "v1" + kind = "ServiceAccount" + metadata = { + name = "cl2" + namespace = m.namespace + } + } + # NOTE(review): wildcard rules grant the cl2 ServiceAccount every verb on + # every resource - confirm that is acceptable for the clusters this + # manifest is applied to. + { + apiVersion = "rbac.authorization.k8s.io/v1" + kind = "ClusterRole" + metadata = { + name = "cl2" + } + rules = [ + { + apiGroups = ["*"] + resources = ["*"] + verbs = ["*"] + } + ] + } + { + apiVersion = "rbac.authorization.k8s.io/v1" + kind = "ClusterRoleBinding" + metadata = { + name = "cl2" + } + roleRef = { + apiGroup = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "cl2" + } + subjects = [ + { + kind = "ServiceAccount" + name = "cl2" + namespace = m.namespace + } + ] + } + ] +} diff --git a/kcl/lib/steps/azure/create_node_pool.k b/kcl/lib/steps/azure/create_node_pool.k index a8bc940d3c..60835aeb71 100644 --- a/kcl/lib/steps/azure/create_node_pool.k +++ b/kcl/lib/steps/azure/create_node_pool.k @@ -11,6 +11,10 @@ schema NodePool: maxPods?: int vnetSubnetId?: str osSKU?: str + # runtime selects the pod runtime for this pool. "kata" enables AKS Kata + # Confidential Containers (Hyper-V isolation); requires a nested-virt SKU + # (e.g. Standard_D8S_v5). Leave unset for the default runc runtime. 
+    runtime?: str
 
 CreateNodePool = lambda serviceConnection: str, cluster: str, resourceGroup: str, subscription: str, pool: NodePool, noWait: bool = False -> steps.Step {
     flagDecision = [
@@ -19,6 +23,7 @@ CreateNodePool = lambda serviceConnection: str, cluster: str, resourceGroup: str
         "--max-pods ${pool.maxPods}" if pool.maxPods else "",
         "--vnet-subnet-id \"${pool.vnetSubnetId}\"" if pool.vnetSubnetId else "",
         "--os-sku ${pool.osSKU}" if pool.osSKU else "",
+        # NOTE(review): only the exact value "kata" adds a flag; any other
+        # runtime string (e.g. "Kata") is silently ignored - confirm intended.
+        "--workload-runtime KataMshvAccVmIsolation" if pool.runtime == "kata" else "",
         "--no-wait" if noWait else "",
     ]
     flags = " \\\n ".join([f for f in flagDecision if f])
diff --git a/scripts/test_validate_scenarios.py b/scripts/test_validate_scenarios.py
new file mode 100644
index 0000000000..b1a57bbcc3
--- /dev/null
+++ b/scripts/test_validate_scenarios.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""Tests for validate_scenarios.py (the thin /generate_yaml cross-scenario gate).
+
+Run with: python3 scripts/test_validate_scenarios.py
+"""
+
+import os
+import tempfile
+import unittest
+
+# Resolved as a sibling module: running this file as documented above puts
+# scripts/ on sys.path.
+from validate_scenarios import (
+    extract_path_refs,
+    ref_owner,
+    find_cycles,
+    discover_scenarios,
+    build_graph,
+    find_scenario_reference_cycles,
+    format_kcl_cycle_error,
+)
+
+# Two directories above this file, i.e. the repository root.
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+def _write(path: str, content: str = "") -> None:
+    """Create ``path`` (and any missing parent directories) with ``content``."""
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "w") as f:
+        f.write(content)
+
+
+class TestExtractPathRefs(unittest.TestCase):
+    """extract_path_refs: pulls ``kcl/...`` fragments out of .k source text."""
+
+    def test_plain_cl2_manifest_ref(self):
+        text = 'cl2Manifest = "kcl/kata_benchmark/cl2.yaml"'
+        self.assertEqual(extract_path_refs(text), ["kcl/kata_benchmark/cl2.yaml"])
+
+    def test_pipeline_workspace_prefixed_ref_is_normalized(self):
+        text = 'kwokNodeManifest = "$(Pipeline.Workspace)/s/kcl/kata_benchmark/kwok-node.yaml"'
+        self.assertEqual(
+            extract_path_refs(text), ["kcl/kata_benchmark/kwok-node.yaml"]
+        )
+
+    def test_multiple_refs_in_one_blob(self):
+        text = (
+            'cl2Manifest = "kcl/example_pipeline/cl2.yaml"\n'
+            'kwokNodeManifest = "$(Pipeline.Workspace)/s/kcl/example_pipeline/kwok-node.yaml"\n'
+        )
+        self.assertEqual(
+            extract_path_refs(text),
+            [
+                "kcl/example_pipeline/cl2.yaml",
+                "kcl/example_pipeline/kwok-node.yaml",
+            ],
+        )
+
+    def test_no_kcl_paths_returns_empty(self):
+        text = 'name = "Kata Benchmark"\nLOCATION = "westus2"'
+        self.assertEqual(extract_path_refs(text), [])
+
+    def test_import_lines_are_not_path_refs(self):
+        # Imports use dotted module syntax, not slash paths; must be ignored.
+        text = "import lib.scenario\nimport lib.steps.azure"
+        self.assertEqual(extract_path_refs(text), [])
+
+
+class TestRefOwner(unittest.TestCase):
+    """ref_owner: maps a ``kcl/...`` fragment to the scenario that owns it."""
+
+    def setUp(self):
+        self.scenarios = {
+            "example_pipeline",
+            "kata_benchmark",
+            "apiserver_benchmark/configmaps100",
+        }
+
+    def test_top_level_scenario_owner(self):
+        self.assertEqual(
+            ref_owner("kcl/kata_benchmark/cl2.yaml", self.scenarios),
+            "kata_benchmark",
+        )
+
+    def test_nested_scenario_owner(self):
+        self.assertEqual(
+            ref_owner(
+                "kcl/apiserver_benchmark/configmaps100/cl2.yaml", self.scenarios
+            ),
+            "apiserver_benchmark/configmaps100",
+        )
+
+    def test_lib_path_has_no_scenario_owner(self):
+        self.assertIsNone(ref_owner("kcl/lib/scenario/foo.yaml", self.scenarios))
+
+    def test_unknown_dir_has_no_owner(self):
+        self.assertIsNone(ref_owner("kcl/nope/cl2.yaml", self.scenarios))
+
+    def test_longest_prefix_wins(self):
+        # Both the parent and the nested directory are scenarios here; the
+        # deeper one must be reported.
+        scenarios = {"apiserver_benchmark", "apiserver_benchmark/configmaps100"}
+        self.assertEqual(
+            ref_owner(
+                "kcl/apiserver_benchmark/configmaps100/cl2.yaml", scenarios
+            ),
+            "apiserver_benchmark/configmaps100",
+        )
+
+
+class TestFindCycles(unittest.TestCase):
+    """find_cycles: cycle detection on a plain adjacency-set graph."""
+
+    def test_simple_two_node_cycle(self):
+        graph = {"a": {"b"}, "b": {"a"}}
+        cycles = find_cycles(graph)
+        self.assertEqual(len(cycles), 1)
+        # Compared as a set: the node at which the cycle starts is not pinned.
+        self.assertEqual(set(cycles[0]), {"a", "b"})
+
+    def test_acyclic_graph_has_no_cycles(self):
+        graph = {"a": {"b"}, "b": {"c"}, "c": set()}
+        self.assertEqual(find_cycles(graph), [])
+
+    def test_three_node_cycle(self):
+        graph = {"a": {"b"}, "b": {"c"}, "c": {"a"}}
+        cycles = find_cycles(graph)
+        self.assertEqual(len(cycles), 1)
+        self.assertEqual(set(cycles[0]), {"a", "b", "c"})
+
+    def test_disconnected_acyclic(self):
+        graph = {"a": {"b"}, "b": set(), "c": {"d"}, "d": set()}
+        self.assertEqual(find_cycles(graph), [])
+
+
+class TestDiscoverScenarios(unittest.TestCase):
+    """discover_scenarios: directories holding a pipeline.k, minus lib/."""
+
+    def test_finds_pipeline_dirs_and_excludes_lib(self):
+        with tempfile.TemporaryDirectory() as d:
+            kcl = os.path.join(d, "kcl")
+            _write(os.path.join(kcl, "example_pipeline", "pipeline.k"))
+            _write(os.path.join(kcl, "nested", "deep", "pipeline.k"))
+            _write(os.path.join(kcl, "lib", "scenario", "cl2_benchmark.k"))
+            # Even a pipeline.k under lib/ must be excluded.
+            _write(os.path.join(kcl, "lib", "weird", "pipeline.k"))
+            self.assertEqual(
+                discover_scenarios(kcl), {"example_pipeline", "nested/deep"}
+            )
+
+
+class TestBuildGraph(unittest.TestCase):
+    """build_graph: edges come only from references into *other* scenarios."""
+
+    def test_cross_scenario_edge_with_self_and_lib_excluded(self):
+        with tempfile.TemporaryDirectory() as d:
+            kcl = os.path.join(d, "kcl")
+            # "a" references "b" (an edge) and itself (no edge).
+            _write(
+                os.path.join(kcl, "a", "pipeline.k"),
+                'cl2Manifest = "kcl/b/cl2.yaml"\n'
+                'kwokNodeManifest = "$(Pipeline.Workspace)/s/kcl/a/own.yaml"\n',
+            )
+            # "b" references only lib/, which is not a scenario (no edge).
+            _write(
+                os.path.join(kcl, "b", "pipeline.k"),
+                'cl2Manifest = "kcl/lib/scenario/x.yaml"\n',
+            )
+            _write(os.path.join(kcl, "lib", "scenario", "cl2_benchmark.k"))
+            scenarios = discover_scenarios(kcl)
+            graph = build_graph(kcl, scenarios)
+            self.assertEqual(graph, {"a": {"b"}, "b": set()})
+
+
+class TestFindScenarioReferenceCycles(unittest.TestCase):
+    """End-to-end: discovery, graph building and cycle detection on disk."""
+
+    def test_detects_cross_scenario_cycle(self):
+        with tempfile.TemporaryDirectory() as d:
+            kcl = os.path.join(d, "kcl")
+            _write(
+                os.path.join(kcl, "a", "pipeline.k"),
+                'cl2Manifest = "kcl/b/cl2.yaml"\n',
+            )
+            _write(
+                os.path.join(kcl, "b", "pipeline.k"),
+                'cl2Manifest = "kcl/a/cl2.yaml"\n',
+            )
+            cycles = find_scenario_reference_cycles(kcl)
+            self.assertEqual(len(cycles), 1)
+            self.assertEqual(set(cycles[0]), {"a", "b"})
+
+    def test_real_repo_has_no_cross_scenario_cycles(self):
+        # Runs against the checked-in kcl/ tree, so this fails as soon as a
+        # real scenario introduces a reference loop.
+        kcl = os.path.join(REPO_ROOT, "kcl")
+        self.assertEqual(find_scenario_reference_cycles(kcl), [])
+
+
+class TestFormatKclCycleError(unittest.TestCase):
+    """format_kcl_cycle_error: friendly summary of KCL's RecursiveLoad error."""
+
+    # Fixture stderr in the shape the assertions below rely on; the wording is
+    # kept verbatim, including the "Could not compiles" typo.
+    KCL_CYCLE_STDERR = (
+        "error[E1001]: RecursiveLoad\n"
+        "Could not compiles due to cyclic import statements\n"
+        "- /repo/kcl/aaa/main.k\n"
+        "- /repo/kcl/bbb/main.k\n\n"
+        "error[E2L23]: CompileError\n"
+        " --> /repo/kcl/bbb/main.k:1:1\n"
+        "There is a circular reference between modules bbb, aaa\n"
+    )
+
+    def test_detects_and_summarizes_cycle(self):
+        msg = format_kcl_cycle_error(self.KCL_CYCLE_STDERR)
+        self.assertIsNotNone(msg)
+        self.assertIn("circular", msg.lower())
+        # The friendly message should name the modules involved.
+        self.assertIn("bbb", msg)
+        self.assertIn("aaa", msg)
+
+    def test_returns_none_for_unrelated_error(self):
+        stderr = "error[E2L23]: CompileError\nundefined identifier 'foo'\n"
+        self.assertIsNone(format_kcl_cycle_error(stderr))
+
+    def test_returns_none_for_empty(self):
+        self.assertIsNone(format_kcl_cycle_error(""))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/scripts/validate_scenarios.py b/scripts/validate_scenarios.py
new file mode 100644
index 0000000000..3ca47ee97d
--- /dev/null
+++ b/scripts/validate_scenarios.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""Thin cross-scenario validation gate for Telescope v2 scenarios.
+
+KCL's compiler already rejects circular *imports* (error E1001 RecursiveLoad),
+so this gate deliberately does NOT re-check import cycles. It covers the blind
+spot KCL cannot see: path-string references between scenarios (e.g. one
+scenario's ``cl2Manifest`` / ``kwokNodeManifest`` pointing into another
+scenario's directory). A cycle of such references is a hard failure.
+""" + +import os +import re + +# Matches a kcl/<...> path fragment embedded in a .k source blob, anchored to a +# boundary so it ignores dotted import statements (which have no slashes). +_PATH_REF_RE = re.compile(r'(?:^|[\s"/])(kcl/[^\s"]*)') + +# Matches KCL's "circular reference between modules X, Y" diagnostic. +_KCL_CYCLE_RE = re.compile( + r"circular reference between modules ([^\n]+)", re.IGNORECASE +) + + +def extract_path_refs(text: str) -> list[str]: + """Return all ``kcl/<...>`` path fragments embedded in a .k source blob. + + A leading ``$(Pipeline.Workspace)/s/`` is dropped automatically because the + match starts at ``kcl/``. + """ + return _PATH_REF_RE.findall(text) + + +def ref_owner(path_fragment: str, scenarios: set[str]) -> str | None: + """Return the scenario directory that owns ``path_fragment``, or None. + + ``scenarios`` are directories relative to ``kcl/`` (e.g. ``kata_benchmark`` + or ``apiserver_benchmark/configmaps100``). The longest matching scenario + prefix wins so nested scenarios resolve correctly. 
+ """ + if not path_fragment.startswith("kcl/"): + return None + rel = path_fragment[len("kcl/"):] + best = None + for s in scenarios: + if rel == s or rel.startswith(s + "/"): + if best is None or len(s) > len(best): + best = s + return best + + +def find_cycles(graph: dict[str, set[str]]) -> list[list[str]]: + """Return cycles in a directed graph as lists of nodes (deduped by node set).""" + cycles: list[list[str]] = [] + seen_sets: set[frozenset] = set() + visited: set[str] = set() + stack: list[str] = [] + onstack: set[str] = set() + + def dfs(u: str) -> None: + visited.add(u) + stack.append(u) + onstack.add(u) + for v in graph.get(u, ()): + if v in onstack: + cyc = stack[stack.index(v):] + key = frozenset(cyc) + if key not in seen_sets: + seen_sets.add(key) + cycles.append(list(cyc)) + elif v not in visited: + dfs(v) + stack.pop() + onstack.discard(u) + + for n in graph: + if n not in visited: + dfs(n) + return cycles + + +def discover_scenarios(kcl_root: str) -> set[str]: + """Return scenario directories (relative to ``kcl_root``) that contain a + ``pipeline.k``. Anything under ``lib/`` is excluded.""" + scenarios: set[str] = set() + for dirpath, _dirnames, filenames in os.walk(kcl_root): + if "pipeline.k" not in filenames: + continue + rel = os.path.relpath(dirpath, kcl_root) + rel = rel.replace(os.sep, "/") + if rel == "lib" or rel.startswith("lib/"): + continue + scenarios.add(rel) + return scenarios + + +def build_graph(kcl_root: str, scenarios: set[str]) -> dict[str, set[str]]: + """Build the scenario reference graph from path-string references found in + each scenario's ``.k`` files. 
Self-references and references into ``lib/`` + are excluded.""" + graph: dict[str, set[str]] = {s: set() for s in scenarios} + for scenario in scenarios: + scenario_dir = os.path.join(kcl_root, *scenario.split("/")) + for name in os.listdir(scenario_dir): + if not name.endswith(".k"): + continue + with open(os.path.join(scenario_dir, name)) as f: + text = f.read() + for path_fragment in extract_path_refs(text): + owner = ref_owner(path_fragment, scenarios) + if owner is not None and owner != scenario: + graph[scenario].add(owner) + return graph + + +def find_scenario_reference_cycles(kcl_root: str) -> list[list[str]]: + """Return cross-scenario path-reference cycles under ``kcl_root``.""" + scenarios = discover_scenarios(kcl_root) + return find_cycles(build_graph(kcl_root, scenarios)) + + +def format_kcl_cycle_error(stderr: str) -> str | None: + """If ``stderr`` from ``kcl run`` reports a circular import, return a + friendly one-line summary; otherwise return None. + + KCL already detects scenario import cycles (error E1001 RecursiveLoad). This + just reframes that diagnostic in scenario terms so authors get a clear + pointer during ``/generate_yaml``. + """ + if "RecursiveLoad" not in stderr: + return None + modules = "" + m = _KCL_CYCLE_RE.search(stderr) + if m: + modules = m.group(1).strip() + summary = "Circular import detected between scenario modules" + if modules: + summary += f": {modules}" + summary += ( + ". Break the cycle so no scenario (transitively) imports itself." + ) + return summary + + +def main(argv: list[str] | None = None) -> int: + import argparse + + parser = argparse.ArgumentParser( + description=( + "Cross-scenario validation gate: fails on path-string reference " + "cycles between Telescope scenarios (a blind spot KCL's own import-" + "cycle check cannot see)." 
+ ) + ) + default_kcl = os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "kcl" + ) + parser.add_argument( + "--kcl-root", + default=default_kcl, + help="Path to the kcl/ directory (default: /kcl).", + ) + args = parser.parse_args(argv) + + cycles = find_scenario_reference_cycles(args.kcl_root) + if not cycles: + print("validate_scenarios: OK (no cross-scenario reference cycles)") + return 0 + + print("validate_scenarios: FAILED - cross-scenario reference cycle(s):") + for cyc in cycles: + print(" " + " -> ".join(cyc + [cyc[0]])) + print( + "\nA scenario's path-string references (cl2Manifest / kwokNodeManifest) " + "must not form a loop into another scenario's directory." + ) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) +