From 7cba171a3b67e5319d9c347f290f40c41052fd89 Mon Sep 17 00:00:00 2001
From: Jose Castillo Lema
Date: Fri, 29 May 2026 10:50:37 +0200
Subject: [PATCH 1/4] [NVIDIA-850] Move signed precompiled drivers to registry.stage.redhat.io

Signed-off-by: Jose Castillo Lema
---
 ...stem-edge-nvidia-ci-main__4.21-stable.yaml |  7 +--
 .../nvidia-gpu-operator-e2e-aws-workflow.yaml |  1 +
 .../merge-stage-credentials/OWNERS            | 17 +++++++
 ...erator-merge-stage-credentials-commands.sh | 51 +++++++++++++++++++
 ...-merge-stage-credentials-ref.metadata.json | 23 +++++++++
 ...-operator-merge-stage-credentials-ref.yaml | 30 +++++++++++
 6 files changed, 126 insertions(+), 3 deletions(-)
 create mode 100644 ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/OWNERS
 create mode 100644 ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
 create mode 100644 ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.metadata.json
 create mode 100644 ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml

diff --git a/ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.21-stable.yaml b/ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.21-stable.yaml
index 6dda0c140ac9c..a9574f478a432 100644
--- a/ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.21-stable.yaml
+++ b/ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.21-stable.yaml
@@ -160,11 +160,12 @@ tests:
     env:
       AWS_SECUREBOOT_ENABLED: "true"
       BASE_DOMAIN: edge-sro.rhecoeng.com
+      MERGE_STAGE_REGISTRY_CREDENTIALS: "true"
       NVIDIAGPU_GPU_CLUSTER_POLICY_PATCH: '[{"op": "add", "path": "/spec/driver/usePrecompiled",
         "value": true}, {"op": "add", "path": "/spec/driver/repository", "value":
-        "quay.io/jcastillolema"}, {"op": "add", "path": "/spec/driver/image", "value":
-        "gpu-driver-rhel9"}, {"op": "add", "path": "/spec/driver/version", "value":
-        "580.159.03"}]'
+        "registry.stage.redhat.io/nvidia"}, {"op": "add", "path": "/spec/driver/image",
+        "value": "gpu-driver-rhel9"}, {"op": "add", "path": "/spec/driver/version",
+        "value": "580.159.03"}]'
       NVIDIAGPU_SUBSCRIPTION_CHANNEL: v26.3
     workflow: nvidia-gpu-operator-e2e-aws
 zz_generated_metadata:
diff --git a/ci-operator/step-registry/nvidia-gpu-operator/e2e-aws/nvidia-gpu-operator-e2e-aws-workflow.yaml b/ci-operator/step-registry/nvidia-gpu-operator/e2e-aws/nvidia-gpu-operator-e2e-aws-workflow.yaml
index f33c3d1776dfb..54afe5819aa86 100644
--- a/ci-operator/step-registry/nvidia-gpu-operator/e2e-aws/nvidia-gpu-operator-e2e-aws-workflow.yaml
+++ b/ci-operator/step-registry/nvidia-gpu-operator/e2e-aws/nvidia-gpu-operator-e2e-aws-workflow.yaml
@@ -7,6 +7,7 @@ workflow:
     - chain: ipi-install
     test:
     - ref: aws-secureboot-verify
+    - ref: nvidia-gpu-operator-merge-stage-credentials
     - as: gpu-operator-e2e
       commands: make run-tests
       from: nvidia-ci
diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/OWNERS b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/OWNERS
new file mode 100644
index 0000000000000..46d56d48e2b69
--- /dev/null
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/OWNERS
@@ -0,0 +1,17 @@
+approvers:
+- empovit
+- fabiendupont
+- ggordanired
+- josecastillolema
+- TomerNewman
+- wabouhamad
+- ybettan
+options: {}
+reviewers:
+- empovit
+- fabiendupont
+- ggordanired
+- josecastillolema
+- TomerNewman
+- wabouhamad
+- ybettan
diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
new file mode 100644
index 0000000000000..db99b95b570ee
--- /dev/null
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+set -euo pipefail
+
+if [[ "${MERGE_STAGE_REGISTRY_CREDENTIALS}" != "true" ]]; then
+  echo "MERGE_STAGE_REGISTRY_CREDENTIALS is not 'true', skipping."
+  exit 0
+fi
+
+STAGE_REGISTRY_PATH="/var/run/vault/mirror-registry/registry_stage.json"
+
+if [[ ! -f "${STAGE_REGISTRY_PATH}" ]]; then
+  echo "Stage registry credentials not found at ${STAGE_REGISTRY_PATH}"
+  exit 1
+fi
+
+echo "Extracting current cluster pull secret..."
+oc extract secret/pull-secret -n openshift-config --confirm --to /tmp
+
+echo "Merging registry.stage.redhat.io credentials..."
+[[ $- == *x* ]] && WAS_TRACING=true || WAS_TRACING=false
+set +x
+stage_auth_user=$(jq -r '.user' "${STAGE_REGISTRY_PATH}")
+stage_auth_password=$(jq -r '.password' "${STAGE_REGISTRY_PATH}")
+stage_registry_auth=$(echo -n "${stage_auth_user}:${stage_auth_password}" | base64 -w 0)
+$WAS_TRACING && set -x
+
+jq --argjson stage "{\"registry.stage.redhat.io\": {\"auth\": \"${stage_registry_auth}\"}}" \
+  '.auths |= . + $stage' /tmp/.dockerconfigjson > /tmp/new-dockerconfigjson
+
+echo "Updating cluster pull secret..."
+oc set data secret/pull-secret -n openshift-config \
+  --from-file=.dockerconfigjson=/tmp/new-dockerconfigjson
+
+echo "Waiting for MCP worker pool to propagate..."
+total=$(oc get mcp worker -o jsonpath='{.status.machineCount}')
+COUNTER=0
+while [ $COUNTER -lt 600 ]; do
+  sleep 20
+  COUNTER=$((COUNTER + 20))
+  updated=$(oc get mcp worker -o jsonpath='{.status.updatedMachineCount}')
+  echo "MCP rollout: ${updated}/${total} machines updated (${COUNTER}s elapsed)"
+  if [[ "${updated}" == "${total}" ]]; then
+    echo "MCP rollout complete."
+    exit 0
+  fi
+done
+
+echo "MCP rollout timed out after ${COUNTER}s"
+oc get mcp worker -o yaml
+exit 1
diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.metadata.json b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.metadata.json
new file mode 100644
index 0000000000000..bf0869c884c26
--- /dev/null
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.metadata.json
@@ -0,0 +1,23 @@
+{
+	"path": "nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml",
+	"owners": {
+		"approvers": [
+			"empovit",
+			"fabiendupont",
+			"ggordanired",
+			"josecastillolema",
+			"TomerNewman",
+			"wabouhamad",
+			"ybettan"
+		],
+		"reviewers": [
+			"empovit",
+			"fabiendupont",
+			"ggordanired",
+			"josecastillolema",
+			"TomerNewman",
+			"wabouhamad",
+			"ybettan"
+		]
+	}
+}
\ No newline at end of file
diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml
new file mode 100644
index 0000000000000..8ee121b97b47f
--- /dev/null
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml
@@ -0,0 +1,30 @@
+ref:
+  as: nvidia-gpu-operator-merge-stage-credentials
+  from: cli
+  cli: latest
+  grace_period: 10m
+  commands: nvidia-gpu-operator-merge-stage-credentials-commands.sh
+  timeout: 10m
+  resources:
+    requests:
+      cpu: 100m
+      memory: 100Mi
+  env:
+  - name: MERGE_STAGE_REGISTRY_CREDENTIALS
+    default: "false"
+    documentation: |-
+      When set to "true", merges registry.stage.redhat.io credentials into
+      the cluster global pull secret. Required for pulling precompiled driver
+      images from the Red Hat staging registry. When "false", the step is a
+      no-op.
+  credentials:
+  - namespace: test-credentials
+    name: openshift-custom-mirror-registry
+    mount_path: /var/run/vault/mirror-registry
+  documentation: |-
+    Merges registry.stage.redhat.io pull credentials into the cluster global
+    pull secret so that the GPU operator can pull precompiled driver images
+    from the Red Hat staging registry.
+
+    Controlled by the MERGE_STAGE_REGISTRY_CREDENTIALS env var. When set to
+    "false" (default), the step exits immediately as a no-op.

From a438e10cc35615b343b3c39678426ed3d9a68243 Mon Sep 17 00:00:00 2001
From: Jose Castillo Lema
Date: Fri, 29 May 2026 11:28:22 +0200
Subject: [PATCH 2/4] Fix possible credential leak

Signed-off-by: Jose Castillo Lema
---
 .../nvidia-gpu-operator-merge-stage-credentials-commands.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
index db99b95b570ee..e2985378d6bbb 100644
--- a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
@@ -23,10 +23,10 @@ set +x
 stage_auth_user=$(jq -r '.user' "${STAGE_REGISTRY_PATH}")
 stage_auth_password=$(jq -r '.password' "${STAGE_REGISTRY_PATH}")
 stage_registry_auth=$(echo -n "${stage_auth_user}:${stage_auth_password}" | base64 -w 0)
-$WAS_TRACING && set -x
 
 jq --argjson stage "{\"registry.stage.redhat.io\": {\"auth\": \"${stage_registry_auth}\"}}" \
   '.auths |= . + $stage' /tmp/.dockerconfigjson > /tmp/new-dockerconfigjson
+$WAS_TRACING && set -x
 
 echo "Updating cluster pull secret..."
 oc set data secret/pull-secret -n openshift-config \

From 34ae982eb114f32b0cff76646b7594b39e3dfe22 Mon Sep 17 00:00:00 2001
From: Jose Castillo Lema
Date: Fri, 29 May 2026 11:30:21 +0200
Subject: [PATCH 3/4] Adjust timeout and machineCount wait condition

Signed-off-by: Jose Castillo Lema
---
 ...erator-merge-stage-credentials-commands.sh | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
index e2985378d6bbb..2e12a39f5ecec 100644
--- a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-commands.sh
@@ -32,15 +32,27 @@ echo "Updating cluster pull secret..."
 oc set data secret/pull-secret -n openshift-config \
   --from-file=.dockerconfigjson=/tmp/new-dockerconfigjson
 
-echo "Waiting for MCP worker pool to propagate..."
-total=$(oc get mcp worker -o jsonpath='{.status.machineCount}')
+echo "Waiting for MCP worker pool to start updating..."
 COUNTER=0
-while [ $COUNTER -lt 600 ]; do
+while [ $COUNTER -lt 120 ]; do
   sleep 20
   COUNTER=$((COUNTER + 20))
-  updated=$(oc get mcp worker -o jsonpath='{.status.updatedMachineCount}')
-  echo "MCP rollout: ${updated}/${total} machines updated (${COUNTER}s elapsed)"
-  if [[ "${updated}" == "${total}" ]]; then
+  updating=$(oc get mcp worker -o jsonpath='{.status.conditions[?(@.type=="Updating")].status}')
+  echo "MCP Updating=${updating:-unknown} (${COUNTER}s elapsed)"
+  if [[ "${updating}" == "True" ]]; then
+    echo "MCP update in progress."
+    break
+  fi
+done
+
+echo "Waiting for MCP worker pool to finish updating..."
+COUNTER=0
+while [ $COUNTER -lt 420 ]; do
+  sleep 20
+  COUNTER=$((COUNTER + 20))
+  updated=$(oc get mcp worker -o jsonpath='{.status.conditions[?(@.type=="Updated")].status}')
+  echo "MCP Updated=${updated:-unknown} (${COUNTER}s elapsed)"
+  if [[ "${updated}" == "True" ]]; then
     echo "MCP rollout complete."
     exit 0
   fi

From 12e7fa999da18602c2d190ecc7acd7747d2388d7 Mon Sep 17 00:00:00 2001
From: Jose Castillo Lema
Date: Fri, 29 May 2026 14:02:32 +0200
Subject: [PATCH 4/4] Switch from cli to cli-jq image

Signed-off-by: Jose Castillo Lema
---
 .../nvidia-gpu-operator-merge-stage-credentials-ref.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml
index 8ee121b97b47f..5bd06d2c3e24d 100644
--- a/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml
+++ b/ci-operator/step-registry/nvidia-gpu-operator/merge-stage-credentials/nvidia-gpu-operator-merge-stage-credentials-ref.yaml
@@ -1,6 +1,9 @@
 ref:
   as: nvidia-gpu-operator-merge-stage-credentials
-  from: cli
+  from_image:
+    namespace: ocp
+    name: cli-jq
+    tag: latest
   cli: latest
   grace_period: 10m
   commands: nvidia-gpu-operator-merge-stage-credentials-commands.sh