diff --git a/.buildkite/hooks/pre-exit b/.buildkite/hooks/pre-exit deleted file mode 100644 index e738f76e7..000000000 --- a/.buildkite/hooks/pre-exit +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Remove the docker container on which kind is running -# Also removes the volume used by it -docker container rm -v -f kind-${BUILDKITE_JOB_ID}-control-plane -# Remove the docker image created for the local PR code -docker image rm -f vitess-operator-pr:latest - -# This hack exists because vitess-operator modifies the permissions on the git -# checkout during CI from inside docker. This causes future jobs run on the same -# node to fail the git checkout step due to permission errors -# -# Our fix is to reset the perms after each job step. We can't run arbitrary -# sudo commands as the buildkite-agent user but we _can_ run the /usr/bin/fix-buildkite-agent-builds-permissions -# tool via sudo -# -# these cmds stolen from: https://github.com/buildkite/elastic-ci-stack-for-aws/blob/da3aef5d96cecb796636a7ac25d7b205a6a0cc90/packer/linux/conf/buildkite-agent/hooks/environment#L117-L141 - -set -euo pipefail - -AGENT_ORG_PIPELINE_DIR="${BUILDKITE_BUILD_CHECKOUT_PATH#"${BUILDKITE_BUILD_PATH}/"}" -AGENT_DIR="${AGENT_ORG_PIPELINE_DIR%%/*}" - -set -x -sudo /usr/bin/fix-buildkite-agent-builds-permissions "$AGENT_DIR" planetscale vitess-operator diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml deleted file mode 100644 index 8c7a67b89..000000000 --- a/.buildkite/pipeline.yml +++ /dev/null @@ -1,183 +0,0 @@ -agents: - queue: "public" - -env: - GO_VERSION_FILE: "go1.26.2.linux-amd64.tar.gz" - -# Mount the docker.sock as to the docker container, so that we are able to -# run docker build command and kind is spawned as a sibling container. 
-steps: - - name: "Upgrade Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make upgrade-test - concurrency: 1 - concurrency_group: 'vtop/upgrade-downgrade-test' - timeout_in_minutes: 30 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: &retry_policy_tests - # Automatically retry tests on unexpected Buildkite Agent exit codes - automatic: - - exit_status: -1 # Agent lost - limit: 2 - - exit_status: 143 # Graceful agent termination - limit: 2 - - exit_status: 255 # Forceful agent termination - limit: 2 - - - name: "Backup Restore Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-restore-test - concurrency: 1 - concurrency_group: 'vtop/backup-restore-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Backup Schedule Cluster/Keyspace Scope Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-schedule-keyspace-test - concurrency: 1 - concurrency_group: 
'vtop/backup-schedule-keyspace-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Backup Schedule Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-schedule-test - concurrency: 1 - concurrency_group: 'vtop/backup-schedule-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Backup Schedule vtctldclient Method Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-schedule-vtctldclient-test - concurrency: 1 - concurrency_group: 'vtop/backup-schedule-vtctldclient-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "VTOrc and VTAdmin Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat chromium - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make vtorc-vtadmin-test - concurrency: 1 - 
concurrency_group: 'vtop/vtorc-vtadmin-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Unmanaged Tablet Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat coreutils - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make unmanaged-tablet-test - concurrency: 1 - concurrency_group: 'vtop/unmanaged-tablet-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "HPA Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make hpa-test - concurrency: 1 - concurrency_group: 'vtop/hpa-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml new file mode 100644 index 000000000..263a7570a --- /dev/null +++ b/.github/workflows/e2e-test.yaml @@ -0,0 +1,105 @@ +name: e2e-test +on: + push: + branches: + - main + - release-** + pull_request: + branches: + - main + - release-** + +jobs: + e2e: + name: ${{ matrix.test.name }} + runs-on: vitess-operator-runner + timeout-minutes: 40 + strategy: + fail-fast: false + matrix: + test: + - name: 
"Upgrade Test" + target: upgrade-test + - name: "Backup Restore Test" + target: backup-restore-test + - name: "Backup Schedule Cluster/Keyspace Scope Test" + target: backup-schedule-keyspace-test + - name: "Backup Schedule Test" + target: backup-schedule-test + - name: "Backup Schedule vtctldclient Method Test" + target: backup-schedule-vtctldclient-test + - name: "VTOrc and VTAdmin Test" + target: vtorc-vtadmin-test + - name: "Unmanaged Tablet Test" + target: unmanaged-tablet-test + - name: "HPA Test" + target: hpa-test + env: + # Kept short because kind sets the control-plane container hostname to + # "kind-${CI_JOB_ID}-control-plane", and Linux HOST_NAME_MAX is 64. + CI_JOB_ID: ${{ matrix.test.target }} + steps: + - name: Prepare runner for kind (Ubuntu 24.04) + run: | + # Ubuntu 24.04 ships with several kernel-level restrictions on + # unprivileged user namespaces that break mysqld when it runs inside + # a nested container (vttablet pod → kind → docker → runner VM). + # Buildkite's older-kernel agents don't hit these. + # + # Only sysctls here — do NOT stop apparmor.service or run + # aa-teardown, because Docker/BuildKit applies the docker-default + # AppArmor profile to build containers and will fail with + # "unable to apply apparmor profile" if the profile is unloaded. + sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 || true + sudo sysctl -w kernel.apparmor_restrict_unprivileged_unconfined=0 || true + sudo sysctl -w kernel.unprivileged_userns_clone=1 || true + sudo sysctl -w user.max_user_namespaces=65536 || true + # kind needs generous inotify limits once pod count grows. 
+ sudo sysctl -w fs.inotify.max_user_watches=524288 + sudo sysctl -w fs.inotify.max_user_instances=512 + + - name: Check out code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install test dependencies + run: | + sudo apt-get update + sudo apt-get install -y mysql-client + + - name: Install chromium (vtorc-vtadmin only) + if: matrix.test.target == 'vtorc-vtadmin-test' + uses: browser-actions/setup-chrome@v2 + with: + chrome-version: stable + + - name: Alias chrome as chromium-browser + if: matrix.test.target == 'vtorc-vtadmin-test' + run: | + CHROME_BIN="$(command -v chrome || command -v google-chrome)" + sudo ln -sf "$CHROME_BIN" /usr/local/bin/chromium-browser + + - name: Build operator image + # Build before we tear down AppArmor. BuildKit refuses to start build + # containers once the docker-default AppArmor profile is unloaded. + # The image is tagged vitess-operator-pr:latest, which the test's + # setupBuildContainerImage will detect and skip rebuilding. + run: docker build --progress plain --file build/Dockerfile.release --tag vitess-operator-pr:latest . + + - name: Disable AppArmor before kind + # Now tear down AppArmor so the kind container and the pods it runs + # (in particular mysqld inside vttablet pods) aren't subject to Ubuntu + # 24.04's docker-default profile, which appears to be what's killing + # mysqld on startup ("Failed to open required defaults file" within + # ~17ms of spawn even though mysqlctld just wrote the file). 
+ run: | + sudo systemctl stop apparmor.service || true + sudo systemctl disable apparmor.service || true + sudo aa-teardown || true + + - name: Run ${{ matrix.test.name }} + run: make ${{ matrix.test.target }} diff --git a/docs/release-process.md b/docs/release-process.md index 9e0f51eb8..ff7291416 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -112,7 +112,7 @@ The `upgrade_test.sh`, `backup_restore_test.sh`, `vtorc_vtadmin_test.sh` and `un ##### CI Failures > **Note** -> It is likely that the buildkite tests will fail on the release PR initially because of the unavailability of the latest vitess and vitess-operator docker images. This however doesn't block the release. The tests should be restarted after the said images are built and available. +> It is likely that the end-to-end tests will fail on the release PR initially because of the unavailability of the latest vitess and vitess-operator docker images. This however doesn't block the release. The tests should be restarted after the said images are built and available. ------------------- diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh index 9c2f0f232..38e813581 100644 --- a/test/endtoend/utils.sh +++ b/test/endtoend/utils.sh @@ -6,7 +6,7 @@ # set -x shopt -s expand_aliases alias vtctldclient="vtctldclient --server=localhost:15999" -BUILDKITE_JOB_ID="${BUILDKITE_JOB_ID:-0}" +CI_JOB_ID="${CI_JOB_ID:-0}" # Suppress warnings when using MariaDB Client mysql_version="$(mysql --version 2>/dev/null)" @@ -521,11 +521,19 @@ function assertSelect() { } function setupBuildContainerImage() { + # Skip the build if the image is already present. This lets CI build the + # image in a dedicated step before disabling AppArmor (BuildKit refuses to + # run if the docker-default AppArmor profile is unloaded). 
+ if docker image inspect vitess-operator-pr:latest >/dev/null 2>&1; then + echo "vitess-operator-pr:latest already present, skipping build" + return + fi + echo "Building the container image" - # Clean up build output in CI + # Use plain progress output in CI so logs are line-buffered and readable. local progress="auto" - if [[ "${BUILDKITE_JOB_ID}" != "0" ]]; then + if [[ -n "${CI:-}" ]]; then progress="plain" fi @@ -535,25 +543,9 @@ function setupBuildContainerImage() { function setupKindCluster() { setupBuildContainerImage createKindCluster - setupKubectlAccessForCI createExampleNamespace } -function setupKubectlAccessForCI() { - if [[ "${BUILDKITE_JOB_ID}" != "0" ]]; then - # The script is being run from buildkite, so we need to do stuff - # https://github.com/kubernetes-sigs/kind/issues/1846#issuecomment-691565834 - # Since kind is running in a sibling container, communicating with it through kubectl is not trivial. - # To accomplish we need to add the current docker container in the same network as the kind container - # and change the kubectl configuration to use the port listed in the internal endpoint instead of the one - # that is exported to the localhost by kind. - local docker_container_name - docker_container_name="$(hostname -s)" - docker network connect kind "${docker_container_name}" - kind get kubeconfig --internal --name "kind-${BUILDKITE_JOB_ID}" > "${HOME}/.kube/config" - fi -} - # shellcheck disable=SC2120 # function has an optional argument function setupPortForwarding() { local with_vtadmin="${1:-}" # Pass `with_vtadmin` to also enable port forwarding to VTAdmin @@ -590,14 +582,14 @@ function setupPortForwarding() { function teardownKindCluster() { echo "Deleting the Kind cluster. This also deletes the volume associated with it." 
- kind delete cluster --name "kind-${BUILDKITE_JOB_ID}" + kind delete cluster --name "kind-${CI_JOB_ID}" } function createKindCluster() { echo "Creating Kind cluster" - kind create cluster --wait 30s --name "kind-${BUILDKITE_JOB_ID}" --image "${KIND_VERSION}" + kind create cluster --wait 30s --name "kind-${CI_JOB_ID}" --image "${KIND_VERSION}" echo "Loading docker image into Kind cluster" - kind load docker-image vitess-operator-pr:latest --name "kind-${BUILDKITE_JOB_ID}" + kind load docker-image vitess-operator-pr:latest --name "kind-${CI_JOB_ID}" } function createExampleNamespace() {