.github/workflows/rolling-update.yml (new file, 199 additions)
name: Rolling update

# Manually-triggered production rollout. Joins the Tailnet, SSHes over
# MagicDNS into each node, and invokes scripts/rolling-update.sh.
# See docs/design/2026_04_24_proposed_deploy_via_tailscale.md.

on:
  workflow_dispatch:
    inputs:
      ref:
        description: Git ref (tag or sha) to deploy. Also used as the image tag unless image_tag is set.
        required: true
        type: string
      image_tag:
        description: Override the image tag (default = ref). Used for rollbacks.
        required: false
        type: string
        default: ""
      nodes:
        description: Comma-separated raft IDs to roll (e.g. "n1,n2"). Empty = all nodes in NODES_RAFT_MAP.
        required: false
        type: string
        default: ""
      dry_run:
        description: Render the plan and run a reachability check only; do NOT touch containers.
        required: true
        type: boolean
        default: true

permissions:
  contents: read
  id-token: write # required by tailscale/github-action OIDC flow
Comment on lines +30 to +32

P1: Add packages:read for GHCR manifest check

This workflow narrows GITHUB_TOKEN to `contents` and `id-token`, which implicitly removes the packages scope, but the "Verify image exists on ghcr.io" step authenticates to GHCR and inspects a manifest with that token. In environments where the image is private (or package auth is otherwise required), `docker login` / `docker manifest inspect` will fail with authorization errors before rollout begins, so deploys are blocked even when the image exists.
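A minimal sketch of the permissions block this comment suggests, assuming the GITHUB_TOKEN is acceptable for GHCR reads in this repository:

```yaml
permissions:
  contents: read
  id-token: write  # required by tailscale/github-action OIDC flow
  packages: read   # lets the GITHUB_TOKEN-based docker login read the GHCR manifest
```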


concurrency:
  group: rolling-update
  cancel-in-progress: false

jobs:
  deploy:
    runs-on: ubuntu-latest
    # Approval gate — see GitHub environment settings for required reviewers.
    # Dry-runs also use this environment so the secret wiring is identical;
    # the environment's approval rule should be configured to auto-approve
    # dry-runs if that distinction is desired (GitHub UI: "Deployment
    # protection rules").
    environment: production
    timeout-minutes: 60

    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          ref: ${{ inputs.ref }}
Comment on lines +85 to +87

P1: Restrict deploy script checkout to trusted refs

The workflow checks out ${{ inputs.ref }} and later executes ./scripts/rolling-update.sh after loading production credentials (SSH key, Tailscale OAuth secret) and joining the tailnet, so `ref` is effectively arbitrary code execution, not just a deployment selector. If someone can dispatch runs and supply a branch/tag they control, a modified script in that ref can exfiltrate secrets or perform unintended actions once the environment is approved. Keep execution on a trusted protected revision (or strictly validate allowed refs) and use the input only for image selection.
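One hedged way to address this: run the deploy scripts from a protected revision and keep `inputs.ref` purely as an image selector. A sketch (the branch name `main` is an assumption, not the committed workflow):

```yaml
      - name: Checkout deploy scripts (trusted)
        uses: actions/checkout@v6
        with:
          # always execute scripts/rolling-update.sh from the protected default branch
          ref: main
      # inputs.ref / inputs.image_tag are then used only to select the container image
```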


      - name: Install jq
        run: sudo apt-get install -y --no-install-recommends jq

      - name: Verify image exists on ghcr.io
        env:
          IMAGE_BASE: ${{ vars.IMAGE_BASE }}
          IMAGE_TAG: ${{ inputs.image_tag || inputs.ref }}
          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          if [[ -z "$IMAGE_BASE" ]]; then
            echo "::error::IMAGE_BASE repository variable is not set"
            exit 1
          fi
          echo "Checking $IMAGE_BASE:$IMAGE_TAG"
          echo "$GHCR_TOKEN" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin >/dev/null
          if ! docker manifest inspect "$IMAGE_BASE:$IMAGE_TAG" >/dev/null; then
            echo "::error::image $IMAGE_BASE:$IMAGE_TAG not found on ghcr.io"
            exit 1
          fi

      - name: Join Tailnet (ephemeral)
        uses: tailscale/github-action@v3
        with:
          oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
          oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
          tags: tag:ci-deploy

      - name: Configure SSH
        env:
          SSH_KEY: ${{ secrets.DEPLOY_SSH_PRIVATE_KEY }}
          KNOWN_HOSTS: ${{ secrets.DEPLOY_KNOWN_HOSTS }}
        run: |
          set -euo pipefail
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh
          printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          printf '%s\n' "$KNOWN_HOSTS" > ~/.ssh/known_hosts
          chmod 644 ~/.ssh/known_hosts
          # Sanity: no stray CRLF in the key, no empty file.
          test -s ~/.ssh/id_ed25519 || { echo "::error::DEPLOY_SSH_PRIVATE_KEY is empty"; exit 1; }
          ssh-keygen -lf ~/.ssh/id_ed25519 >/dev/null

      - name: Render NODES and SSH_TARGETS
        id: render
        env:
          NODES_RAFT_MAP: ${{ vars.NODES_RAFT_MAP }}
          SSH_TARGETS_MAP: ${{ vars.SSH_TARGETS_MAP }}
          NODES_FILTER: ${{ inputs.nodes }}
        run: |
          set -euo pipefail
          if [[ -z "$NODES_RAFT_MAP" || -z "$SSH_TARGETS_MAP" ]]; then
            echo "::error::NODES_RAFT_MAP or SSH_TARGETS_MAP is not set in the production environment variables"
            exit 1
          fi
          if [[ -n "$NODES_FILTER" ]]; then
            # Filter NODES_RAFT_MAP and SSH_TARGETS_MAP to the requested subset.
            filter_csv() {
              local all="$1"
              local filter="$2"
              local out=""
              IFS=',' read -r -a entries <<< "$all"
              IFS=',' read -r -a wanted <<< "$filter"
              for e in "${entries[@]}"; do
                key="${e%%=*}"
                for w in "${wanted[@]}"; do
                  if [[ "$key" == "$w" ]]; then
                    out+="${e},"
                    break
                  fi
                done
              done
              echo "${out%,}"
            }
            NODES_RAFT_MAP="$(filter_csv "$NODES_RAFT_MAP" "$NODES_FILTER")"
            SSH_TARGETS_MAP="$(filter_csv "$SSH_TARGETS_MAP" "$NODES_FILTER")"
            if [[ -z "$NODES_RAFT_MAP" ]]; then
              echo "::error::nodes filter '$NODES_FILTER' matches nothing in NODES_RAFT_MAP"
              exit 1
Comment on lines +189 to +191

P2: Reject partially invalid node filters

The filter logic silently drops unknown raft IDs and only errors when no IDs match, so an input like `nodes: n1,n9` proceeds as a rollout of n1 only. That can leave operators believing multiple nodes were upgraded when one was skipped, which is risky for staged deploys/rollbacks and can leave the cluster in an unintended mixed-version state. Treat `inputs.nodes` as an exact set and fail if any requested ID is missing from the configured map(s).

Comment on lines +189 to +191

P1: Fail when requested node IDs are only partially matched

The nodes filter currently errors only when zero IDs match, so a typo like `n1,n9` (or a whitespace variant, `n1, n2`) silently drops unmatched IDs and proceeds with a partial rollout. That means operators can believe they updated a specific subset while one or more intended nodes were skipped, which is risky during security or incident-driven deploys. Please validate that every requested ID is present in NODES_RAFT_MAP and fail if any are missing.

            fi
          fi
          {
            echo "NODES=$NODES_RAFT_MAP"
            echo "SSH_TARGETS=$SSH_TARGETS_MAP"
          } >> "$GITHUB_OUTPUT"
          echo "::group::Deploy plan"
          echo "NODES=$NODES_RAFT_MAP"
          echo "SSH_TARGETS=$SSH_TARGETS_MAP"
          echo "::endgroup::"

      - name: Tailscale reachability check
        env:
          SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }}
        run: |
          set -euo pipefail
          IFS=',' read -r -a entries <<< "$SSH_TARGETS"
          failed=0
          for e in "${entries[@]}"; do
Comment on lines +208 to +210

P2: Check reachability for all rollout nodes, not only SSH map entries

The reachability step iterates only over SSH_TARGETS, but rolling-update.sh resolves missing SSH mappings by falling back to each node host from NODES (ssh_target_by_id). If SSH_TARGETS_MAP is incomplete, dry-run can report success without probing some actual rollout targets, and the job can then fail mid-roll when it reaches an unvalidated node. Preflight should derive targets from NODES + SSH_TARGETS using the same fallback semantics, or enforce one-to-one mapping coverage first.

            host="${e##*=}"
            host="${host%%:*}"
            # strip user@ if present
            host="${host##*@}"
            if tailscale ping --c 2 --timeout 3s "$host" >/dev/null 2>&1; then
              echo " ok $host"
            else
              echo "::error::$host not reachable over tailnet"
              failed=1
            fi
          done
          if [[ "$failed" -ne 0 ]]; then
            exit 1
          fi
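Tying into the coverage comment above: a hedged sketch that derives probe targets from NODES with SSH_TARGETS as an override, mirroring the fallback the comment describes (this assumes the step also receives NODES from steps.render.outputs; the script's real resolution lives in scripts/rolling-update.sh and may differ):

```bash
# Prefer the SSH mapping; fall back to the raft host when a node has no entry.
declare -A ssh_by_id
IFS=',' read -r -a ssh_entries <<< "$SSH_TARGETS"
for e in "${ssh_entries[@]}"; do ssh_by_id["${e%%=*}"]="${e#*=}"; done

IFS=',' read -r -a node_entries <<< "$NODES"
for e in "${node_entries[@]}"; do
  id="${e%%=*}"
  host="${ssh_by_id[$id]:-${e#*=}}"   # fallback: raft host when no SSH mapping exists
  host="${host%%:*}"; host="${host##*@}"
  tailscale ping --c 2 --timeout 3s "$host" >/dev/null 2>&1 || echo "::error::$host unreachable"
done
```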

      - name: Dry-run summary
        if: ${{ inputs.dry_run }}
        env:
          NODES: ${{ steps.render.outputs.NODES }}
          SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }}
          IMAGE_BASE: ${{ vars.IMAGE_BASE }}
          IMAGE_TAG: ${{ inputs.image_tag || inputs.ref }}
          SSH_USER: ${{ vars.SSH_USER }}
        run: |
          set -euo pipefail
          cat <<EOF
          ==== DRY RUN — no containers were touched ====
          image: ${IMAGE_BASE}:${IMAGE_TAG}
          SSH user: ${SSH_USER}
          NODES: ${NODES}
          SSH_TARGETS: ${SSH_TARGETS}
          ref: ${{ inputs.ref }}
          Re-run with dry_run=false to apply.
          EOF

      - name: Roll cluster
        if: ${{ !inputs.dry_run }}
        env:
          NODES: ${{ steps.render.outputs.NODES }}
          SSH_TARGETS: ${{ steps.render.outputs.SSH_TARGETS }}
          SSH_USER: ${{ vars.SSH_USER }}
          IMAGE: ${{ vars.IMAGE_BASE }}:${{ inputs.image_tag || inputs.ref }}
          SSH_STRICT_HOST_KEY_CHECKING: "yes"
        run: |
          set -euo pipefail
          ./scripts/rolling-update.sh
docs/deploy_via_tailscale_runbook.md (new file, 153 additions)
# Deploy via Tailscale + GitHub Actions — Runbook

Companion doc to `docs/design/2026_04_24_proposed_deploy_via_tailscale.md`. This
runbook is for operators: what to configure on GitHub and Tailscale so the
`rolling-update` workflow can execute a production deploy.

## 1. Precondition: Tailscale on every node

Each cluster node must have `tailscale` installed, logged into the tailnet, and
tagged so the CI runner's ACL can reach it.

```
# on each kv0X node
sudo tailscale up \
  --ssh=false \
  --advertise-tags=tag:elastickv-node \
  --accept-routes=false
```

Contributor comment on lines +14 to +17 (medium):

The instructions specify `--ssh=false`, which disables Tailscale SSH. This approach requires the standard system SSH daemon (sshd) to be running and correctly configured to accept connections on the Tailscale interface. It would be safer to explicitly mention this requirement to prevent operators from being locked out of the deployment flow if they rely solely on Tailscale SSH for access.

Verify the node is reachable by MagicDNS from another tailnet peer:

```
tailscale status | grep kv0X
ping kv0X.<tailnet>.ts.net
```
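As the review comment above notes, with `--ssh=false` the deploy path depends on the system sshd. A hedged spot-check from another tailnet peer (user and hostname are placeholders):

```
# confirm sshd answers on the node's tailnet address
nc -vz kv0X.<tailnet>.ts.net 22
ssh -o BatchMode=yes <deploy-user>@kv0X.<tailnet>.ts.net true && echo "ssh ok"
```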

## 2. Tailscale ACL

In the Tailscale admin console, add the deploy rule to the tailnet ACL:

```jsonc
"tagOwners": {
"tag:ci-deploy": ["autogroup:admin"],
"tag:elastickv-node": ["autogroup:admin"],
},
"acls": [
{
"action": "accept",
"src": ["tag:ci-deploy"],
"dst": ["tag:elastickv-node:22"],
},
],
```

`tag:ci-deploy` must NOT have access to any other port on the tailnet. The
deploy workflow only needs SSH.
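Tailscale's policy file also supports ACL tests; a hedged sketch that would catch accidental widening of this rule (the denied port is only an example of something that must stay closed):

```jsonc
"tests": [
  {
    "src": "tag:ci-deploy",
    "accept": ["tag:elastickv-node:22"],
    // 6379 stands in for "any non-SSH port"; adjust to your policy
    "deny": ["tag:elastickv-node:6379"],
  },
],
```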

## 3. Tailscale OAuth client

Admin console → Settings → OAuth clients → New client:

- Description: `elastickv GitHub Actions deploy`
- Scopes: `auth_keys` (write). Recent `tailscale/github-action` versions
  may additionally require `devices:core` (write); enable that if the
  join step fails with an authorization error. The action's README is
  the definitive source for current scope requirements.

Contributor comment (medium):

The scope `devices:core` is not a standard Tailscale OAuth scope. For the tailscale/github-action to successfully join the tailnet and manage ephemeral nodes, it typically requires the `devices:write` scope (along with `auth_keys:write`). Using an incorrect scope name will cause the authentication step to fail.

Suggested change:
- may additionally require `devices:core` (write); enable that if the
+ may additionally require devices (write); enable that if the

Contributor comment (medium):

When using an OAuth client with the tailscale/github-action, the devices scope with write access is generally required to allow the action to register the ephemeral runner as a node. I recommend updating the instructions to include this scope as a requirement to avoid authorization errors during the tailnet join step.

Suggested change:
- Scopes: auth_keys (write) and devices (write). The devices scope (or devices:core in newer API versions) is required for the action to register the ephemeral runner as a node in your tailnet.

- Tags: `tag:ci-deploy`

Copy the client ID and secret; they go into GitHub in the next step.

## 4. GitHub environment: `production`

Repo → Settings → Environments → New environment: `production`.

### Required reviewers
Configure "Required reviewers" on the environment. Non-dry-run deploys will
pause until one of the reviewers approves. Configure "Deployment protection
rules" to auto-approve if the workflow input `dry_run == true` (optional; cuts
friction for previews).
Contributor comment (medium):

GitHub's native environment protection rules do not support conditional auto-approval based on workflow inputs. Implementing this would require a custom GitHub Action for Deployment Protection Rules. It would be more accurate to state that dry-runs will still require manual approval if they target the production environment, or suggest using a separate, unprotected environment for previews.


### Environment secrets

| Name | Value |
|------|-------|
| `TS_OAUTH_CLIENT_ID` | Tailscale OAuth client ID from step 3 |
| `TS_OAUTH_SECRET` | Tailscale OAuth secret from step 3 |
| `DEPLOY_SSH_PRIVATE_KEY` | OpenSSH private key, authorized on every node under the deploy user |
| `DEPLOY_KNOWN_HOSTS` | `ssh-keyscan kv01.<tailnet>.ts.net kv02.<tailnet>.ts.net …` output (one host per line) |
Contributor comment (security, medium):

Using ssh-keyscan without the -H flag stores hostnames in plain text in the known_hosts file. For better security hygiene, it is recommended to use the -H flag to hash hostnames, which helps prevent host discovery if the secrets or the runner environment are compromised.


The SSH key should be ed25519, dedicated to CI (not a reused developer key).
Regenerate on operator rotation.
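A hedged sketch of producing both secrets, folding in the -H recommendation above (hostnames are placeholders):

```
# dedicated CI deploy key, no passphrase
ssh-keygen -t ed25519 -N "" -C "elastickv-ci-deploy" -f ./deploy_key
# paste deploy_key into DEPLOY_SSH_PRIVATE_KEY; add deploy_key.pub to the deploy
# user's authorized_keys on every node

# hashed known_hosts output for DEPLOY_KNOWN_HOSTS
ssh-keyscan -H kv01.<tailnet>.ts.net kv02.<tailnet>.ts.net kv03.<tailnet>.ts.net
```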

### Environment variables

| Name | Value | Example |
|------|-------|---------|
| `IMAGE_BASE` | Container image path (no tag) | `ghcr.io/bootjp/elastickv` |
| `SSH_USER` | SSH login on every node | `bootjp` |
| `NODES_RAFT_MAP` | Comma-separated `raftId=host` (no port — the script appends `RAFT_PORT`) | `n1=kv01,n2=kv02,n3=kv03,n4=kv04,n5=kv05` |
| `SSH_TARGETS_MAP` | Comma-separated `raftId=ssh-host` | `n1=kv01.<tailnet>.ts.net,n2=kv02.<tailnet>.ts.net,...` |
Contributor comment (medium):

The environment variable names NODES_RAFT_MAP and SSH_TARGETS_MAP differ from the names expected by the scripts/rolling-update.sh script (NODES and SSH_TARGETS). While the workflow likely performs a mapping, this discrepancy can lead to confusion if an operator attempts to run the script manually using the variables defined in this runbook. Aligning the names or explicitly noting the mapping would improve maintainability and reduce friction during manual troubleshooting.
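For reference, the workflow's "Roll cluster" step performs exactly that mapping; a hedged sketch of the equivalent manual invocation from a tailnet-joined operator machine (values are examples):

```
NODES="$NODES_RAFT_MAP" \
SSH_TARGETS="$SSH_TARGETS_MAP" \
SSH_USER=bootjp \
IMAGE=ghcr.io/bootjp/elastickv:v1.2.3 \
SSH_STRICT_HOST_KEY_CHECKING=yes \
./scripts/rolling-update.sh
```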


## 5. Running a deploy

Actions tab → "Rolling update" → Run workflow.

Inputs:

- `ref` — the git tag or sha to deploy (also used as the container image tag)
- `image_tag` — override only for rollbacks, when the image to run differs from
  the ref being checked out (e.g., roll back to the previous known-good image
  while keeping `ref` on the current release)
- `nodes` — subset of raft IDs, e.g., `n1,n2`. Empty rolls all nodes.
- `dry_run` — default `true`. Renders the plan and checks reachability without
touching containers.

Recommended first-run sequence:

1. `dry_run: true`, `nodes: n1`, `ref: <target>` — confirms tailnet join,
SSH config, image availability, target mapping. No production impact.
2. `dry_run: false`, `nodes: n1` — roll a single node, verify the cluster
stays healthy and the image is correct.
3. `dry_run: false`, `nodes:` (empty) — full roll.
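The Actions tab is the canonical entry point; if you prefer the CLI, the same dispatch works through gh (a sketch of step 1 above, assuming gh is authenticated against this repository):

```
gh workflow run rolling-update.yml \
  -f ref=v1.2.3 \
  -f nodes=n1 \
  -f dry_run=true
```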

## 6. Rollback

Re-run the workflow with `image_tag` set to the previous-known-good sha. The
`nodes` input can target specific nodes if only some carry the bad image.
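A hedged example of a targeted rollback dispatch (the shas are placeholders):

```
gh workflow run rolling-update.yml \
  -f ref=<current-release-sha> \
  -f image_tag=<previous-known-good-sha> \
  -f nodes=n3 \
  -f dry_run=false
```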

## 7. What the workflow does NOT do (yet)

- **No post-deploy health verification beyond tailnet reachability.** The
script itself blocks on `raftadmin` leadership transfer and health-gate
timeouts, but the workflow does not independently probe Prometheus or
Redis after the roll. Add this when we have a canonical post-deploy
assertion suite.
- **No auto-rollback on failure.** If the script exits non-zero mid-roll,
the cluster is left in whatever state the script reached. The operator
must inspect and either re-roll or roll back manually.
- **No Jepsen gate.** The deploy does not require a green Jepsen run on
`ref` before proceeding.
- **No image-signature check.** `cosign verify` on the image is a follow-up.
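Until a canonical post-deploy assertion suite exists, a hedged manual spot-check after a roll might look like this (the Redis-protocol port and the expectation that every node answers PING are assumptions about the cluster's setup):

```
# expect PONG from every node and the new image in `docker ps` on each host
for h in kv01 kv02 kv03 kv04 kv05; do
  redis-cli -h "$h.<tailnet>.ts.net" -p 6379 ping
  ssh "$SSH_USER@$h.<tailnet>.ts.net" "docker ps --format '{{.Image}}' | grep elastickv"
done
```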

## 8. Troubleshooting

### Job pauses indefinitely at "Waiting for approval"
Expected for non-dry-run deploys — a reviewer from the `production` environment
must click Approve. Check the "Required reviewers" list in the environment
settings.

P2: Fix approval troubleshooting text for dry-run workflows

This troubleshooting guidance is incorrect for the workflow as implemented: .github/workflows/rolling-update.yml sets `environment: production` unconditionally, so dry-run and non-dry-run executions both pause for environment approval in v1. Telling operators this is only expected for non-dry-runs can send them down the wrong debugging path when a dry-run is legitimately waiting for approval.

### `tailscale ping` fails for a node
The node may not be running `tailscaled`, not tagged `tag:elastickv-node`, or
the tailnet ACL may have drifted. `tailscale status` on the node should show
the tag; the admin console should show the IP in the `tag:elastickv-node`
group.
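A hedged check to run on the affected node (the JSON field names are those current tailscale releases emit and may differ on older versions):

```
# on the node
sudo systemctl is-active tailscaled
tailscale status --json | jq '.Self.Online, .Self.Tags'
```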

### `image ... not found on ghcr.io`
The verification step hit the ghcr manifest API and got a 404. Either the
image tag was not pushed (check the `Docker Image CI` workflow for `ref`) or
the tag is a moving tag (`latest`) that the verification step can't
distinguish from stale. Specify an immutable tag.
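To reproduce the check locally, a sketch using a personal access token with read:packages:

```
echo "$GITHUB_PAT" | docker login ghcr.io -u <github-username> --password-stdin
docker manifest inspect ghcr.io/bootjp/elastickv:<tag>
```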

### SSH `Host key verification failed`
`DEPLOY_KNOWN_HOSTS` is stale. Re-run `ssh-keyscan` against every node and
update the secret.
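A hedged one-liner to refresh it (assumes gh is authenticated and the secret lives in the production environment):

```
ssh-keyscan -H kv01.<tailnet>.ts.net kv02.<tailnet>.ts.net kv03.<tailnet>.ts.net \
  | gh secret set DEPLOY_KNOWN_HOSTS --env production --repo bootjp/elastickv
```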