diff --git a/.github/workflows/auto-tag.yaml b/.github/workflows/auto-tag.yaml index 246e3bc..07a1b6e 100644 --- a/.github/workflows/auto-tag.yaml +++ b/.github/workflows/auto-tag.yaml @@ -3,6 +3,10 @@ name: Auto-tag patch release on: push: branches: [main] + # Trigger on changes that affect the built lantern-box binary only. + # deploy/packer/** intentionally NOT in this list: under Reflog's + # Option B, packer changes don't produce a new .deb, so they don't + # need a version bump. build-images.yaml self-triggers on those. paths: - '**.go' - 'go.mod' @@ -10,7 +14,6 @@ on: - 'Dockerfile' - 'Dockerfile.goreleaser' - 'cmd/release/*.service' - - 'deploy/packer/**' permissions: contents: write diff --git a/.github/workflows/build-images.yaml b/.github/workflows/build-images.yaml index 37e9eae..a96ae10 100644 --- a/.github/workflows/build-images.yaml +++ b/.github/workflows/build-images.yaml @@ -1,16 +1,23 @@ name: Build Packer Images on: - # Run after a release is published - release: - types: [published] + # Packer image contents changed — rebuild. Under Reflog's Option B the + # lantern-box binary no longer lives in the image, so a plain Go release + # doesn't require an image rebuild. This trigger fires only when + # something in the image itself actually changed (provision.sh, + # cloud-init scripts, packer HCL, otel config, the workflow itself). + push: + branches: [main] + paths: + - 'deploy/packer/**' + - '.github/workflows/build-images.yaml' - # Allow manual trigger for testing + # Allow manual trigger for ad-hoc rebuilds (base-image updates, etc.) workflow_dispatch: inputs: version: - description: "lantern-box version to bake into the image (e.g. 1.2.3)" - required: true + description: "Label to apply to the built image (e.g. 1.2.3 — used only for image naming; the lantern-box binary comes from cloud-init at boot time)" + required: false type: string builders: description: "Comma-separated builders (linode,oracle-oci,alicloud-ecs)" @@ -31,33 +38,60 @@ jobs: version: ${{ steps.version.outputs.version }} matrix: ${{ steps.matrix.outputs.matrix }} steps: + - name: Checkout (full history — needed to find the latest tag) + uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Determine version id: version env: EVENT_NAME: ${{ github.event_name }} - INPUT_VERSION: ${{ inputs.version }} - REF_NAME: ${{ github.ref_name }} + # github.event.inputs.version resolves on every event type + # (including push, where it's simply empty). Plain + # `inputs.version` is only defined for workflow_dispatch / + # workflow_call — using it on a push trigger risks a stricter + # evaluator rejecting the expression before the shell script + # even runs. + INPUT_VERSION: ${{ github.event.inputs.version }} run: | - if [ "$EVENT_NAME" = "release" ]; then - echo "version=${REF_NAME#v}" >> "$GITHUB_OUTPUT" - else - if [[ "$INPUT_VERSION" == v* ]]; then - echo "::error::Version should not start with 'v' (got '$INPUT_VERSION'). Use e.g. 1.2.3" - exit 1 - fi - echo "version=$INPUT_VERSION" >> "$GITHUB_OUTPUT" - fi + case "$EVENT_NAME" in + workflow_dispatch) + if [ -z "$INPUT_VERSION" ]; then + # Operator didn't specify — fall through to "latest tag" path. + : + elif [[ "$INPUT_VERSION" == v* ]]; then + echo "::error::Version should not start with 'v' (got '$INPUT_VERSION'). Use e.g. 1.2.3" + exit 1 + else + echo "version=$INPUT_VERSION" >> "$GITHUB_OUTPUT" + exit 0 + fi + ;; + esac + # push or workflow_dispatch without version: label the image with + # the latest tag on main. The image itself is version-agnostic + # under Option B — this is purely for image-name bookkeeping and + # the per-provider latestImage() helpers that filter on prefix. + latest=$(git tag -l 'v*' --sort=-v:refname | head -1) + latest="${latest:-v0.0.0}" + echo "version=${latest#v}" >> "$GITHUB_OUTPUT" - name: Build matrix id: matrix env: EVENT_NAME: ${{ github.event_name }} - INPUT_BUILDERS: ${{ inputs.builders }} + # See note on INPUT_VERSION above — same reasoning for using + # github.event.inputs rather than bare inputs so push events + # don't choke on an undefined context. + INPUT_BUILDERS: ${{ github.event.inputs.builders }} run: | - if [ "$EVENT_NAME" = "release" ]; then + if [ "$EVENT_NAME" = "push" ]; then builders="linode,oracle-oci,alicloud-ecs" - else + elif [ "$EVENT_NAME" = "workflow_dispatch" ] && [ -n "$INPUT_BUILDERS" ]; then builders="$INPUT_BUILDERS" + else + builders="linode,oracle-oci,alicloud-ecs" fi # Validate that we have at least one builder if [ -z "$builders" ]; then diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9c4f79e..e0edf58 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -44,10 +44,9 @@ jobs: DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} FURY_TOKEN: ${{ secrets.FURY_TOKEN }} - - name: Trigger Packer image builds - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Strip 'v' prefix for the version input - version="${GITHUB_REF_NAME#v}" - gh workflow run build-images.yaml -f "version=$version" + # Under Reflog's Option B the packer image no longer bakes in a + # specific lantern-box version — cloud-init installs the target + # tag on first boot, and the central-orchestration hot-swap + # worker pulls existing boxes forward. So a Go release no longer + # needs a packer image rebuild. build-images.yaml self-triggers + # on deploy/packer/** changes and on manual workflow_dispatch. diff --git a/deploy/packer/README.md b/deploy/packer/README.md index 217000a..b43ba96 100644 --- a/deploy/packer/README.md +++ b/deploy/packer/README.md @@ -1,13 +1,34 @@ # Packer Images for lantern-box -Pre-baked VM images with lantern-box installed. Boot-to-proxy-ready in ~35-60 seconds (vs 2-4 minutes with a stock image). +Pre-baked VM images with runtime dependencies, systemd drop-ins, and +sidecars (otelcol-contrib, Tailscale) already present. The +`lantern-box` binary itself is **not** baked in — cloud-init apt-installs +it on first boot (Reflog's Option B; see the "Not in the image" section +below). Boot-to-proxy-ready is still fast because the heavy work +(package install, otel config, CA cert, user setup) is done at image +build time; only the small apt-install step runs on first boot. ## What's in the image - Ubuntu 24.04 LTS - Runtime deps: ca-certificates, tzdata, nftables, wireguard-tools -- lantern-box binary (from Gemfury .deb) -- systemd service (installed but not enabled — cloud-init starts it) +- otelcol-contrib + systemd drop-in for host metrics +- systemd drop-ins for lantern-box env (OTel, etc.) +- `/etc/lantern-box/` and `/var/lib/lantern-box/` directories + +**Not in the image (Reflog's Option B):** the `lantern-box` binary itself. +Under the central-orchestration design in +`lantern-cloud/docs/design/central-vps-updates.md`, cloud-init +apt-installs the target release tag on first boot — decoupling release +cadence (frequent) from base-image cadence (rare). The packer image is +now version-agnostic; only base-image changes (Ubuntu patches, systemd +drop-in updates, new sidecars) need a rebuild. + +**Operators:** before rolling out a new image built from this code, +ensure `bandit_vps_default_release_tag` (or a per-track override) is set +in the lantern-cloud settings. Otherwise new VMs boot without lantern-box +installed and the provision worker's `systemctl enable --now lantern-box` +call will fail. ## Prerequisites diff --git a/deploy/packer/provision.sh b/deploy/packer/provision.sh index 8753dda..fec855d 100755 --- a/deploy/packer/provision.sh +++ b/deploy/packer/provision.sh @@ -1,7 +1,13 @@ #!/usr/bin/env bash set -euo pipefail -# VERSION is passed as an environment variable by Packer. +# VERSION is passed as an environment variable by Packer. Under Reflog's +# Option B the packer image no longer installs a specific lantern-box +# release — cloud-init installs the target tag on first boot. VERSION is +# still required and used purely as a label for the built image (see +# lantern-box.pkr.hcl's `image_name = "lantern-box-${var.lantern_box_version}-..."`). +# It stays set so tooling that slices by image label (e.g. the per-provider +# latestImage() helpers in lantern-cloud/cmd/api/vps/*.go) keeps working. : "${VERSION:?VERSION must be set}" export DEBIAN_FRONTEND=noninteractive @@ -60,28 +66,33 @@ apt-get "${APT_OPTS[@]}" update -q apt-get "${APT_OPTS[@]}" install -y -q \ ca-certificates \ tzdata \ - nftables - -echo "==> Downloading lantern-box .deb from GitHub release" -arch=$(dpkg --print-architecture) # amd64 or arm64 -deb_name="lantern-box_${VERSION}_linux_${arch}.deb" -deb_url="https://github.com/getlantern/lantern-box/releases/download/v${VERSION}/${deb_name}" -echo " URL: ${deb_url}" -curl -fsSL -o "/tmp/${deb_name}" "${deb_url}" - -echo "==> Installing ${deb_name}" -apt-get "${APT_OPTS[@]}" install -y -q "/tmp/${deb_name}" -rm -f "/tmp/${deb_name}" + nftables \ + wireguard-tools + +# Reflog's Option B (Slack thread ts=1776197690.140869 in +# #infrastructure-and-services, 2026-04-16): the packer image no longer +# bakes in a specific lantern-box version. Cloud-init apt-installs the +# release tag the orchestrator picked for this route — see +# `getlantern/lantern-cloud` cmd/api/vps/cloudinit_packer.go. This +# decouples release cadence (frequent) from base-image cadence (rare). +# +# The packer image contributes: runtime deps (installed above), systemd +# drop-ins for OTel env (below), and /etc/lantern-box and +# /var/lib/lantern-box dirs. The lantern-box .deb itself lands via +# cloud-init on first boot. +# +# Operators: BEFORE building + rolling out new images from this change, +# set bandit_vps_default_release_tag in the lantern-cloud settings table +# (or a per-track override in bandit_vps_image_targets). Without either, +# cloud-init will skip the apt-install step and new VMs will boot +# without a lantern-box binary — `systemctl enable --now lantern-box` +# during config push will then fail. Revert is: re-merge the pre-Option-B +# provision.sh. +arch=$(dpkg --print-architecture) # amd64 or arm64 — still used below echo "==> Setting up directories" mkdir -p /etc/lantern-box /var/lib/lantern-box -systemctl daemon-reload - -# Do NOT enable the service here — it would start on boot before cloud-init -# writes the config, causing a startup failure loop. Cloud-init should run: -# systemctl enable --now lantern-box - echo "==> Installing OTel Collector for host metrics" otelcol_version="0.120.0" otelcol_deb="otelcol-contrib_${otelcol_version}_linux_${arch}.deb" @@ -121,132 +132,13 @@ DROPIN systemctl daemon-reload # Do NOT enable — cloud-init writes env vars first, then enables the service. -echo "==> Setting up lantern-box auto-update (via GitHub Releases)" -mkdir -p /var/log/lantern-box -cat > /usr/local/bin/lantern-box-update <<'SCRIPT' -#!/bin/bash -set -uo pipefail -# NOTE: no -e — we handle errors explicitly so we can log them to SigNoz. - -LOG_FILE="/var/log/lantern-box/update.log" - -# Structured JSON log line for the OTel filelog receiver → SigNoz pipeline. -# Uses python3 json.dumps for safe escaping of all dynamic fields. -log_json() { - local level="$1" msg="$2" - local ts json_line - ts=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - json_line=$( - TS="$ts" \ - LEVEL="$level" \ - MSG="$msg" \ - CURRENT_VER="${current_ver:-unknown}" \ - LATEST_VER="${latest_ver:-unknown}" \ - python3 -c ' -import json, os -print(json.dumps({ - "timestamp": os.environ["TS"], - "severity": os.environ["LEVEL"], - "body": os.environ["MSG"], - "source": "lantern-box-update", - "current_ver": os.environ.get("CURRENT_VER", "unknown"), - "latest_ver": os.environ.get("LATEST_VER", "unknown"), -}, ensure_ascii=False)) -' - ) - printf '%s\n' "$json_line" >> "$LOG_FILE" -} - -# Prevent overlapping runs — cron may start a new instance while the -# previous one is still sleeping or installing. -exec 9>/var/lock/lantern-box-update.lock -if ! flock -n 9; then - exit 0 -fi - -# Derive a per-machine sleep (0-599s) from machine-id so instances stagger naturally -# within the 10-minute check interval. -sleep $(( $(cksum /etc/machine-id | cut -d' ' -f1) % 600 )) - -arch=$(dpkg --print-architecture) -current_ver=$(dpkg-query -W -f='${Version}' lantern-box 2>/dev/null || echo "none") - -# Fetch latest release tag by following the /releases/latest redirect and -# extracting the tag from the final URL. This is more robust than reading -# %{redirect_url} from a HEAD request, which can fail when curl is behind -# a transparent proxy or CDN that follows the 302 before curl sees it. -curl_err=$(mktemp) -final_url=$(curl -fsSL --retry 3 --max-time 30 -o /dev/null -w '%{url_effective}' \ - https://github.com/getlantern/lantern-box/releases/latest 2>"$curl_err") || true -curl_stderr=$(cat "$curl_err" 2>/dev/null) -rm -f "$curl_err" -latest_tag="${final_url##*/}" - -if [ -z "$latest_tag" ] || [ "$latest_tag" = "latest" ]; then - log_json "ERROR" "failed to fetch latest release tag (final_url=${final_url:-empty}, curl_err=${curl_stderr:-none})" - exit 1 -fi - -latest_ver="${latest_tag#v}" - -if [ "$current_ver" = "$latest_ver" ]; then - exit 0 -fi - -log_json "INFO" "update available: ${current_ver} -> ${latest_ver}" - -deb_name="lantern-box_${latest_ver}_linux_${arch}.deb" -deb_url="https://github.com/getlantern/lantern-box/releases/download/${latest_tag}/${deb_name}" - -tmpfile=$(mktemp /tmp/lantern-box-update-XXXXXX.deb) || tmpfile="" -if [ -z "$tmpfile" ]; then - log_json "ERROR" "failed to create temporary file for downloading ${deb_url}" - exit 1 -fi -trap 'rm -f "$tmpfile"' EXIT - -if ! curl -fsSL --retry 3 --max-time 120 -o "$tmpfile" "${deb_url}"; then - log_json "ERROR" "failed to download ${deb_url}" - exit 1 -fi - -if ! dpkg -o DPkg::Lock::Timeout=120 -i "$tmpfile"; then - apt-get -o DPkg::Lock::Timeout=120 update -qq && apt-get -o DPkg::Lock::Timeout=120 install -f -y -qq -fi - -new_ver=$(dpkg-query -W -f='${Version}' lantern-box 2>/dev/null || echo "none") -if [ "$new_ver" != "$latest_ver" ]; then - log_json "ERROR" "install failed: expected ${latest_ver} but got ${new_ver}" - exit 1 -fi - -log_json "INFO" "upgraded ${current_ver} -> ${new_ver}, restarting service" -if ! systemctl restart lantern-box; then - log_json "ERROR" "failed to restart lantern-box service after upgrade to ${new_ver}" - exit 1 -fi -SCRIPT -chmod 755 /usr/local/bin/lantern-box-update - -# Rotate the update log so it doesn't grow unbounded. -cat > /etc/logrotate.d/lantern-box <<'LOGROTATE' -/var/log/lantern-box/*.log { - daily - rotate 7 - compress - missingok - notifempty - copytruncate -} -LOGROTATE - -cat > /etc/cron.d/lantern-box-update <<'CRON' -SHELL=/bin/bash -PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -MAILTO="" -*/10 * * * * root /usr/local/bin/lantern-box-update 2>&1 | logger -t lantern-box-update -CRON -chmod 644 /etc/cron.d/lantern-box-update +# Auto-update is now centrally orchestrated from lantern-cloud — see +# docs/design/central-vps-updates.md. BanditVPSHotSwapWorker SSHes in +# and installs the target release tag; if SSH fails repeatedly, +# BanditVPSAutoreplaceWorker drains the route and a fresh VM provisions +# with the right tag via cloud-init. No more per-host cron, no more +# silent "install failed" errors with no way to identify affected +# hosts (we had 266/hour of those at peak). # Re-enable unattended-upgrades so the final image receives security updates. systemctl unmask unattended-upgrades.service 2>/dev/null || true @@ -295,10 +187,32 @@ ln -sf /etc/ssl/certs/lanternet.crt /usr/local/share/ca-certificates/lantern/lan update-ca-certificates echo " lanternet CA installed" -echo "==> Verifying installation" -if ! command -v lantern-box >/dev/null 2>&1; then - echo "lantern-box not found on PATH" >&2 +echo "==> Verifying image contents" +# Under Option B the lantern-box binary is NOT expected in the image — +# cloud-init apt-installs it on first boot. Check the things the packer +# image actually contributes instead: the systemd drop-ins, the data +# dirs, and the sidecars that ARE baked in here. +missing="" +for path in \ + /etc/systemd/system/lantern-box.service.d/otel.conf \ + /etc/systemd/system/otelcol-contrib.service.d/env.conf \ + /etc/lantern-box \ + /var/lib/lantern-box \ + /etc/otelcol-contrib/config.yaml \ + /etc/ssl/certs/lanternet.crt; do + [ -e "$path" ] || missing="$missing $path" +done +if [ -n "$missing" ]; then + echo "image verification failed; missing:$missing" >&2 + exit 1 +fi +if ! command -v tailscale >/dev/null 2>&1; then + echo "tailscale not found on PATH" >&2 + exit 1 +fi +if ! command -v otelcol-contrib >/dev/null 2>&1; then + echo "otelcol-contrib not found on PATH" >&2 exit 1 fi -echo " lantern-box installed at $(command -v lantern-box)" +echo " image contents verified" echo "==> Done. Image ready."