diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index de9d920c..1056f870 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -179,6 +179,27 @@ jobs: restore-keys: | ${{ env.CCACHE_DIR }}-${{ env.CCACHE_CACHE_NUMBER }}-${{ env.EMSCRIPTEN_VERSION }}-${{ runner.os }} + # ===== Wheel cache: compute fingerprints, restore, and apply mtime trick ===== + - name: Compute build fingerprints + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + run: | + python tools/compute_build_plan.py --output build-plan.json + + - name: Restore wheel cache + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + with: + path: .wheel-cache + key: wheel-cache-v1-${{ hashFiles('build-plan.json') }} + restore-keys: | + wheel-cache-v1- + + - name: Extract cached wheels + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + run: | + python tools/restore_build_cache.py --cache-dir .wheel-cache --build-plan build-plan.json + # ===== End wheel cache restore ===== + - name: Calculate recipes to build (pull_request) if: github.event_name == 'pull_request' id: calculate_recipes_pr @@ -248,6 +269,20 @@ jobs: pyodide build-recipes ${STEPS_CALCULATE_RECIPES_PR_OUTPUTS_RECIPES} --install --install-dir=./repodata --log-dir=build-logs | tee result/build_output.log ccache -s + # ===== Wheel cache: save built wheels ===== + - name: Bundle wheels for cache + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + run: | + python tools/save_build_cache.py --build-plan build-plan.json --cache-dir .wheel-cache + + - name: Save wheel cache + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' + uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + with: + path: .wheel-cache + key: wheel-cache-v1-${{ hashFiles('build-plan.json') }} + # ===== End wheel cache save ===== + - name: Parse build results id: parse_results env: diff --git a/docs/build-cache-strategy.md b/docs/build-cache-strategy.md new file mode 100644 index 00000000..163fc49b --- /dev/null +++ b/docs/build-cache-strategy.md @@ -0,0 +1,113 @@ +# Wheel Build Cache + +The wheel build cache avoids rebuilding packages on `main` branch CI when their inputs haven't changed. It sits in front of the existing `build-recipes "*"` command and requires no changes to `pyodide-build`. + +## How It Works + +``` +compute_build_plan.py → actions/cache/restore → build-recipes "*" → save_build_cache.py → actions/cache/save +``` + +1. `tools/compute_build_plan.py` computes a content fingerprint for every package and writes `build-plan.json`. +2. `actions/cache/restore` fetches `.wheel-cache/` from a previous run using `build-plan.json`'s hash as the cache key. +3. `tools/restore_build_cache.py` copies cached wheels whose fingerprints still match into `packages/{name}/dist/` and sets their mtime to year 2099. +4. `build-recipes "*" --install` runs unchanged. `needs_rebuild()` compares each wheel's mtime against `meta.yaml`'s mtime. Cached wheels (mtime 2099) are newer than any source file, so they are skipped. Uncached packages rebuild normally. +5. `tools/save_build_cache.py` collects all `dist/` directories into `.wheel-cache/` with a manifest, and `actions/cache/save` stores it. + +## Fingerprint Computation + +Each package's fingerprint captures every input that affects its build output: + +``` +fingerprint(P) = sha256(toolchain_hash, recipe_hash(P), sorted(fingerprint(D) for D in host_deps(P))) +``` + +**`toolchain_hash`** is global and shared across all packages: + +| Input | Source | +|-------|--------| +| pyodide-build submodule commit | `git -C pyodide-build rev-parse HEAD` | +| Emscripten version | `pyodide config get emscripten_version` | +| Cross-build env URL | `pyproject.toml` → `default_cross_build_env_url` | +| Rust toolchain version | `pyproject.toml` → `rust_toolchain` | +| Python version | `environment.yml` | +| Build constraints | `tools/constraints.txt` content | +| Native tool versions | `environment.yml` content | + +**`recipe_hash(P)`** is per-package: + +| Input | Source | +|-------|--------| +| Recipe definition | `packages/{name}/meta.yaml` file content | +| Patches | Each file listed in `source.patches`, sorted | +| Extra files | Each file listed in `source.extras`, sorted | +| Upstream source identity | `source.url` and `source.sha256` strings | +| In-tree source | All files under `source.path`, if set | + +**Host dependency propagation** — fingerprints are computed in topological order (leaves first). If numpy's fingerprint changes, scipy's automatically changes because scipy's hash includes `fingerprint(numpy)`. Only `requirements.host` dependencies propagate, not `requirements.run`, matching the existing `generate_needs_build_set()` semantics in `graph_builder.py`. + +Run dependencies, test files, ccache settings, and the runner image are excluded — they don't affect wheel binary output. + +## The mtime Trick + +`needs_rebuild()` in `pyodide-build` determines whether to rebuild a package by checking if the wheel in `dist/` is newer than `meta.yaml` and its patches. On a fresh CI checkout, `git` resets all source file timestamps to "now", so any restored wheel would appear older and trigger a rebuild. + +The restore script works around this by setting restored wheels' mtime to `4102444800` (2099-12-31). This is always newer than any source file timestamp, so `needs_rebuild()` returns `False` and the package is skipped. + +`--force-rebuild` is not used because it operates at the `build_from_graph()` level and rebuilds all resolved dependencies, not just the target package. + +## Cross-Build-Env Packages + +Packages with `cross-build-env: true` in `meta.yaml` (numpy, scipy, pycparser, cffi) are **always rebuilt**. These packages install files into the host Python site-packages (headers, static libraries, `.pxd` files) that downstream packages link against during compilation. Restoring only the wheel from cache would leave those host files missing and break dependents. + +The restore script identifies these packages via `build-plan.json` and skips them. Without a cached wheel in `dist/`, `needs_rebuild()` returns `True` and they rebuild naturally. + +## Cache Storage + +All wheels are stored in a single `.wheel-cache/` directory managed by `actions/cache`: + +``` +.wheel-cache/ +├── manifest.json # maps package names → fingerprints +├── numpy/ +│ └── numpy-2.2.5-cp313-cp313-pyodide_2025_0_wasm32.whl +├── requests/ +│ └── requests-2.32.4-py3-none-any.whl +├── .libs/ # shared/static library outputs +│ └── ... +└── ... +``` + +**Cache key**: `wheel-cache-v1-{hashFiles('build-plan.json')}`. Any fingerprint change produces a new key. `restore-keys: wheel-cache-v1-` enables prefix matching, so a partial hit restores the most recent cache. The restore script then validates each wheel individually — stale wheels are not extracted and those packages rebuild. + +**Sizing**: ~349 packages × ~2 MB average ≈ ~700 MB, well within the 10 GB per-repo limit. Upload/download takes ~30–60 seconds. + +**Limits**: Cache keys are immutable (no overwrites). Entries unused for 7 days are evicted. Upload rate is capped at 200/minute/repo; the bundled approach uses 1 save + 1 restore per run. + +**Branch scoping**: Caches from `main` are readable by PR branches. PR caches are isolated from other PRs and `main`. + +## Scripts + +| Script | Input | Output | +|--------|-------|--------| +| `tools/compute_build_plan.py` | `packages/*/meta.yaml`, `pyproject.toml`, `environment.yml`, `tools/constraints.txt` | `build-plan.json` | +| `tools/restore_build_cache.py` | `build-plan.json`, `.wheel-cache/` | Wheels in `packages/*/dist/` with mtime = 2099 | +| `tools/save_build_cache.py` | `build-plan.json`, `packages/*/dist/` | `.wheel-cache/` with `manifest.json` | + +## Edge Cases + +**`tag:always` packages** (hashlib, micropip, ssl, etc.) — the tag means "always include in the build set", not "always rebuild". They are fingerprinted and cached normally. + +**Static/shared libraries** (libopenblas, libproj, etc.) — output goes to `packages/.libs/` instead of or in addition to `dist/`. The cache includes `.libs/` contents. Rebuild detection uses a `.packaged` token file. + +**Partial build failures** — the save step runs after the build. Only packages with artifacts in `dist/` are cached. Failed packages have no cached entry and will be retried on the next run. + +**Stale cache entries** — when `restore-keys` matches an older cache, it may contain wheels with outdated fingerprints. The restore script compares each cached fingerprint against the current `build-plan.json` and only extracts matches. + +## Future Improvements + +**Cache cross-build-env packages** — instead of always rebuilding them, cache their wheels plus their `cross-build-files` and replay the host site-packages installation on restore. This would eliminate the ~25 minute floor on cached builds. + +**Per-package cache entries** — if the bundled approach causes too much churn, switch to per-package cache keys (`wheel-v1-{name}-{fingerprint[:16]}`) using the GitHub Actions internal cache API (`ACTIONS_CACHE_URL` + `ACTIONS_RUNTIME_TOKEN`). + +**Content-based `needs_rebuild()`** — add a recipe hash check to `pyodide-build`'s `needs_rebuild()` to make it mtime-immune, removing the need for the 2099 timestamp workaround. diff --git a/tools/compute_build_plan.py b/tools/compute_build_plan.py new file mode 100644 index 00000000..28cf7b23 --- /dev/null +++ b/tools/compute_build_plan.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python +"""Compute per-package build fingerprints for the wheel cache system. + +For each package, computes a content-addressable fingerprint based on: + - Global toolchain hash (pyodide-build commit, emscripten version, etc.) + - Per-package recipe hash (meta.yaml, patches, extras, source URL/sha256) + - Recursive host dependency fingerprints + +Outputs a build-plan.json file used by restore_build_cache.py and save_build_cache.py. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor +from graphlib import TopologicalSorter +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent.parent +PACKAGES_DIR = BASE_DIR / "packages" +PYODIDE_BUILD_DIR = BASE_DIR / "pyodide-build" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Compute build fingerprints for all packages" + ) + parser.add_argument( + "--output", + type=str, + default="build-plan.json", + help="Output path for build-plan.json", + ) + parser.add_argument( + "--packages-dir", + type=str, + default=str(PACKAGES_DIR), + help="Path to the packages directory", + ) + return parser.parse_args() + + +def load_yaml(path: Path) -> dict: + """Load a YAML file. Uses ruamel.yaml if available, falls back to PyYAML.""" + try: + from ruamel.yaml import YAML + + yaml = YAML(typ="safe") + return yaml.load(path) + except ImportError: + import yaml + + with open(path) as f: + return yaml.safe_load(f) + + +def compute_toolchain_hash() -> str: + """Compute a hash of all global toolchain inputs. + + Inputs: + - pyodide-build submodule commit SHA + - emscripten version (from pyodide config or pyodide-build) + - xbuildenv URL (from pyproject.toml) + - python version (from environment.yml) + - constraints.txt content + - environment.yml content + - rust_toolchain (from pyproject.toml) + """ + hasher = hashlib.sha256() + + # 1. pyodide-build submodule commit + try: + result = subprocess.run( + ["git", "-C", str(PYODIDE_BUILD_DIR), "rev-parse", "HEAD"], + capture_output=True, + text=True, + check=True, + ) + hasher.update(f"pyodide-build-commit:{result.stdout.strip()}".encode()) + except (subprocess.CalledProcessError, FileNotFoundError): + # Fallback: hash the pyodide-build directory content marker + hasher.update(b"pyodide-build-commit:unknown") + + # 2. Emscripten version - try `pyodide config get emscripten_version` first + emscripten_version = _get_emscripten_version() + hasher.update(f"emscripten:{emscripten_version}".encode()) + + # 3. xbuildenv URL and rust_toolchain from pyproject.toml + pyproject_path = BASE_DIR / "pyproject.toml" + if pyproject_path.exists(): + pyproject_content = pyproject_path.read_text() + # Extract relevant fields instead of hashing the whole file + # (pyproject.toml also has linter config that doesn't affect builds) + for line in pyproject_content.splitlines(): + line = line.strip() + if line.startswith("default_cross_build_env_url"): + hasher.update(f"xbuildenv:{line}".encode()) + elif line.startswith("rust_toolchain"): + hasher.update(f"rust:{line}".encode()) + + # 4. Python version from environment.yml + env_yml_path = BASE_DIR / "environment.yml" + if env_yml_path.exists(): + hasher.update(f"environment.yml:{env_yml_path.read_text()}".encode()) + + # 5. constraints.txt + constraints_path = BASE_DIR / "tools" / "constraints.txt" + if constraints_path.exists(): + hasher.update(f"constraints:{constraints_path.read_text()}".encode()) + + return hasher.hexdigest() + + +def _get_emscripten_version() -> str: + """Get emscripten version from pyodide config or fallback to parsing.""" + try: + result = subprocess.run( + ["pyodide", "config", "get", "emscripten_version"], + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + # Fallback: try to extract from pyodide-build config files + config_path = PYODIDE_BUILD_DIR / "pyodide_build" / "config.py" + if config_path.exists(): + content = config_path.read_text() + # Simple extraction — not robust but good enough as fallback + for line in content.splitlines(): + if "PYODIDE_EMSCRIPTEN_VERSION" in line and "=" in line: + return line.split("=")[-1].strip().strip('"').strip("'") + return "unknown" + + +def compute_recipe_hash(pkg_dir: Path, meta: dict) -> str: + """Compute the hash of a single package's recipe inputs. + + Inputs: + - meta.yaml file content + - patch file contents (sorted by name) + - extra file contents (sorted by name) + - source URL string + - source sha256 string + """ + hasher = hashlib.sha256() + + # 1. meta.yaml content (the full file, not the parsed dict) + meta_yaml_path = pkg_dir / "meta.yaml" + if meta_yaml_path.exists(): + hasher.update(meta_yaml_path.read_bytes()) + + source = meta.get("source", {}) + + # 2. Patch file contents (sorted) + patches = source.get("patches", []) + for patch_name in sorted(patches): + patch_path = pkg_dir / patch_name + if patch_path.exists(): + hasher.update(patch_path.read_bytes()) + else: + # Hash the name even if file is missing (indicates a problem) + hasher.update(f"missing-patch:{patch_name}".encode()) + + # 3. Extra file contents (sorted) + extras = source.get("extras", []) + for extra in sorted( + extras, key=lambda x: x[0] if isinstance(x, (list, tuple)) else str(x) + ): + if isinstance(extra, (list, tuple)): + extra_src = extra[0] + else: + extra_src = str(extra) + extra_path = pkg_dir / extra_src + if extra_path.exists(): + hasher.update(extra_path.read_bytes()) + else: + hasher.update(f"missing-extra:{extra_src}".encode()) + + # 4. Source URL + url = source.get("url") + if url: + hasher.update(f"url:{url}".encode()) + + # 5. Source sha256 + sha256 = source.get("sha256") + if sha256: + hasher.update(f"sha256:{sha256}".encode()) + + # 6. Source path (for in-tree sources) + path = source.get("path") + if path: + src_path = (pkg_dir / path).resolve() + if src_path.exists() and src_path.is_dir(): + # Hash all files in the source tree + for f in sorted(src_path.rglob("*")): + if f.is_file(): + hasher.update(f.read_bytes()) + elif src_path.exists(): + hasher.update(src_path.read_bytes()) + + return hasher.hexdigest() + + +def _load_single_package( + meta_path: Path, +) -> tuple[str, dict | None, str | None]: + """Load meta.yaml and compute recipe hash for a single package. + + Returns (pkg_name, meta_dict, recipe_hash) or (pkg_name, None, None) on failure. + """ + pkg_name = meta_path.parent.name + pkg_dir = meta_path.parent + try: + meta = load_yaml(meta_path) + if meta is None: + print(f"Warning: empty meta.yaml for {pkg_name}", file=sys.stderr) + return pkg_name, None, None + recipe_hash = compute_recipe_hash(pkg_dir, meta) + return pkg_name, meta, recipe_hash + except Exception as e: + print(f"Warning: failed to parse {meta_path}: {e}", file=sys.stderr) + return pkg_name, None, None + + +def load_all_package_metadata( + packages_dir: Path, +) -> tuple[dict[str, dict], dict[str, str]]: + """Load meta.yaml and compute recipe hashes for all packages concurrently. + + Returns (all_meta, recipe_hashes) dicts. + """ + meta_paths = sorted(packages_dir.glob("*/meta.yaml")) + packages: dict[str, dict] = {} + recipe_hashes: dict[str, str] = {} + + max_workers = min(32, (os.cpu_count() or 4) + 4) + with ThreadPoolExecutor(max_workers=max_workers) as pool: + results = pool.map(_load_single_package, meta_paths) + for pkg_name, meta, recipe_hash in results: + if meta is not None and recipe_hash is not None: + packages[pkg_name] = meta + recipe_hashes[pkg_name] = recipe_hash + + return packages, recipe_hashes + + +def get_host_dependencies(meta: dict) -> list[str]: + """Extract host dependencies from a package's metadata.""" + requirements = meta.get("requirements", {}) + return requirements.get("host", []) + + +def is_cross_build_env(meta: dict) -> bool: + """Check if a package has cross-build-env: true.""" + build = meta.get("build", {}) + return build.get("cross-build-env", False) + + +def get_package_type(meta: dict) -> str: + """Get the package type (package, static_library, shared_library, cpython_module).""" + build = meta.get("build", {}) + return build.get("type", "package") + + +def compute_all_fingerprints( + all_meta: dict[str, dict], + recipe_hashes: dict[str, str], + toolchain_hash: str, +) -> dict[str, str]: + """Compute fingerprints for all packages in topological order. + + Fingerprints are computed leaves-first so that each package's fingerprint + includes the fingerprints of its host dependencies (recursive). + """ + graph: dict[str, set[str]] = {} + for name, meta in all_meta.items(): + host_deps = get_host_dependencies(meta) + graph[name] = {dep for dep in host_deps if dep in all_meta} + + fingerprints: dict[str, str] = {} + + try: + topo_order = list(TopologicalSorter(graph).static_order()) + except Exception as e: + print(f"Error: dependency cycle detected: {e}", file=sys.stderr) + sys.exit(1) + + for name in topo_order: + if name not in all_meta: + continue + + hasher = hashlib.sha256() + + # 1. Toolchain hash (global) + hasher.update(f"toolchain:{toolchain_hash}".encode()) + + # 2. Recipe hash (per-package) + hasher.update(f"recipe:{recipe_hashes[name]}".encode()) + + # 3. Host dependency fingerprints (recursive, sorted for determinism) + host_deps = sorted(graph.get(name, set())) + for dep in host_deps: + if dep in fingerprints: + hasher.update(f"dep:{dep}:{fingerprints[dep]}".encode()) + + fingerprints[name] = hasher.hexdigest() + + return fingerprints + + +def main() -> None: + args = parse_args() + packages_dir = Path(args.packages_dir) + + print(f"Loading package metadata from {packages_dir}...") + all_meta, recipe_hashes = load_all_package_metadata(packages_dir) + print(f"Found {len(all_meta)} packages") + + print("Computing toolchain hash...") + toolchain_hash = compute_toolchain_hash() + print(f"Toolchain hash: {toolchain_hash[:16]}...") + + print("Computing per-package fingerprints...") + fingerprints = compute_all_fingerprints(all_meta, recipe_hashes, toolchain_hash) + + # Identify cross-build-env packages + cross_build_packages = sorted( + name for name, meta in all_meta.items() if is_cross_build_env(meta) + ) + + # Identify library packages (static_library, shared_library) + # These don't produce wheels — needs_rebuild() checks build/.packaged token instead + library_packages = sorted( + name + for name, meta in all_meta.items() + if get_package_type(meta) in ("static_library", "shared_library") + ) + + build_plan = { + "toolchain_hash": toolchain_hash, + "fingerprints": {k: v for k, v in sorted(fingerprints.items())}, + "cross_build_packages": cross_build_packages, + "library_packages": library_packages, + } + + output_path = Path(args.output) + output_path.write_text(json.dumps(build_plan, indent=2) + "\n") + print(f"Build plan written to {output_path}") + print(f" Total packages: {len(fingerprints)}") + print(f" Cross-build-env (always rebuild): {cross_build_packages}") + print(f" Library packages: {len(library_packages)}") + + num_unique_fps = len(set(fingerprints.values())) + print(f" Unique fingerprints: {num_unique_fps}") + + +if __name__ == "__main__": + main() diff --git a/tools/restore_build_cache.py b/tools/restore_build_cache.py new file mode 100644 index 00000000..f5fafa3a --- /dev/null +++ b/tools/restore_build_cache.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +"""Restore cached wheels from the .wheel-cache directory and apply the mtime trick. + +This script is run after `actions/cache/restore` has restored .wheel-cache/. +For each package whose cached fingerprint matches the current build-plan.json, +it copies the wheel into packages/{name}/dist/ and sets the mtime to the far +future (year 2099) so that needs_rebuild() skips it. + +Cross-build-env and library packages are always skipped. +""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import sys +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent.parent +PACKAGES_DIR = BASE_DIR / "packages" + +# 2099-12-31T00:00:00Z — guaranteed newer than any git checkout mtime +FUTURE_MTIME = 4_102_444_800 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Restore cached wheels and apply mtime trick" + ) + parser.add_argument( + "--cache-dir", + type=str, + required=True, + help="Path to the .wheel-cache directory", + ) + parser.add_argument( + "--build-plan", + type=str, + required=True, + help="Path to build-plan.json", + ) + parser.add_argument( + "--packages-dir", + type=str, + default=str(PACKAGES_DIR), + help="Path to the packages directory", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + cache_dir = Path(args.cache_dir) + build_plan_path = Path(args.build_plan) + packages_dir = Path(args.packages_dir) + + if not build_plan_path.exists(): + print(f"Error: build plan not found at {build_plan_path}", file=sys.stderr) + sys.exit(1) + + build_plan = json.loads(build_plan_path.read_text()) + current_fingerprints: dict[str, str] = build_plan["fingerprints"] + cross_build_packages: list[str] = build_plan.get("cross_build_packages", []) + library_packages: set[str] = set(build_plan.get("library_packages", [])) + skip_packages = set(cross_build_packages) | library_packages + + manifest_path = cache_dir / "manifest.json" + if not manifest_path.exists(): + print("No cache manifest found — cold start, nothing to restore.") + print(f" Cache dir: {cache_dir}") + print(f" Expected manifest at: {manifest_path}") + return + + cached_manifest = json.loads(manifest_path.read_text()) + cached_fingerprints: dict[str, str] = cached_manifest.get("fingerprints", {}) + + restored = 0 + skipped_always_rebuild = 0 + skipped_stale = 0 + skipped_missing = 0 + skipped_no_fingerprint = 0 + + for pkg_name, current_fp in sorted(current_fingerprints.items()): + if pkg_name in skip_packages: + skipped_always_rebuild += 1 + continue + + cached_fp = cached_fingerprints.get(pkg_name) + if cached_fp is None: + skipped_no_fingerprint += 1 + continue + + if cached_fp != current_fp: + skipped_stale += 1 + continue + + cached_pkg_dir = cache_dir / pkg_name + if not cached_pkg_dir.exists(): + skipped_missing += 1 + continue + + dist_dir = packages_dir / pkg_name / "dist" + dist_dir.mkdir(parents=True, exist_ok=True) + + files_restored = 0 + for cached_file in cached_pkg_dir.iterdir(): + if cached_file.is_file(): + dest = dist_dir / cached_file.name + shutil.copy2(cached_file, dest) + os.utime(dest, (FUTURE_MTIME, FUTURE_MTIME)) + files_restored += 1 + + if files_restored > 0: + restored += 1 + + total = len(current_fingerprints) + will_build = total - restored - skipped_always_rebuild + print(f"Cache restore summary:") + print(f" Total packages in build plan: {total}") + print(f" Restored from cache: {restored}") + print(f" Skipped (always rebuild): {skipped_always_rebuild}") + print(f" Skipped (fingerprint changed): {skipped_stale}") + print(f" Skipped (not in cache): {skipped_no_fingerprint}") + print(f" Skipped (cached files missing): {skipped_missing}") + print(f" Will need to build: {will_build}") + + +if __name__ == "__main__": + main() diff --git a/tools/save_build_cache.py b/tools/save_build_cache.py new file mode 100644 index 00000000..78546e58 --- /dev/null +++ b/tools/save_build_cache.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +"""Bundle all built wheels into .wheel-cache/ for GitHub Actions cache. + +After the build completes, this script collects all wheels from +packages/*/dist/ and copies them into .wheel-cache/{name}/ along +with a manifest.json that maps package names to their fingerprints. + +The .wheel-cache/ directory is then saved by `actions/cache/save`. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent.parent +PACKAGES_DIR = BASE_DIR / "packages" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Bundle built wheels into cache directory" + ) + parser.add_argument( + "--build-plan", + type=str, + required=True, + help="Path to build-plan.json", + ) + parser.add_argument( + "--cache-dir", + type=str, + required=True, + help="Path to the .wheel-cache directory to create/update", + ) + parser.add_argument( + "--packages-dir", + type=str, + default=str(PACKAGES_DIR), + help="Path to the packages directory", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + build_plan_path = Path(args.build_plan) + cache_dir = Path(args.cache_dir) + packages_dir = Path(args.packages_dir) + + if not build_plan_path.exists(): + print(f"Error: build plan not found at {build_plan_path}", file=sys.stderr) + sys.exit(1) + + build_plan = json.loads(build_plan_path.read_text()) + fingerprints: dict[str, str] = build_plan["fingerprints"] + library_packages: set[str] = set(build_plan.get("library_packages", [])) + + if cache_dir.exists(): + shutil.rmtree(cache_dir) + cache_dir.mkdir(parents=True) + + cached_count = 0 + skipped_no_dist = 0 + skipped_library = 0 + total_size = 0 + + for pkg_name in sorted(fingerprints): + if pkg_name in library_packages: + skipped_library += 1 + continue + + dist_dir = packages_dir / pkg_name / "dist" + if not dist_dir.exists(): + skipped_no_dist += 1 + continue + + artifacts = [f for f in dist_dir.iterdir() if f.is_file()] + if not artifacts: + skipped_no_dist += 1 + continue + + pkg_cache_dir = cache_dir / pkg_name + pkg_cache_dir.mkdir(parents=True, exist_ok=True) + + for artifact in artifacts: + dest = pkg_cache_dir / artifact.name + shutil.copy2(artifact, dest) + total_size += artifact.stat().st_size + + cached_count += 1 + + manifest = { + "fingerprints": fingerprints, + "toolchain_hash": build_plan.get("toolchain_hash", ""), + "cross_build_packages": build_plan.get("cross_build_packages", []), + "library_packages": sorted(library_packages), + } + manifest_path = cache_dir / "manifest.json" + manifest_path.write_text(json.dumps(manifest, indent=2) + "\n") + + total_size_mb = total_size / (1024 * 1024) + print(f"Cache save summary:") + print(f" Wheel packages cached: {cached_count}") + print(f" Skipped (library packages): {skipped_library}") + print(f" Skipped (no dist/): {skipped_no_dist}") + print(f" Total cache size: {total_size_mb:.1f} MB") + print(f" Cache directory: {cache_dir}") + + +if __name__ == "__main__": + main()