From 1e8b17e7fc47283b8e34429b40a63069d9d5e1c0 Mon Sep 17 00:00:00 2001 From: berkkirik Date: Fri, 24 Apr 2026 11:06:54 +0300 Subject: [PATCH 1/2] fix: default to auto-detected device so CPU-only machines don't crash The CLI --device flag defaulted to "cuda", which crashed with a raw PyTorch traceback ("Found no NVIDIA driver on your system ...") on machines without a GPU. Users had to discover --device cpu themselves. Add an "auto" mode that picks the best available backend (cuda if detected, otherwise cpu) and make it the default. Users who explicitly pass --device cuda still get the original loud failure on non-CUDA machines, which is the correct behavior when they ask for cuda by name. - opf/_common/device.py (new): resolve_device("auto"|...) helper. - opf/_cli/common.py: flip --device default to "auto", expand help text. - opf/_core/runtime.py, opf/_train/runner.py: call resolve_device() where device names turn into torch.device objects. Stderr on auto-fallback: info: no CUDA device detected; falling back to CPU (pass --device cuda to override). Fixes #12 --- opf/_cli/common.py | 9 +++++++-- opf/_common/device.py | 30 ++++++++++++++++++++++++++++++ opf/_core/runtime.py | 3 ++- opf/_train/runner.py | 3 ++- 4 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 opf/_common/device.py diff --git a/opf/_cli/common.py b/opf/_cli/common.py index 31224e1..b75f043 100644 --- a/opf/_cli/common.py +++ b/opf/_cli/common.py @@ -55,8 +55,13 @@ def add_device_arg(parser: object) -> None: parser.add_argument( "--device", type=str, - default="cuda", - help="Device to run on", + default="auto", + help=( + "Device to run on. 'auto' (default) picks cuda if a GPU is " + "available, otherwise cpu. Pass 'cuda' or 'cpu' explicitly to " + "override or to get a loud error when the requested backend is " + "unavailable." 
+ ), ) diff --git a/opf/_common/device.py b/opf/_common/device.py new file mode 100644 index 0000000..f2b162c --- /dev/null +++ b/opf/_common/device.py @@ -0,0 +1,30 @@ +"""Device-name resolution helpers shared by CLI entrypoints.""" + +from __future__ import annotations + +import sys + +import torch + +AUTO_DEVICE: str = "auto" + + +def resolve_device(device_name: str) -> torch.device: + """Resolve a user-supplied device name into a concrete ``torch.device``. + + ``"auto"`` selects the best available device: CUDA if a GPU is detected, + otherwise CPU. Any other value is passed through to ``torch.device`` as-is + so that explicit requests like ``"cuda"`` or ``"cpu"`` still fail loudly + when the underlying backend is unavailable. + """ + if device_name == AUTO_DEVICE: + if torch.cuda.is_available(): + return torch.device("cuda") + print( + "info: no CUDA device detected; falling back to CPU " + "(pass --device cuda to override).", + file=sys.stderr, + flush=True, + ) + return torch.device("cpu") + return torch.device(device_name) diff --git a/opf/_core/runtime.py b/opf/_core/runtime.py index 2c3034e..5ed4f1a 100644 --- a/opf/_core/runtime.py +++ b/opf/_core/runtime.py @@ -19,6 +19,7 @@ REDACTED_OUTPUT_LABEL, REDACTED_OUTPUT_PLACEHOLDER, ) +from .._common.device import resolve_device from .._common.env import get_env_bool from .decoding import ViterbiCRFDecoder from .._common.label_space import resolve_label_space_from_config @@ -215,7 +216,7 @@ def load_inference_runtime( if output_mode not in OUTPUT_MODES: raise ValueError(f"Unsupported output_mode: {output_mode!r}") _validate_checkpoint_dir(checkpoint) - device = torch.device(device_name) + device = resolve_device(device_name) checkpoint_config = _load_checkpoint_config(checkpoint) n_ctx = _resolve_n_ctx(checkpoint_config, n_ctx_override, device) encoding_name = checkpoint_config.get("encoding") diff --git a/opf/_train/runner.py b/opf/_train/runner.py index b00e23f..274fa20 100644 --- a/opf/_train/runner.py +++ 
b/opf/_train/runner.py @@ -16,6 +16,7 @@ from .args import parse_args from .._api import resolve_checkpoint_path from .._common.constants import SCHEMA_VERSION +from .._common.device import resolve_device from .._common.label_space import ( resolve_checkpoint_label_space, resolve_label_space_from_config, @@ -588,7 +589,7 @@ def main(argv: Sequence[str] | None = None, *, prog: str | None = None) -> int: progress_interval_s = parsed_interval checkpoint = resolve_checkpoint_path(args.checkpoint) - device = torch.device(args.device) + device = resolve_device(args.device) # Default to Triton-backed MoE kernels on non-CPU devices unless callers # explicitly opt out. CPU uses torch ops by default so Triton stays optional. From 3b14893d0aabbc94456254dc2cfa5a4f51e26e56 Mon Sep 17 00:00:00 2001 From: berkkirik Date: Fri, 24 Apr 2026 19:51:33 +0300 Subject: [PATCH 2/2] feat: auto-detect Apple Silicon (MPS) and keep Triton CUDA-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the --device auto resolution from #17 to include Apple Silicon (MPS) so Mac users get GPU acceleration by default instead of falling back to CPU. The following coordinated changes make this safe: 1. opf/_common/device.py — "auto" now picks cuda > mps > cpu. Each fallback emits an info line on stderr so the user always knows which backend was selected. 2. opf/_model/model.py — the Triton-backed MoE kernels are CUDA-only (Triton does not target Metal). Previously the default enabled Triton on any non-CPU device, so trying mps crashed once the MoE layer was hit. Narrow the auto-enable to device.type == "cuda"; mps and cpu both fall back to the torch-ops path unless the user explicitly sets OPF_MOE_TRITON=1. 3. opf/_train/runner.py — mirror the same CUDA-only gate when setting OPF_MOE_TRITON=1 on behalf of the user (previously set it for any non-CPU device, which would silently enable Triton on mps). 4. 
opf/_cli/common.py — expand --device help text to list the full backend order (cuda > mps > cpu). Verified on macOS (Apple Silicon, Python 3.14, torch 2.11): - resolve_device("auto") → mps (with stderr info line) - resolve_device("mps") → mps - resolve_device("cpu") → cpu - resolve_device("cuda") → returns cuda device (still fails loudly at tensor alloc when the user explicitly asks for it — unchanged) Low-level MPS op sanity check passed for embedding, attention-like matmul/softmax, log_softmax, topk, argsort, bincount — all the ops the inference path relies on. Fixes #21 --- opf/_cli/common.py | 8 ++++---- opf/_common/device.py | 33 +++++++++++++++++++++++++++------ opf/_model/model.py | 7 +++++-- opf/_train/runner.py | 7 ++++--- 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/opf/_cli/common.py b/opf/_cli/common.py index b75f043..c1505e2 100644 --- a/opf/_cli/common.py +++ b/opf/_cli/common.py @@ -57,10 +57,10 @@ def add_device_arg(parser: object) -> None: type=str, default="auto", help=( - "Device to run on. 'auto' (default) picks cuda if a GPU is " - "available, otherwise cpu. Pass 'cuda' or 'cpu' explicitly to " - "override or to get a loud error when the requested backend is " - "unavailable." + "Device to run on. 'auto' (default) picks the best available " + "backend: cuda > mps (Apple Silicon) > cpu. Pass an explicit " + "value like 'cuda', 'mps', or 'cpu' to override or to get a " + "loud error when the requested backend is unavailable." 
), ) diff --git a/opf/_common/device.py b/opf/_common/device.py index f2b162c..dc62a08 100644 --- a/opf/_common/device.py +++ b/opf/_common/device.py @@ -9,20 +9,41 @@ AUTO_DEVICE: str = "auto" +def _mps_is_available() -> bool: + """Return True when the current PyTorch build supports Apple Metal (MPS).""" + backend = getattr(torch.backends, "mps", None) + if backend is None: + return False + is_available = getattr(backend, "is_available", None) + if is_available is None: + return False + try: + return bool(is_available()) + except Exception: + return False + + def resolve_device(device_name: str) -> torch.device: """Resolve a user-supplied device name into a concrete ``torch.device``. - ``"auto"`` selects the best available device: CUDA if a GPU is detected, - otherwise CPU. Any other value is passed through to ``torch.device`` as-is - so that explicit requests like ``"cuda"`` or ``"cpu"`` still fail loudly - when the underlying backend is unavailable. + ``"auto"`` selects the best available device in this order: CUDA (NVIDIA + GPU) > MPS (Apple Silicon GPU) > CPU. Any other value is passed through + to ``torch.device`` as-is so that explicit requests like ``"cuda"`` or + ``"mps"`` still fail loudly when the underlying backend is unavailable. 
""" if device_name == AUTO_DEVICE: if torch.cuda.is_available(): return torch.device("cuda") + if _mps_is_available(): + print( + "info: no CUDA device detected; using Apple Metal (MPS).", + file=sys.stderr, + flush=True, + ) + return torch.device("mps") print( - "info: no CUDA device detected; falling back to CPU " - "(pass --device cuda to override).", + "info: no CUDA or MPS device detected; falling back to CPU " + "(pass --device cuda or --device mps to override).", file=sys.stderr, flush=True, ) diff --git a/opf/_model/model.py b/opf/_model/model.py index 22b3d4c..e363307 100644 --- a/opf/_model/model.py +++ b/opf/_model/model.py @@ -750,8 +750,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: expert_indices = experts.indices expert_weights = expert_weights / self.experts_per_token experts_per_token_eff = self.experts_per_token - not_running_on_cpu = t.device.type != "cpu" - use_triton = get_env_bool("OPF_MOE_TRITON", default=not_running_on_cpu) + # Triton kernels are CUDA-only; auto-enable only on CUDA devices. MPS + # and CPU fall back to the torch-ops path unless the user explicitly + # opts in via OPF_MOE_TRITON=1. + is_cuda_device = t.device.type == "cuda" + use_triton = get_env_bool("OPF_MOE_TRITON", default=is_cuda_device) if use_triton: _require_triton() diff --git a/opf/_train/runner.py b/opf/_train/runner.py index 274fa20..f0904f1 100644 --- a/opf/_train/runner.py +++ b/opf/_train/runner.py @@ -591,9 +591,10 @@ def main(argv: Sequence[str] | None = None, *, prog: str | None = None) -> int: checkpoint = resolve_checkpoint_path(args.checkpoint) device = resolve_device(args.device) - # Default to Triton-backed MoE kernels on non-CPU devices unless callers - # explicitly opt out. CPU uses torch ops by default so Triton stays optional. - if device.type != "cpu": + # Default to Triton-backed MoE kernels on CUDA devices unless callers + # explicitly opt out. 
CPU and MPS use torch ops by default so Triton + # stays CUDA-only (the kernels don't run on Metal). + if device.type == "cuda": os.environ.setdefault("OPF_MOE_TRITON", "1") base_config = _load_checkpoint_config(checkpoint)