From 29ef90febcbc9153477732b13e6a7cc3c290ef56 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 27 Feb 2026 15:53:04 +0100 Subject: [PATCH 01/12] adding easyconfigs: DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb and patches: DeepSpeed-0.14.2_no-ninja-dep.patch, DeepSpeed-0.14.5_avoid-access-to-home.patch, DeepSpeed-0.14.5_pdsh-env-vars.patch, DeepSpeed-0.14.5_pic-compile.patch, DeepSpeed-0.14.5_test-nvme-offload.patch, DeepSpeed-0.14.5_use-eb-cutlass.patch --- .../DeepSpeed-0.14.2_no-ninja-dep.patch | 57 +++++++ ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 102 +++++++++++++ ...eepSpeed-0.14.5_avoid-access-to-home.patch | 60 ++++++++ .../DeepSpeed-0.14.5_pdsh-env-vars.patch | 36 +++++ .../DeepSpeed-0.14.5_pic-compile.patch | 141 ++++++++++++++++++ .../DeepSpeed-0.14.5_test-nvme-offload.patch | 135 +++++++++++++++++ .../DeepSpeed-0.14.5_use-eb-cutlass.patch | 55 +++++++ 7 files changed, 586 insertions(+) create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch new file mode 100644 index 000000000000..8a51596fb3b6 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch @@ -0,0 +1,57 @@ +Patch away dependency on ninja python package by falling back to checking +returncode of `ninja --version`. 
+ +Author: Viktor Rehnberg (Chalmers University of Technology) + + +diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py +index 85a2f9b2..8bb64626 100644 +--- a/deepspeed/env_report.py ++++ b/deepspeed/env_report.py +@@ -62,7 +62,7 @@ def ninja_installed(): + try: + import ninja # noqa: F401 # type: ignore + except ImportError: +- return False ++ return (subprocess.run(["ninja", "--version"]).returncode == 0) + return True + + +diff --git a/op_builder/builder.py b/op_builder/builder.py +index 8dc825c7..970d18b2 100644 +--- a/op_builder/builder.py ++++ b/op_builder/builder.py +@@ -487,7 +487,8 @@ class OpBuilder(ABC): + try: + import ninja # noqa: F401 # type: ignore + except ImportError: +- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") ++ if subprocess.run(["ninja", "--version"]).returncode != 0: ++ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): + self.build_for_cpu = not torch.cuda.is_available() +diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py +index 81b15f19..cf0a1cc0 100644 +--- a/op_builder/xpu/builder.py ++++ b/op_builder/xpu/builder.py +@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder): + try: + import ninja # noqa: F401 + except ImportError: +- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") ++ if subprocess.run(["ninja", "--version"]).returncode != 0: ++ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + self.jit_mode = True + from intel_extension_for_pytorch.xpu.cpp_extension import load +diff --git a/requirements/requirements.txt b/requirements/requirements.txt +index 80c9f9b3..eed77fa3 100755 +--- a/requirements/requirements.txt ++++ b/requirements/requirements.txt +@@ -1,5 +1,4 @@ + hjson +-ninja + numpy + packaging>=20.0 + psutil diff --git 
a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb new file mode 100644 index 000000000000..eac19e66f317 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -0,0 +1,102 @@ +easyblock = 'PythonBundle' + +name = 'DeepSpeed' +version = '0.14.5' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'http://www.deepspeed.ai/' +description = """ +DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective. +""" + +toolchain = {'name': 'foss', 'version': '2023b'} + +builddependencies = [ + ('Ninja', '1.11.1'), + ('Transformers', '4.44.0'), +] +local_pytorch_version = '2.3.0' +dependencies = [ + ('Python', '3.11.5'), + ('CUDA', '12.4.0', '', SYSTEM), + ('NCCL', '2.20.5', '-CUDA-%(cudaver)s'), + ('CUTLASS', '4.1.0', '-CUDA-%(cudaver)s'), + ('CuPy', '13.6.0', '-CUDA-%(cudaver)s'), + ('Triton', '2.3.1', '-CUDA-%(cudaver)s'), + ('accelerate', '1.10.0', '-CUDA-%(cudaver)s'), + ('PyTorch', local_pytorch_version, '-CUDA-%(cudaver)s'), + ('PyTorch-bundle', local_pytorch_version, '-CUDA-%(cudaver)s'), + ('mpi4py', '3.1.5'), + ('DLPack', '1.2'), + ('py-cpuinfo', '9.0.0'), + ('pydantic', '2.7.4'), + ('tqdm', '4.66.2'), + ('pdsh', '2.36'), + ('Seaborn', '0.13.2'), # dependency for mup + ('libaio', '0.3.113'), # for async_io (builddep only?) 
+] + +local_excluded_ds_tests = ( + 'TestTensorBoard', + 'TestWandb', + 'TestCometMonitor', + 'TestQuantizedInt', # Downloads model from internet + 'test_fp_quant[256-qbits8-bf16]', # Error of 0.00909423828125 > 0.004 +) + +github_account = 'microsoft' +exts_list = [ + ('hjson', '3.1.0', { + 'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'], + }), + ('nvidia-ml-py', '12.535.161', { + 'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'], + 'modulename': 'pynvml', + }), + ('mup', '1.0.0', { + 'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'], + }), + ('qtorch', '0.3.0', { + 'checksums': ['3fc2e9b27d58d18304ac46511ea03a3eb20f852944f6a5b6ef71b974c2da20bf'], + 'preinstallopts': "TORCH_CUDA_ARCH_LIST='%(cuda_cc_semicolon_sep)s' ", + }), + ('DeepSpeed', '0.14.5', { + 'source_urls': [GITHUB_SOURCE], + # Test suite not available on pypi + 'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}], + 'patches': [ + 'DeepSpeed-0.14.2_no-ninja-dep.patch', + 'DeepSpeed-0.14.5_avoid-access-to-home.patch', + 'DeepSpeed-0.14.5_pdsh-env-vars.patch', + 'DeepSpeed-0.14.5_pic-compile.patch', + 'DeepSpeed-0.14.5_test-nvme-offload.patch', + 'DeepSpeed-0.14.5_use-eb-cutlass.patch', + ], + 'checksums': [ + {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, + {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, + {'DeepSpeed-0.14.5_avoid-access-to-home.patch': + '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'}, + {'DeepSpeed-0.14.5_pdsh-env-vars.patch': + '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'}, + {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'}, + {'DeepSpeed-0.14.5_test-nvme-offload.patch': + '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'}, + 
{'DeepSpeed-0.14.5_use-eb-cutlass.patch': + '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'}, + ], + 'jit_only_ops': [ + 'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS', + # Cannot prebuild with transformer OP: https://github.com/deepspeedai/DeepSpeed/issues/949 + # 'STOCHASTIC_TRANSFORMER', + ], + 'testinstall': True, + 'runtest': ' && '.join(( + 'ln -s $PWD/tests/ ../tests', + 'cd ..', + f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}'", + )), + }), +] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch new file mode 100644 index 000000000000..85f967a3388b --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch @@ -0,0 +1,60 @@ +From 9d17116fcdb44b81eb00d3bce91431dc35cd69b1 Mon Sep 17 00:00:00 2001 +From: "Joshua C. Randall" +Date: Wed, 4 Sep 2024 19:22:07 +0100 +Subject: [PATCH] print warning if actual triton cache dir is on NFS, not just + for default (#6487) + +move the logic that prints a warning when triton cache dir is on NFS to +act on the actual calculated cache_dir rather than on the default. 
+ +this means that: +- when the default directory (in the user's home directory) is on NFS +but `TRITON_CACHE_DIR` is set to a non-NFS directory, no warning will be +printed whereas prior to this change a spurious and confusing warning +was printed +- when the user's home directory is not on NFS but `TRITON_CACHE_DIR` is +set to an NFS directory, a warning will be printed whereas prior to this +change no warning would be printed + +fixes #6486 +--- + .../ops/transformer/inference/triton/matmul_ext.py | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py +index c77d8a8e11c0..412c8740a216 100644 +--- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py ++++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py +@@ -40,13 +40,17 @@ class TritonCacheDir: + _warning_printed = False + + @staticmethod +- def default_cache_dir(): +- tmp_path = os.path.join(Path.home(), ".triton", "autotune") +- if is_nfs_path(tmp_path) and not TritonCacheDir._warning_printed: ++ def warn_if_nfs(cache_dir): ++ if is_nfs_path(cache_dir) and not TritonCacheDir._warning_printed: + print( +- f"Warning: The default cache directory for DeepSpeed Triton autotune, {tmp_path}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path." ++ f"Warning: The cache directory for DeepSpeed Triton autotune, {cache_dir}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path." 
+ ) + TritonCacheDir._warning_printed = True ++ return ++ ++ @staticmethod ++ def default_cache_dir(): ++ tmp_path = os.path.join(Path.home(), ".triton", "autotune") + return tmp_path + + +@@ -80,9 +84,9 @@ def __init__(self, key): + self.lock_path = None + # if caching is enabled, get the lock and bin path + self.cache_dir = os.environ.get('TRITON_CACHE_DIR', TritonCacheDir.default_cache_dir()) ++ TritonCacheDir.warn_if_nfs(self.cache_dir) + if self.cache_dir: + os.makedirs(self.cache_dir, exist_ok=True) +- if self.cache_dir: + self.file_path = os.path.join(self.cache_dir, self.key + ".pickle") + self.lock_path = self.file_path + ".lock" + diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch new file mode 100644 index 000000000000..9d4342f66bcb --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch @@ -0,0 +1,36 @@ +From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Wed, 25 Sep 2024 09:29:23 +0000 +Subject: [PATCH] Add software relevant environment variables + +The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is +not included in these exports then the python .so file may not be found. +Also including what seemed important and was added from loading DeepSpeed. +(Couldn't add everything, then argumet list becomes too long). + +See + - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 +for more details. 
+--- + deepspeed/launcher/runner.py | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py +index 07d1713e..e9cd61b8 100755 +--- a/deepspeed/launcher/runner.py ++++ b/deepspeed/launcher/runner.py +@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator + + DLTS_HOSTFILE = "/job/hostfile" + EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] ++EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed ++ 'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA', # important ++ 'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL', ++ 'PKG_CONFIG_PATH', 'XDG_DATA_DIRS', ++] + EXPORT_ENVS += NEBULA_EXPORT_ENVS + DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") + DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] +-- +2.39.3 + diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch new file mode 100644 index 000000000000..707bc826e889 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch @@ -0,0 +1,141 @@ +From 90afd671dadf9fd6a7a221428f2c04c16d637494 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Thu, 23 May 2024 07:09:53 +0000 +Subject: [PATCH] Compile with PIC + +--- + op_builder/builder.py | 15 ++++++++++----- + op_builder/cpu/builder.py | 3 ++- + op_builder/fused_adam.py | 4 +++- + op_builder/fused_lamb.py | 4 +++- + op_builder/fused_lion.py | 4 +++- + op_builder/xpu/builder.py | 3 ++- + 6 files changed, 23 insertions(+), 10 deletions(-) + +diff --git a/op_builder/builder.py b/op_builder/builder.py +index ec7566aa..f08e1799 100644 +--- a/op_builder/builder.py ++++ b/op_builder/builder.py +@@ -288,13 +288,13 @@ class OpBuilder(ABC): + ''' + Returns optional list of compiler flags to forward to nvcc when building CUDA sources + ''' +- return [] ++ return ['-Xcompiler', '-fPIC'] + + def cxx_args(self): + ''' + 
Returns optional list of compiler flags to forward to the build + ''' +- return [] ++ return ['-fPIC'] + + def is_compatible(self, verbose=True): + ''' +@@ -746,15 +746,18 @@ class CUDAOpBuilder(OpBuilder): + ) + + def cxx_args(self): ++ args = super().cxx_args() + if sys.platform == "win32": +- return ['-O2'] ++ args += ['-O2'] + else: +- return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] ++ args += ['-O3', '-std=c++17', '-g', '-Wno-reorder'] ++ return args + + def nvcc_args(self): + if self.build_for_cpu: + return [] +- args = ['-O3'] ++ args = super().nvcc_args() ++ args += ['-O3'] + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + args += [ +@@ -835,6 +838,8 @@ class TorchCPUOpBuilder(CUDAOpBuilder): + '-lcublas', + '-g', + ] ++ else: ++ args += super(CUDAOpBuilder, self).cxx_args() + + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() +diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py +index d881842a..dfc5a31d 100644 +--- a/op_builder/cpu/builder.py ++++ b/op_builder/cpu/builder.py +@@ -30,7 +30,8 @@ class CPUOpBuilder(OpBuilder): + return cpp_ext + + def cxx_args(self): +- args = ['-O3', '-g', '-Wno-reorder'] ++ args = super().cxx_args() ++ args += ['-O3', '-g', '-Wno-reorder'] + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] +diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py +index ac6e4eea..0c723572 100644 +--- a/op_builder/fused_adam.py ++++ b/op_builder/fused_adam.py +@@ -29,7 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder): + return args + self.version_dependent_macros() + + def nvcc_args(self): +- nvcc_flags = ['-O3'] + self.version_dependent_macros() ++ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args() ++ nvcc_flags += ['-O3'] ++ nvcc_flags += self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', 
'--use_fast_math'] + +diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py +index f0cb5577..a59b97d4 100644 +--- a/op_builder/fused_lamb.py ++++ b/op_builder/fused_lamb.py +@@ -29,7 +29,9 @@ class FusedLambBuilder(CUDAOpBuilder): + return args + self.version_dependent_macros() + + def nvcc_args(self): +- nvcc_flags = ['-O3'] + self.version_dependent_macros() ++ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args() ++ nvcc_flags += ['-O3'] ++ nvcc_flags += self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] +diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py +index b900a8f2..119232b5 100644 +--- a/op_builder/fused_lion.py ++++ b/op_builder/fused_lion.py +@@ -29,7 +29,9 @@ class FusedLionBuilder(CUDAOpBuilder): + return args + self.version_dependent_macros() + + def nvcc_args(self): +- nvcc_flags = ['-O3'] + self.version_dependent_macros() ++ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args() ++ nvcc_flags += ['-O3'] ++ nvcc_flags += self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + +diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py +index f430b7b6..5a1a2219 100644 +--- a/op_builder/xpu/builder.py ++++ b/op_builder/xpu/builder.py +@@ -52,7 +52,8 @@ class SYCLOpBuilder(OpBuilder): + return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + + def cxx_args(self): +- cxx_flags = [ ++ cxx_flags = super().cxx_args() ++ cxx_flags += [ + '-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64', + '-fno-strict-aliasing' + ] +-- +2.39.3 + diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch 
b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch new file mode 100644 index 000000000000..dcff709f2ce1 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch @@ -0,0 +1,135 @@ +From ddbf7ab23ce2e83747ff6a1482ac512e06da82ca Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Mon, 4 Nov 2024 15:31:55 +0100 +Subject: [PATCH] Fix quantization tests + +NVME tests didn't always run because the hard-coded nvme_path wasn't +always writable. This commit changed to use tmp_path fixture instead and +disabled distributed test to avoid thread locks hanging. +--- + .../quantization/test_intX_quantization.py | 43 ++++++++++--------- + 1 file changed, 22 insertions(+), 21 deletions(-) + +diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py +index 77b51fcd..9e0d7ac0 100644 +--- a/tests/unit/inference/quantization/test_intX_quantization.py ++++ b/tests/unit/inference/quantization/test_intX_quantization.py +@@ -17,6 +17,7 @@ from transformers import AutoConfig, OPTConfig, AutoModel + import pytest + from collections import OrderedDict + from typing import Dict ++from pathlib import Path + + device = get_accelerator().device_name() if get_accelerator().is_available() else 'cpu' + +@@ -53,11 +54,11 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int): + assert mean_diff < 0.15 and max_diff < 0.5, f'Numeric error exceed threshold, mean diff {mean_diff} (threshold 0.15), max diff {max_diff} (threshold 0.5)' + + +-def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): ++def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path): + import deepspeed + from transformers.integrations.deepspeed import HfDeepSpeedConfig + +- def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: ++ 
def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict: + GB = 1 << 30 + + ds_config = { +@@ -127,7 +128,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo + ds_config["zero_optimization"]["offload_param"] = dict( + device="nvme", + pin_memory=True, +- nvme_path='~/tmp_offload_dir', ++ nvme_path=str(tmp_path / "tmp_offload_dir"), + buffer_count=5, + buffer_size=1 * GB, + ) +@@ -142,7 +143,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo + return ds_config + + hf_config = AutoConfig.from_pretrained('facebook/opt-125m') +- ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits) ++ ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path) + + input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) + attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) +@@ -170,11 +171,11 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo + assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)' + + +-def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): ++def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path): + import deepspeed + from transformers.integrations.deepspeed import HfDeepSpeedConfig + +- def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: ++ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict: + GB = 1 << 30 + + ds_config = { +@@ -206,7 +207,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: + ds_config["zero_optimization"]["offload_param"] = dict( 
+ device="nvme", + pin_memory=True, +- nvme_path='~/tmp_offload_dir', ++ nvme_path=str(tmp_path / "tmp_offload_dir"), + buffer_count=5, + buffer_size=1 * GB, + ) +@@ -221,7 +222,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: + return ds_config + + hf_config = AutoConfig.from_pretrained('facebook/opt-125m') +- ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits) ++ ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path) + + input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) + attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) +@@ -376,31 +377,31 @@ class TestQuantizedInt(DistributedTest): + quantization_test_helper(torch.float16, 8) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_post_init_quant(self, quantization_bits): ++ def test_zero3_int4_post_init_quant(self, quantization_bits, tmp_path): + reset_random() +- zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits) ++ zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits): ++ def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits, tmp_path): + reset_random() +- zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) ++ zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_post_init_quant_nvme_offload(self): ++ def 
test_zero3_int4_post_init_quant_nvme_offload(self, tmp_path): + reset_random() +- zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) ++ zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_quantized_initialization(self, quantization_bits): ++ def test_zero3_int4_quantized_initialization(self, quantization_bits, tmp_path): + reset_random() +- zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits) ++ zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits): ++ def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits, tmp_path): + reset_random() +- zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) ++ zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_quantized_initialization_nvme_offload(self): ++ def test_zero3_int4_quantized_initialization_nvme_offload(self, tmp_path): + reset_random() +- zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) ++ zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path) +-- +2.39.3 + diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch new file mode 100644 index 000000000000..35fe2cb8b66a --- /dev/null +++ 
b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch @@ -0,0 +1,55 @@ +From 27a64a22d6f84585ce9685e72ed9000f569ba941 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Thu, 10 Oct 2024 05:57:38 +0000 +Subject: [PATCH] Use EB env vars to search for CUTLASS + +Instead of needing to set CUTLASS_PATH to the cutlass source directory, +use EBROOTCUTLASS to find the installation directory. +--- + op_builder/evoformer_attn.py | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py +index af3aa742..5545b1aa 100644 +--- a/op_builder/evoformer_attn.py ++++ b/op_builder/evoformer_attn.py +@@ -5,6 +5,7 @@ + + from .builder import CUDAOpBuilder, installed_cuda_version + import os ++from packaging.version import Version + + + class EvoformerAttnBuilder(CUDAOpBuilder): +@@ -14,7 +15,8 @@ class EvoformerAttnBuilder(CUDAOpBuilder): + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) +- self.cutlass_path = os.environ.get('CUTLASS_PATH') ++ self.cutlass_path = os.environ.get('EBROOTCUTLASS') ++ self.cutlass_version = os.environ.get('EBVERSIONCUTLASS') + + def absolute_name(self): + return f'deepspeed.ops.{self.NAME}_op' +@@ -50,13 +52,12 @@ class EvoformerAttnBuilder(CUDAOpBuilder): + return False + if self.cutlass_path is None: + if verbose: +- self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH") ++ self.warning("EBROOTCUTLASS not set, please load CUTLASS module.") ++ return False ++ if Version(self.cutlass_version) < Version('3.1.0'): ++ if verbose: ++ self.warning(f"Please use CUTLASS version >= 3.1.0, detected {self.cutlass_version}") + return False +- with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f: +- if '3.1.0' not in f.read(): +- if verbose: +- self.warning("Please use CUTLASS version >= 3.1.0") +- return False + cuda_okay = True + if not 
self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda + sys_cuda_major, _ = installed_cuda_version() +-- +2.39.3 + From 16b61a63cce6a5df9c3b9f784fe06817b98f5d9a Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 27 Feb 2026 16:30:21 +0100 Subject: [PATCH 02/12] Update patches --- .../DeepSpeed-0.14.2_no-ninja-dep.patch | 66 +++++++++++-------- ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 7 +- .../DeepSpeed-0.14.5_pdsh-env-vars.patch | 22 ++----- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch index 8a51596fb3b6..3fdde8270575 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch @@ -1,50 +1,60 @@ -Patch away dependency on ninja python package by falling back to checking -returncode of `ninja --version`. +Patch away dependency on ninja python package by checking returncode of `ninja --version`. 
Author: Viktor Rehnberg (Chalmers University of Technology) - +Adapted by Alexander Grund (TU Dresden) diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py -index 85a2f9b2..8bb64626 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py -@@ -62,7 +62,7 @@ def ninja_installed(): - try: - import ninja # noqa: F401 # type: ignore - except ImportError: +@@ -59,11 +59,7 @@ def op_report(verbose=True): + + + def ninja_installed(): +- try: +- import ninja # noqa: F401 # type: ignore +- except ImportError: - return False -+ return (subprocess.run(["ninja", "--version"]).returncode == 0) - return True +- return True ++ return subprocess.run(["ninja", "--version"], check=False).returncode == 0 + def nvcc_version(): diff --git a/op_builder/builder.py b/op_builder/builder.py -index 8dc825c7..970d18b2 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py -@@ -487,7 +487,8 @@ class OpBuilder(ABC): - try: - import ninja # noqa: F401 # type: ignore - except ImportError: -- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") -+ if subprocess.run(["ninja", "--version"]).returncode != 0: -+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") +@@ -484,9 +484,7 @@ class OpBuilder(ABC): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. 
{self.error_log}" + ) +- try: +- import ninja # noqa: F401 # type: ignore +- except ImportError: ++ if subprocess.run(["ninja", "--version"], check=False).returncode != 0: + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): - self.build_for_cpu = not torch.cuda.is_available() diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py -index 81b15f19..cf0a1cc0 100644 --- a/op_builder/xpu/builder.py +++ b/op_builder/xpu/builder.py -@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder): - try: - import ninja # noqa: F401 - except ImportError: -- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") -+ if subprocess.run(["ninja", "--version"]).returncode != 0: -+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") +@@ -6,6 +6,7 @@ + import os + import time + import importlib ++import subprocess + + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed +@@ -86,9 +87,7 @@ class SYCLOpBuilder(OpBuilder): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. 
{self.error_log}" + ) +- try: +- import ninja # noqa: F401 +- except ImportError: ++ if subprocess.run(["ninja", "--version"], check=False).returncode != 0: + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") self.jit_mode = True - from intel_extension_for_pytorch.xpu.cpp_extension import load diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 80c9f9b3..eed77fa3 100755 --- a/requirements/requirements.txt diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index eac19e66f317..44fac2bab220 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -74,11 +74,12 @@ exts_list = [ ], 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, - {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, + {'DeepSpeed-0.14.2_no-ninja-dep.patch': + '681b4e7b9101a3decbd48f63fcb55b5262a10cf411a1c107a951ff58c1aa93cd'}, {'DeepSpeed-0.14.5_avoid-access-to-home.patch': '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': - '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'}, + '02556620ac643d273a2fa9c019d437cd874a6c19759fa59baaa0e9a41d0a5240'}, {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'}, {'DeepSpeed-0.14.5_test-nvme-offload.patch': '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'}, @@ -87,7 +88,7 @@ exts_list = [ ], 'jit_only_ops': [ 'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS', - # Cannot prebuild with transformer OP: https://github.com/deepspeedai/DeepSpeed/issues/949 + # Cannot be prebuild if 
"Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949 # 'STOCHASTIC_TRANSFORMER', ], 'testinstall': True, diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch index 9d4342f66bcb..be3e25a5a713 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch @@ -1,22 +1,17 @@ -From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001 -From: Viktor Rehnberg -Date: Wed, 25 Sep 2024 09:29:23 +0000 -Subject: [PATCH] Add software relevant environment variables +Add software relevant environment variables The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is not included in these exports then the python .so file may not be found. Also including what seemed important and was added from loading DeepSpeed. -(Couldn't add everything, then argumet list becomes too long). +(Couldn't add everything, then argument list becomes too long). -See - - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 -for more details. ---- - deepspeed/launcher/runner.py | 5 +++++ - 1 file changed, 5 insertions(+) +See https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 + +Note: Those are prefixes of variables to be included. 
+ +Author: Viktor Rehnberg (Chalmers University of Technology) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py -index 07d1713e..e9cd61b8 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator @@ -31,6 +26,3 @@ index 07d1713e..e9cd61b8 100755 EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] --- -2.39.3 - From 54ea8a3b65d023bafc50ff68f227848a061464cc Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Fri, 27 Feb 2026 16:32:08 +0100 Subject: [PATCH 03/12] Add dependencies --- ...ccelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb | 36 +++++++++ .../CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb | 58 ++++++++++++++ .../CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb | 77 +++++++++++++++++++ .../d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb | 27 +++++++ 4 files changed, 198 insertions(+) create mode 100644 easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb create mode 100644 easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb create mode 100644 easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb create mode 100644 easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb diff --git a/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb new file mode 100644 index 000000000000..ca32d9c6984e --- /dev/null +++ b/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb @@ -0,0 +1,36 @@ +easyblock = 'PythonBundle' + +name = 'accelerate' +version = '1.10.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://github.com/huggingface/accelerate' +description = """A simple way to launch, train, and use PyTorch models on almost any device and +distributed configuration, automatic mixed 
precision (including fp8), +and easy-to-configure FSDP and DeepSpeed support.""" + +toolchain = {'name': 'foss', 'version': '2023b'} + +dependencies = [ + ('Python', '3.11.5'), + ('Python-bundle-PyPI', '2023.10'), + ('SciPy-bundle', '2023.11'), + ('CUDA', '12.4.0', '', SYSTEM), + ('PyTorch-bundle', '2.3.0', versionsuffix), + ('PyYAML', '6.0.1'), + ('Safetensors', '0.4.4'), +] + +exts_list = [ + ('huggingface-hub', '0.30.2', { + 'sources': ['huggingface_hub-%(version)s.tar.gz'], + 'checksums': ['9a7897c5b6fd9dad3168a794a8998d6378210f5b9688d0dfc180b1a228dc2466'], + }), + (name, version, { + 'checksums': ['8270568fda9036b5cccdc09703fef47872abccd56eb5f6d53b54ea5fb7581496'], + }), +] + +sanity_check_commands = ['accelerate test'] + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb new file mode 100644 index 000000000000..ee1c2c569e94 --- /dev/null +++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb @@ -0,0 +1,58 @@ +easyblock = 'CMakeMake' + +name = 'CUTLASS' +version = '4.1.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://github.com/NVIDIA/cutlass' +description = """CUTLASS is a collection of CUDA C++ template +abstractions for implementing high-performance matrix-matrix +multiplication (GEMM) and related computations at all levels and scales +within CUDA. It incorporates strategies for hierarchical decomposition +and data movement similar to those used to implement cuBLAS and cuDNN. +CUTLASS decomposes these "moving parts" into reusable, modular software +components abstracted by C++ template classes. Primitives for different +levels of a conceptual parallelization hierarchy can be specialized and +tuned via custom tiling sizes, data types, and other algorithmic policy. 
+The resulting flexibility simplifies their use as building blocks within +custom kernels and applications.""" + +toolchain = {'name': 'foss', 'version': '2023b'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_LOWER_SOURCE] +sources = [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}] +patches = [ + 'CUTLASS-4.1.0_fix-version.patch', + 'CUTLASS-4.1.0_add-arch-guards-to-tests.patch', +] +checksums = [ + {'CUTLASS-4.1.0.tar.gz': '8d4675b11e9e5207e3940eaac0f46db934ada371cbb3627c9fda642d912b6230'}, + {'CUTLASS-4.1.0_fix-version.patch': 'e2c7f66e6fd298b3af5339e17c0c75ded7d726cdf6cde003f60263e27ae46495'}, + {'CUTLASS-4.1.0_add-arch-guards-to-tests.patch': + '81cd18d83bdedf3ed1f7add68bbff1635cf9d76bb9e184efbc62cd95caee4275'}, +] + +builddependencies = [ + ('CMake', '3.27.6'), + ('Python', '3.11.5'), +] + +dependencies = [ + ('CUDA', '12.4.0', '', SYSTEM), + ('cuDNN', '9.1.1.17', versionsuffix, SYSTEM), +] + +_copts = [ + '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"', + '-DCUTLASS_ENABLE_CUBLAS=1', + '-DCUTLASS_ENABLE_CUDNN=1', +] +configopts = ' '.join(_copts) + +sanity_check_paths = { + 'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT], + 'dirs': ['lib/cmake'], +} + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb new file mode 100644 index 000000000000..fe488db1d3ec --- /dev/null +++ b/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb @@ -0,0 +1,77 @@ +easyblock = 'PythonBundle' + +name = 'CuPy' +version = '13.6.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://cupy.dev' +description = "CuPy is an open-source array library accelerated with NVIDIA CUDA." 
+ +toolchain = {'name': 'foss', 'version': '2023b'} + +builddependencies = [ + ('hypothesis', '6.90.0'), + ('Cython', '3.0.10'), + ('setuptools', '80.9.0'), +] + +dependencies = [ + ('Python', '3.11.5'), + ('SciPy-bundle', '2023.11'), + ('CUDA', '12.4.0', '', SYSTEM), + ('NCCL', '2.20.5', versionsuffix), + ('cuTENSOR', '2.0.2.5', versionsuffix, SYSTEM), + ('cuSPARSELt', '0.8.0.4', versionsuffix, SYSTEM), # docs say 0.7.0 or 0.7.1 +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0'] + +exts_default_options = {'source_urls': [PYPI_LOWER_SOURCE]} + +_skip_tests = [ + '--ignore tests/example_tests', # examples are not included + '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_ltisys.py::Test_bode::test_from_state_space', + '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_fir_filter_design.py::TestFirls::test_firls', + '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_ltisys.py::TestPlacePoles::test_real_2', + # New failures in 13.6.0, they all seem to be related to on-the-fly compilation failing + '--deselect tests/cupy_tests/core_tests/test_raw.py::TestRawPicklable', # ::test_raw_picklable + '--deselect tests/cupy_tests/fft_tests/test_callback.py::Test1dCallbacks', +] + +# For testing with new versions of CuPy, please enable the slow testing setting below, +# but switch to the much lighter fast testing before submitting the .eb file, so users +# can install on GPUs with moderate RAM. 
+ +# _parallel_tests, _test_type = 4, 'not slow' +_parallel_tests, _test_type = 1, 'fast' + +exts_list = [ + ('fastrlock', '0.8.3', { + 'checksums': ['4af6734d92eaa3ab4373e6c9a1dd0d5ad1304e172b1521733c6c3b3d73c8fa5d'], + }), + ('cupy', version, { + 'patches': [ + 'cupy-13.0.0_cusparselt_0.6.0.patch', + 'cupy-13.0.0_eb_ccc.patch', + 'CuPy-13.6.0-Disable_TestRaw_with_nvcc_backend.patch', + ], + 'preinstallopts': 'CUPY_NUM_BUILD_JOBS=%(parallel)s EB_CCC="%(cuda_cc_cmake)s" ', + 'runtest': 'export CUPY_TEST_GPU_LIMIT=1 CUPY_CACHE_DIR="%%(builddir)s" && ' + 'pytest -n %s tests -k "%s" ' % (_parallel_tests, _test_type) + ' '.join(_skip_tests), + 'testinstall': True, + 'checksums': [ + {'cupy-13.6.0.tar.gz': '3cba30ae3dd32b5d5c6536e710cb98015227cd4ba83c46b3f1825a7ae55b6667'}, + {'cupy-13.0.0_cusparselt_0.6.0.patch': '09cb12d26e78079c50b06f17002bf54c66e5e4743b917c5a218d3fe90124d499'}, + {'cupy-13.0.0_eb_ccc.patch': 'bfe8b46344759f58491f55418bd9c856d6f72d681ee5fef12820009f808d2db1'}, + {'CuPy-13.6.0-Disable_TestRaw_with_nvcc_backend.patch': + '958d80059b085017ed8c8de55ed82a0d52fdf964482e8ccc13d401515979d4b7'}, + ], + }), +] + +sanity_check_commands = [ + "python -c 'import cupy'", +] + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb b/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb new file mode 100644 index 000000000000..afb81cf369ba --- /dev/null +++ b/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb @@ -0,0 +1,27 @@ +easyblock = 'CMakeMake' + +name = 'DLPack' +version = '1.2' + +homepage = 'https://dmlc.github.io/dlpack/latest/' +description = """DLPack is a stable in-memory data structure for an ndarray +system to interact with a variety of frameworks.""" + +toolchain = {'name': 'GCCcore', 'version': '13.2.0'} + +github_account = 'dmlc' +source_urls = [GITHUB_LOWER_SOURCE] +sources = ['v%(version)s.tar.gz'] +checksums = ['58284a3b004a48450c958a23b30274527ebaf35a061124bbd4193fffa45efbd6'] + 
+builddependencies = [ + ('binutils', '2.40'), + ('CMake', '3.27.6'), +] + +sanity_check_paths = { + 'files': ['include/dlpack/dlpack.h', 'lib/cmake/dlpack/dlpackConfig.cmake'], + 'dirs': [], +} + +moduleclass = 'lib' From 32629970d9261ff0149aefaa7505d8d60bc2c23e Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 2 Mar 2026 15:43:46 +0100 Subject: [PATCH 04/12] Add dependencies --- .../cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb | 35 ++++++++++++++++ .../cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb | 40 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb create mode 100644 easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb diff --git a/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb new file mode 100644 index 000000000000..e59c12b3bf44 --- /dev/null +++ b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb @@ -0,0 +1,35 @@ +easyblock = 'Tarball' + +name = 'cuSPARSELt' +version = '0.8.0.4' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html' +description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in +which at least one operand is a sparse matrix""" + +toolchain = SYSTEM + +local_cudamajver = '12' + +local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH) +source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch] +sources = ['libcusparse_lt-linux-%s-%%(version)s_cuda%s-archive.tar.xz' % + (local_arch, local_cudamajver)] +checksums = [{ + 'libcusparse_lt-linux-x86_64-%%(version)s_cuda%s-archive.tar.xz' % local_cudamajver: + '483954591766bade877becef126d53908d5fef5d7468b503736af37388669c08', + 'libcusparse_lt-linux-sbsa-%%(version)s_cuda%s-archive.tar.xz' % local_cudamajver: + 
'b59e2f8ffd154b156b2d74ccd7cad7775385693bec8cb9562596060072c515f2', +}] + +dependencies = [('CUDA', '12.4.0')] + +sanity_check_paths = { + 'files': ['include/cusparseLt.h', + 'lib/libcusparseLt.%s' % SHLIB_EXT, + 'lib/libcusparseLt_static.a'], + 'dirs': [], +} + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb new file mode 100644 index 000000000000..dd4624817a71 --- /dev/null +++ b/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb @@ -0,0 +1,40 @@ +easyblock = 'Tarball' + +name = 'cuTENSOR' +version = '2.0.2.5' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://developer.nvidia.com/cutensor' +description = """The cuTENSOR Library is a GPU-accelerated tensor linear algebra library providing tensor contraction, + reduction and elementwise operations.""" + +toolchain = SYSTEM + +source_urls = [ + 'https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-%(arch)s/' +] +sources = ['libcutensor-linux-%(arch)s-%(version)s-archive.tar.xz'] + +checksums = [{ + 'libcutensor-linux-sbsa-%(version)s-archive.tar.xz': + '5163dd40f11f328e469a6d9b0056c8346f5d59ed538c18d6b954e4ae657c69cc', + 'libcutensor-linux-x86_64-%(version)s-archive.tar.xz': + '0e957ae7b352f599de34b6fa1ba999b0617887f885d7436ac5737d71a6b83baa', +}] + +local_cudamajver = '12' +dependencies = [('CUDA', '12.4.0')] + +sanity_check_paths = { + 'files': ['include/cutensor.h', 'include/cutensor/types.h', + 'lib/%s/libcutensor.%s' % (local_cudamajver, SHLIB_EXT), + 'lib/%s/libcutensor_static.a' % local_cudamajver], + 'dirs': [], +} + +modextrapaths = { + 'LD_LIBRARY_PATH': ['lib/%s' % local_cudamajver], + 'LIBRARY_PATH': ['lib/%s' % local_cudamajver], +} + +moduleclass = 'lib' From 74c7cba943c8f010a3a67b76d6aac619e111f092 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 2 Mar 2026 17:21:25 +0100 Subject: [PATCH 05/12] Update patches and don't 
precompile EvoformerAttn --- .../DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 13 ++++++++----- ...ep.patch => DeepSpeed-0.14.5_no-ninja-dep.patch} | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) rename easybuild/easyconfigs/d/DeepSpeed/{DeepSpeed-0.14.2_no-ninja-dep.patch => DeepSpeed-0.14.5_no-ninja-dep.patch} (97%) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index 44fac2bab220..a3f2ead2c017 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -65,8 +65,8 @@ exts_list = [ # Test suite not available on pypi 'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}], 'patches': [ - 'DeepSpeed-0.14.2_no-ninja-dep.patch', 'DeepSpeed-0.14.5_avoid-access-to-home.patch', + 'DeepSpeed-0.14.5_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pdsh-env-vars.patch', 'DeepSpeed-0.14.5_pic-compile.patch', 'DeepSpeed-0.14.5_test-nvme-offload.patch', @@ -74,10 +74,10 @@ exts_list = [ ], 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, - {'DeepSpeed-0.14.2_no-ninja-dep.patch': - '681b4e7b9101a3decbd48f63fcb55b5262a10cf411a1c107a951ff58c1aa93cd'}, {'DeepSpeed-0.14.5_avoid-access-to-home.patch': '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'}, + {'DeepSpeed-0.14.5_no-ninja-dep.patch': + 'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': '02556620ac643d273a2fa9c019d437cd874a6c19759fa59baaa0e9a41d0a5240'}, {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'}, @@ -88,8 +88,11 @@ exts_list = [ ], 'jit_only_ops': [ 'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS', - # Cannot be prebuild if "Transformer" OP is also 
built: https://github.com/deepspeedai/DeepSpeed/issues/949 - # 'STOCHASTIC_TRANSFORMER', + # Cannot be prebuilt if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949 + 'STOCHASTIC_TRANSFORMER', + # Cannot be prebuilt in several scenarious, e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present: + # See https://github.com/deepspeedai/DeepSpeed/pull/7760 + 'EVOFORMER_ATTN', ], 'testinstall': True, 'runtest': ' && '.join(( diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch similarity index 97% rename from easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch rename to easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch index 3fdde8270575..d4e2a0d66997 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch @@ -22,7 +22,7 @@ diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py diff --git a/op_builder/builder.py b/op_builder/builder.py --- a/op_builder/builder.py +++ b/op_builder/builder.py -@@ -484,9 +484,7 @@ class OpBuilder(ABC): +@@ -533,9 +533,7 @@ class OpBuilder(ABC): raise RuntimeError( f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. 
{self.error_log}" ) @@ -63,5 +63,5 @@ index 80c9f9b3..eed77fa3 100755 hjson -ninja numpy + nvidia-ml-py packaging>=20.0 - psutil From ea062e82669ea72d62e50fde78e1724a78dbb721 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 4 Mar 2026 12:34:48 +0100 Subject: [PATCH 06/12] Fix line length --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index a3f2ead2c017..10d7991ebda2 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -90,7 +90,8 @@ exts_list = [ 'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS', # Cannot be prebuilt if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949 'STOCHASTIC_TRANSFORMER', - # Cannot be prebuilt in several scenarious, e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present: + # Cannot be prebuilt in several scenarious, + # e.g. 
multiple GPU archs (cuda-compute-capabilities), no GPU present: # See https://github.com/deepspeedai/DeepSpeed/pull/7760 'EVOFORMER_ATTN', ], From 038c35ae2b87abf52bd363b1a924cc916695a50f Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 18 Mar 2026 08:16:48 +0100 Subject: [PATCH 07/12] Update patch and test config --- .../DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 4 ++-- .../DeepSpeed-0.14.5_avoid-access-to-home.patch | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index 10d7991ebda2..4d1143477835 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -75,7 +75,7 @@ exts_list = [ 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, {'DeepSpeed-0.14.5_avoid-access-to-home.patch': - '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'}, + 'edb39720a27b74170c87c8c51ecb8be6fd6fe2fa346f2a10b343a73884c5c412'}, {'DeepSpeed-0.14.5_no-ninja-dep.patch': 'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': @@ -99,7 +99,7 @@ exts_list = [ 'runtest': ' && '.join(( 'ln -s $PWD/tests/ ../tests', 'cd ..', - f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}'", + f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}' --durations=0", )), }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch index 85f967a3388b..35704b40e883 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch +++ 
b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch @@ -17,15 +17,14 @@ set to an NFS directory, a warning will be printed whereas prior to this change no warning would be printed fixes #6486 ---- - .../ops/transformer/inference/triton/matmul_ext.py | 14 +++++++++----- - 1 file changed, 9 insertions(+), 5 deletions(-) + +Adapted to use $TRITON_HOME by Alexander Grund (TU Dresden) diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py index c77d8a8e11c0..412c8740a216 100644 --- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py +++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py -@@ -40,13 +40,17 @@ class TritonCacheDir: +@@ -40,13 +40,18 @@ class TritonCacheDir: _warning_printed = False @staticmethod @@ -43,7 +42,8 @@ index c77d8a8e11c0..412c8740a216 100644 + + @staticmethod + def default_cache_dir(): -+ tmp_path = os.path.join(Path.home(), ".triton", "autotune") ++ tt_home = os.environ.get('TRITON_HOME') or os.path.join(Path.home(), ".triton") ++ tmp_path = os.path.join(tt_home, "autotune") return tmp_path From d075d93e97b0726fa9cc0294fd01ddb22e56de06 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 23 Mar 2026 12:47:53 +0100 Subject: [PATCH 08/12] Use CUTLASS checkout and pytest-forked --- ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 26 +++++++-- .../DeepSpeed-0.14.5_use-eb-cutlass.patch | 55 ------------------- .../pytest-forked-1.6.0-GCCcore-13.2.0.eb | 22 ++++++++ 3 files changed, 42 insertions(+), 61 deletions(-) delete mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch create mode 100644 easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index 4d1143477835..f36255209a85 100644 --- 
a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -14,13 +14,13 @@ toolchain = {'name': 'foss', 'version': '2023b'} builddependencies = [ ('Ninja', '1.11.1'), ('Transformers', '4.44.0'), + ('pytest-forked', '1.6.0'), ] local_pytorch_version = '2.3.0' dependencies = [ ('Python', '3.11.5'), ('CUDA', '12.4.0', '', SYSTEM), ('NCCL', '2.20.5', '-CUDA-%(cudaver)s'), - ('CUTLASS', '4.1.0', '-CUDA-%(cudaver)s'), ('CuPy', '13.6.0', '-CUDA-%(cudaver)s'), ('Triton', '2.3.1', '-CUDA-%(cudaver)s'), ('accelerate', '1.10.0', '-CUDA-%(cudaver)s'), @@ -44,6 +44,19 @@ local_excluded_ds_tests = ( 'test_fp_quant[256-qbits8-bf16]', # Error of 0.00909423828125 > 0.004 ) +components = [ + ('CUTLASS', '4.1.0', { + 'easyblock': 'Tarball', + 'source_urls': ['https://github.com/NVIDIA/cutlass/archive/refs/tags'], + 'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}], + 'start_dir': '%(namelower)s-%(version)s', + 'target_dir': 'extra/cutlass', + }), +] + +local_cutlass_path = '%(installdir)s/extra/cutlass' +local_cutlass_opt = f"export CUTLASS_PATH='{local_cutlass_path}' && " + github_account = 'microsoft' exts_list = [ ('hjson', '3.1.0', { @@ -70,7 +83,6 @@ exts_list = [ 'DeepSpeed-0.14.5_pdsh-env-vars.patch', 'DeepSpeed-0.14.5_pic-compile.patch', 'DeepSpeed-0.14.5_test-nvme-offload.patch', - 'DeepSpeed-0.14.5_use-eb-cutlass.patch', ], 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, @@ -83,18 +95,16 @@ exts_list = [ {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'}, {'DeepSpeed-0.14.5_test-nvme-offload.patch': '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'}, - {'DeepSpeed-0.14.5_use-eb-cutlass.patch': - '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'}, ], 'jit_only_ops': [ 'SPARSE_ATTN', 
'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS', - # Cannot be prebuilt if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949 - 'STOCHASTIC_TRANSFORMER', # Cannot be prebuilt in several scenarious, # e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present: # See https://github.com/deepspeedai/DeepSpeed/pull/7760 'EVOFORMER_ATTN', ], + 'preinstallopts': local_cutlass_opt, + 'pretestopts': local_cutlass_opt, 'testinstall': True, 'runtest': ' && '.join(( 'ln -s $PWD/tests/ ../tests', @@ -104,4 +114,8 @@ exts_list = [ }), ] +modextravars = { + 'CUTLASS_PATH': local_cutlass_path, +} + moduleclass = 'ai' diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch deleted file mode 100644 index 35fe2cb8b66a..000000000000 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 27a64a22d6f84585ce9685e72ed9000f569ba941 Mon Sep 17 00:00:00 2001 -From: Viktor Rehnberg -Date: Thu, 10 Oct 2024 05:57:38 +0000 -Subject: [PATCH] Use EB env vars to search for CUTLASS - -Instead of needing to set CUTLASS_PATH to the cutlass source directory, -use EBROOTCUTLASS to find the installation directory. 
---- - op_builder/evoformer_attn.py | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py -index af3aa742..5545b1aa 100644 ---- a/op_builder/evoformer_attn.py -+++ b/op_builder/evoformer_attn.py -@@ -5,6 +5,7 @@ - - from .builder import CUDAOpBuilder, installed_cuda_version - import os -+from packaging.version import Version - - - class EvoformerAttnBuilder(CUDAOpBuilder): -@@ -14,7 +15,8 @@ class EvoformerAttnBuilder(CUDAOpBuilder): - def __init__(self, name=None): - name = self.NAME if name is None else name - super().__init__(name=name) -- self.cutlass_path = os.environ.get('CUTLASS_PATH') -+ self.cutlass_path = os.environ.get('EBROOTCUTLASS') -+ self.cutlass_version = os.environ.get('EBVERSIONCUTLASS') - - def absolute_name(self): - return f'deepspeed.ops.{self.NAME}_op' -@@ -50,13 +52,12 @@ class EvoformerAttnBuilder(CUDAOpBuilder): - return False - if self.cutlass_path is None: - if verbose: -- self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH") -+ self.warning("EBROOTCUTLASS not set, please load CUTLASS module.") -+ return False -+ if Version(self.cutlass_version) < Version('3.1.0'): -+ if verbose: -+ self.warning(f"Please use CUTLASS version >= 3.1.0, detected {self.cutlass_version}") - return False -- with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f: -- if '3.1.0' not in f.read(): -- if verbose: -- self.warning("Please use CUTLASS version >= 3.1.0") -- return False - cuda_okay = True - if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda - sys_cuda_major, _ = installed_cuda_version() --- -2.39.3 - diff --git a/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb b/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb new file mode 100644 index 000000000000..56cbb558a237 --- /dev/null +++ 
b/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb @@ -0,0 +1,22 @@ +easyblock = 'PythonPackage' + +name = 'pytest-forked' +version = '1.6.0' + +homepage = 'https://github.com/pytest-dev/pytest-forked' +description = "Run tests in isolated forked subprocesses." + +toolchain = {'name': 'GCCcore', 'version': '13.2.0'} + +sources = [SOURCE_TAR_GZ] +checksums = ['4dafd46a9a600f65d822b8f605133ecf5b3e1941ebb3588e943b4e3eb71a5a3f'] + +builddependencies = [ + ('binutils', '2.40'), +] +dependencies = [ + ('Python', '3.11.5'), + ('Python-bundle-PyPI', '2023.10'), +] + +moduleclass = 'tools' From ee377bcecf278fe48da009bc425e46520fd0167b Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 23 Mar 2026 16:47:56 +0100 Subject: [PATCH 09/12] Don't use pytest-forked --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index f36255209a85..350de6028eb1 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -14,7 +14,6 @@ toolchain = {'name': 'foss', 'version': '2023b'} builddependencies = [ ('Ninja', '1.11.1'), ('Transformers', '4.44.0'), - ('pytest-forked', '1.6.0'), ] local_pytorch_version = '2.3.0' dependencies = [ @@ -49,6 +48,7 @@ components = [ 'easyblock': 'Tarball', 'source_urls': ['https://github.com/NVIDIA/cutlass/archive/refs/tags'], 'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}], + 'checksums': ['8d4675b11e9e5207e3940eaac0f46db934ada371cbb3627c9fda642d912b6230'], 'start_dir': '%(namelower)s-%(version)s', 'target_dir': 'extra/cutlass', }), From b580e99e08e80e129d5b94d5be78ae676096ee5c Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 23 Mar 2026 17:02:48 
+0100 Subject: [PATCH 10/12] Skip test --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 1 + 1 file changed, 1 insertion(+) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index 350de6028eb1..2a6e654d959d 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -41,6 +41,7 @@ local_excluded_ds_tests = ( 'TestCometMonitor', 'TestQuantizedInt', # Downloads model from internet 'test_fp_quant[256-qbits8-bf16]', # Error of 0.00909423828125 > 0.004 + 'test_DS4Sci_EvoformerAttention[tensor_shape1-dtype1]', # Error of 0.05859375 > 0.05 ) components = [ From 16dd461c27915278978df539d689da40cec5d921 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 18 May 2026 12:46:51 +0200 Subject: [PATCH 11/12] Fix test failure --- ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 5 ++ ...epSpeed-0.14.5_fix-test-parameterize.patch | 82 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index 2a6e654d959d..fac73f793108 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -80,15 +80,19 @@ exts_list = [ 'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}], 'patches': [ 'DeepSpeed-0.14.5_avoid-access-to-home.patch', + 'DeepSpeed-0.14.5_fix-test-parameterize.patch', 'DeepSpeed-0.14.5_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pdsh-env-vars.patch', 'DeepSpeed-0.14.5_pic-compile.patch', 'DeepSpeed-0.14.5_test-nvme-offload.patch', + 
'debug14.patch', ], 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, {'DeepSpeed-0.14.5_avoid-access-to-home.patch': 'edb39720a27b74170c87c8c51ecb8be6fd6fe2fa346f2a10b343a73884c5c412'}, + {'DeepSpeed-0.14.5_fix-test-parameterize.patch': + '1df9c7ceeca0b37aff85390b7bd25e266ddf88c5b1380980e6c13a064840d1d8'}, {'DeepSpeed-0.14.5_no-ninja-dep.patch': 'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': @@ -110,6 +114,7 @@ exts_list = [ 'runtest': ' && '.join(( 'ln -s $PWD/tests/ ../tests', 'cd ..', + 'export DS_UNITTEST_TIMEOUT=1200', f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}' --durations=0", )), }), diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch new file mode 100644 index 000000000000..8776d6e3d61a --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch @@ -0,0 +1,82 @@ +Avoid this failure during tests: +> assert not 'unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambFP16Pipeline::test[topo_config0]' + +Reason is that pytest-xdist doesn't seem to work well with having a dict in `pytest.mark.parameterize` +See https://github.com/pytest-dev/pytest-xdist/issues/922 + +It is not required at all here +diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py +--- a/tests/unit/runtime/half_precision/onebit/test_onebit.py ++++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py +@@ -37,6 +37,8 @@ if get_accelerator().device_name() == 'hpu': + pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True) + + ++topo_config = {"num_pp": 2, "num_dp": 2} ++ + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) + class 
TestOneBitAdamBasic(DistributedTest): + world_size = 2 +@@ -342,19 +344,10 @@ class TestOneBitAdamCheckpointing(DistributedTest): + model.save_checkpoint(save_folder, tag=None) + + +-@pytest.mark.parametrize( +- "topo_config", +- [ +- { +- "num_pp": 2, +- "num_dp": 2 +- }, +- ], +-) + class TestOneBitAdamFP16Pipeline(DistributedTest): + world_size = 4 + +- def test(self, topo_config): ++ def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { +@@ -709,19 +702,10 @@ class TestZeroOneAdamCheckpointing(DistributedTest): + model.save_checkpoint(save_folder, tag=None) + + +-@pytest.mark.parametrize( +- "topo_config", +- [ +- { +- "num_pp": 2, +- "num_dp": 2 +- }, +- ], +-) + class TestZeroOneAdamFP16Pipeline(DistributedTest): + world_size = 4 + +- def test(self, topo_config): ++ def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { +@@ -1105,19 +1089,10 @@ class TestOneBitLambCheckpointing(DistributedTest): + model.save_checkpoint(save_folder, tag=None) + + +-@pytest.mark.parametrize( +- "topo_config", +- [ +- { +- "num_pp": 2, +- "num_dp": 2 +- }, +- ], +-) + class TestOneBitLambFP16Pipeline(DistributedTest): + world_size = 4 + +- def test(self, topo_config): ++ def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { From 3e61dd519fdd0e30647db4d06980b79120c9ae15 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 18 May 2026 16:59:18 +0200 Subject: [PATCH 12/12] Fix version --- .../DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb index fac73f793108..c4869f07bd0b 100644 --- 
a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb @@ -85,7 +85,6 @@ exts_list = [ 'DeepSpeed-0.14.5_pdsh-env-vars.patch', 'DeepSpeed-0.14.5_pic-compile.patch', 'DeepSpeed-0.14.5_test-nvme-offload.patch', - 'debug14.patch', ], 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, @@ -108,7 +107,14 @@ exts_list = [ # See https://github.com/deepspeedai/DeepSpeed/pull/7760 'EVOFORMER_ATTN', ], - 'preinstallopts': local_cutlass_opt, + 'preinstallopts': ' && '.join(( + # Use this version and no suffix + 'echo "%(version)s" > version.txt', + 'echo "" > build.txt', + # Disable use of git during build + 'sed -i "s/command_exists(\'git\')/False/" setup.py', + local_cutlass_opt, + )), 'pretestopts': local_cutlass_opt, 'testinstall': True, 'runtest': ' && '.join((