From 29ef90febcbc9153477732b13e6a7cc3c290ef56 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 27 Feb 2026 15:53:04 +0100
Subject: [PATCH 01/12] adding easyconfigs:
 DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb and patches:
 DeepSpeed-0.14.2_no-ninja-dep.patch,
 DeepSpeed-0.14.5_avoid-access-to-home.patch,
 DeepSpeed-0.14.5_pdsh-env-vars.patch, DeepSpeed-0.14.5_pic-compile.patch,
 DeepSpeed-0.14.5_test-nvme-offload.patch,
 DeepSpeed-0.14.5_use-eb-cutlass.patch

---
 .../DeepSpeed-0.14.2_no-ninja-dep.patch       |  57 +++++++
 ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 102 +++++++++++++
 ...eepSpeed-0.14.5_avoid-access-to-home.patch |  60 ++++++++
 .../DeepSpeed-0.14.5_pdsh-env-vars.patch      |  36 +++++
 .../DeepSpeed-0.14.5_pic-compile.patch        | 141 ++++++++++++++++++
 .../DeepSpeed-0.14.5_test-nvme-offload.patch  | 135 +++++++++++++++++
 .../DeepSpeed-0.14.5_use-eb-cutlass.patch     |  55 +++++++
 7 files changed, 586 insertions(+)
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
new file mode 100644
index 000000000000..8a51596fb3b6
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
@@ -0,0 +1,57 @@
+Patch away dependency on ninja python package by falling back to checking
+returncode of `ninja --version`.
+
+Author: Viktor Rehnberg (Chalmers University of Technology)
+
+
+diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
+index 85a2f9b2..8bb64626 100644
+--- a/deepspeed/env_report.py
++++ b/deepspeed/env_report.py
+@@ -62,7 +62,7 @@ def ninja_installed():
+     try:
+         import ninja  # noqa: F401 # type: ignore
+     except ImportError:
+-        return False
++        return (subprocess.run(["ninja", "--version"]).returncode == 0)
+     return True
+ 
+ 
+diff --git a/op_builder/builder.py b/op_builder/builder.py
+index 8dc825c7..970d18b2 100644
+--- a/op_builder/builder.py
++++ b/op_builder/builder.py
+@@ -487,7 +487,8 @@ class OpBuilder(ABC):
+         try:
+             import ninja  # noqa: F401 # type: ignore
+         except ImportError:
+-            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
++            if subprocess.run(["ninja", "--version"]).returncode != 0:
++                raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ 
+         if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
+             self.build_for_cpu = not torch.cuda.is_available()
+diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
+index 81b15f19..cf0a1cc0 100644
+--- a/op_builder/xpu/builder.py
++++ b/op_builder/xpu/builder.py
+@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder):
+         try:
+             import ninja  # noqa: F401
+         except ImportError:
+-            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
++            if subprocess.run(["ninja", "--version"]).returncode != 0:
++                raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ 
+         self.jit_mode = True
+         from intel_extension_for_pytorch.xpu.cpp_extension import load
+diff --git a/requirements/requirements.txt b/requirements/requirements.txt
+index 80c9f9b3..eed77fa3 100755
+--- a/requirements/requirements.txt
++++ b/requirements/requirements.txt
+@@ -1,5 +1,4 @@
+ hjson
+-ninja
+ numpy
+ packaging>=20.0
+ psutil
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..eac19e66f317
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,102 @@
+easyblock = 'PythonBundle'
+
+name = 'DeepSpeed'
+version = '0.14.5'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'http://www.deepspeed.ai/'
+description = """
+DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective.
+"""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+builddependencies = [
+    ('Ninja', '1.11.1'),
+    ('Transformers', '4.44.0'),
+]
+local_pytorch_version = '2.3.0'
+dependencies = [
+    ('Python', '3.11.5'),
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('NCCL', '2.20.5', '-CUDA-%(cudaver)s'),
+    ('CUTLASS', '4.1.0', '-CUDA-%(cudaver)s'),
+    ('CuPy', '13.6.0', '-CUDA-%(cudaver)s'),
+    ('Triton', '2.3.1', '-CUDA-%(cudaver)s'),
+    ('accelerate', '1.10.0', '-CUDA-%(cudaver)s'),
+    ('PyTorch', local_pytorch_version, '-CUDA-%(cudaver)s'),
+    ('PyTorch-bundle', local_pytorch_version, '-CUDA-%(cudaver)s'),
+    ('mpi4py', '3.1.5'),
+    ('DLPack', '1.2'),
+    ('py-cpuinfo', '9.0.0'),
+    ('pydantic', '2.7.4'),
+    ('tqdm', '4.66.2'),
+    ('pdsh', '2.36'),
+    ('Seaborn', '0.13.2'),  # required by the mup extension below
+    ('libaio', '0.3.113'),  # for the async_io op; TODO: confirm whether a build dependency would suffice
+]
+
+local_excluded_ds_tests = (
+    'TestTensorBoard',
+    'TestWandb',
+    'TestCometMonitor',
+    'TestQuantizedInt',  # Downloads a model from the internet
+    'test_fp_quant[256-qbits8-bf16]',  # Fails: numeric error 0.00909423828125 exceeds the 0.004 tolerance
+)
+
+github_account = 'microsoft'
+exts_list = [
+    ('hjson', '3.1.0', {
+        'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'],
+    }),
+    ('nvidia-ml-py', '12.535.161', {
+        'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'],
+        'modulename': 'pynvml',
+    }),
+    ('mup', '1.0.0', {
+        'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'],
+    }),
+    ('qtorch', '0.3.0', {
+        'checksums': ['3fc2e9b27d58d18304ac46511ea03a3eb20f852944f6a5b6ef71b974c2da20bf'],
+        'preinstallopts': "TORCH_CUDA_ARCH_LIST='%(cuda_cc_semicolon_sep)s' ",
+    }),
+    ('DeepSpeed', '0.14.5', {
+        'source_urls': [GITHUB_SOURCE],
+        # Use the GitHub tarball: the test suite is not available in the PyPI release
+        'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
+        'patches': [
+            'DeepSpeed-0.14.2_no-ninja-dep.patch',
+            'DeepSpeed-0.14.5_avoid-access-to-home.patch',
+            'DeepSpeed-0.14.5_pdsh-env-vars.patch',
+            'DeepSpeed-0.14.5_pic-compile.patch',
+            'DeepSpeed-0.14.5_test-nvme-offload.patch',
+            'DeepSpeed-0.14.5_use-eb-cutlass.patch',
+        ],
+        'checksums': [
+            {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
+            {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'},
+            {'DeepSpeed-0.14.5_avoid-access-to-home.patch':
+             '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'},
+            {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
+             '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'},
+            {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
+            {'DeepSpeed-0.14.5_test-nvme-offload.patch':
+             '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'},
+            {'DeepSpeed-0.14.5_use-eb-cutlass.patch':
+             '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'},
+        ],
+        'jit_only_ops': [
+            'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS',
+            # Cannot prebuild with transformer OP: https://github.com/deepspeedai/DeepSpeed/issues/949
+            # 'STOCHASTIC_TRANSFORMER',
+        ],
+        'testinstall': True,
+        'runtest': ' && '.join((
+            'ln -s $PWD/tests/ ../tests',
+            'cd ..',
+            f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}'",
+        )),
+    }),
+]
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
new file mode 100644
index 000000000000..85f967a3388b
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
@@ -0,0 +1,60 @@
+From 9d17116fcdb44b81eb00d3bce91431dc35cd69b1 Mon Sep 17 00:00:00 2001
+From: "Joshua C. Randall" <jcrandall@alum.mit.edu>
+Date: Wed, 4 Sep 2024 19:22:07 +0100
+Subject: [PATCH] print warning if actual triton cache dir is on NFS, not just
+ for default (#6487)
+
+move the logic that prints a warning when triton cache dir is on NFS to
+act on the actual calculated cache_dir rather than on the default.
+
+this means that:
+- when the default directory (in the user's home directory) is on NFS
+but `TRITON_CACHE_DIR` is set to a non-NFS directory, no warning will be
+printed whereas prior to this change a spurious and confusing warning
+was printed
+- when the user's home directory is not on NFS but `TRITON_CACHE_DIR` is
+set to an NFS directory, a warning will be printed whereas prior to this
+change no warning would be printed
+
+fixes #6486
+---
+ .../ops/transformer/inference/triton/matmul_ext.py | 14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
+index c77d8a8e11c0..412c8740a216 100644
+--- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py
++++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
+@@ -40,13 +40,17 @@ class TritonCacheDir:
+     _warning_printed = False
+ 
+     @staticmethod
+-    def default_cache_dir():
+-        tmp_path = os.path.join(Path.home(), ".triton", "autotune")
+-        if is_nfs_path(tmp_path) and not TritonCacheDir._warning_printed:
++    def warn_if_nfs(cache_dir):
++        if is_nfs_path(cache_dir) and not TritonCacheDir._warning_printed:
+             print(
+-                f"Warning: The default cache directory for DeepSpeed Triton autotune, {tmp_path}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path."
++                f"Warning: The cache directory for DeepSpeed Triton autotune, {cache_dir}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path."
+             )
+             TritonCacheDir._warning_printed = True
++        return
++
++    @staticmethod
++    def default_cache_dir():
++        tmp_path = os.path.join(Path.home(), ".triton", "autotune")
+         return tmp_path
+ 
+ 
+@@ -80,9 +84,9 @@ def __init__(self, key):
+         self.lock_path = None
+         # if caching is enabled, get the lock and bin path
+         self.cache_dir = os.environ.get('TRITON_CACHE_DIR', TritonCacheDir.default_cache_dir())
++        TritonCacheDir.warn_if_nfs(self.cache_dir)
+         if self.cache_dir:
+             os.makedirs(self.cache_dir, exist_ok=True)
+-        if self.cache_dir:
+             self.file_path = os.path.join(self.cache_dir, self.key + ".pickle")
+             self.lock_path = self.file_path + ".lock"
+ 
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
new file mode 100644
index 000000000000..9d4342f66bcb
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
@@ -0,0 +1,36 @@
+From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
+Date: Wed, 25 Sep 2024 09:29:23 +0000
+Subject: [PATCH] Add software relevant environment variables
+
+The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is
+not included in these exports then the python .so file may not be found.
+Also including what seemed important and was added from loading DeepSpeed.
+(Couldn't add everything, then argumet list becomes too long).
+
+See
+ - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
+for more details.
+---
+ deepspeed/launcher/runner.py | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
+index 07d1713e..e9cd61b8 100755
+--- a/deepspeed/launcher/runner.py
++++ b/deepspeed/launcher/runner.py
+@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator
+ 
+ DLTS_HOSTFILE = "/job/hostfile"
+ EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX']
++EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed
++    'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA',  # important
++    'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL',
++    'PKG_CONFIG_PATH', 'XDG_DATA_DIRS',
++]
+ EXPORT_ENVS += NEBULA_EXPORT_ENVS
+ DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
+ DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
new file mode 100644
index 000000000000..707bc826e889
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
@@ -0,0 +1,141 @@
+From 90afd671dadf9fd6a7a221428f2c04c16d637494 Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
+Date: Thu, 23 May 2024 07:09:53 +0000
+Subject: [PATCH] Compile with PIC
+
+---
+ op_builder/builder.py     | 15 ++++++++++-----
+ op_builder/cpu/builder.py |  3 ++-
+ op_builder/fused_adam.py  |  4 +++-
+ op_builder/fused_lamb.py  |  4 +++-
+ op_builder/fused_lion.py  |  4 +++-
+ op_builder/xpu/builder.py |  3 ++-
+ 6 files changed, 23 insertions(+), 10 deletions(-)
+
+diff --git a/op_builder/builder.py b/op_builder/builder.py
+index ec7566aa..f08e1799 100644
+--- a/op_builder/builder.py
++++ b/op_builder/builder.py
+@@ -288,13 +288,13 @@ class OpBuilder(ABC):
+         '''
+         Returns optional list of compiler flags to forward to nvcc when building CUDA sources
+         '''
+-        return []
++        return ['-Xcompiler', '-fPIC']
+ 
+     def cxx_args(self):
+         '''
+         Returns optional list of compiler flags to forward to the build
+         '''
+-        return []
++        return ['-fPIC']
+ 
+     def is_compatible(self, verbose=True):
+         '''
+@@ -746,15 +746,18 @@ class CUDAOpBuilder(OpBuilder):
+             )
+ 
+     def cxx_args(self):
++        args = super().cxx_args()
+         if sys.platform == "win32":
+-            return ['-O2']
++            args += ['-O2']
+         else:
+-            return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
++            args += ['-O3', '-std=c++17', '-g', '-Wno-reorder']
++        return args
+ 
+     def nvcc_args(self):
+         if self.build_for_cpu:
+             return []
+-        args = ['-O3']
++        args = super().nvcc_args()
++        args += ['-O3']
+         if self.is_rocm_pytorch():
+             ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+             args += [
+@@ -835,6 +838,8 @@ class TorchCPUOpBuilder(CUDAOpBuilder):
+                 '-lcublas',
+                 '-g',
+             ]
++        else:
++            args += super(CUDAOpBuilder, self).cxx_args()
+ 
+         CPU_ARCH = self.cpu_arch()
+         SIMD_WIDTH = self.simd_width()
+diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py
+index d881842a..dfc5a31d 100644
+--- a/op_builder/cpu/builder.py
++++ b/op_builder/cpu/builder.py
+@@ -30,7 +30,8 @@ class CPUOpBuilder(OpBuilder):
+         return cpp_ext
+ 
+     def cxx_args(self):
+-        args = ['-O3', '-g', '-Wno-reorder']
++        args = super().cxx_args()
++        args += ['-O3', '-g', '-Wno-reorder']
+         CPU_ARCH = self.cpu_arch()
+         SIMD_WIDTH = self.simd_width()
+         args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH]
+diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py
+index ac6e4eea..0c723572 100644
+--- a/op_builder/fused_adam.py
++++ b/op_builder/fused_adam.py
+@@ -29,7 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+ 
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if not self.is_rocm_pytorch():
+             nvcc_flags.extend(
+                 ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
+diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py
+index f0cb5577..a59b97d4 100644
+--- a/op_builder/fused_lamb.py
++++ b/op_builder/fused_lamb.py
+@@ -29,7 +29,9 @@ class FusedLambBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+ 
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if self.is_rocm_pytorch():
+             ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+             nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
+diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py
+index b900a8f2..119232b5 100644
+--- a/op_builder/fused_lion.py
++++ b/op_builder/fused_lion.py
+@@ -29,7 +29,9 @@ class FusedLionBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+ 
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if not self.is_rocm_pytorch():
+             nvcc_flags.extend(
+                 ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
+diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
+index f430b7b6..5a1a2219 100644
+--- a/op_builder/xpu/builder.py
++++ b/op_builder/xpu/builder.py
+@@ -52,7 +52,8 @@ class SYCLOpBuilder(OpBuilder):
+         return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+ 
+     def cxx_args(self):
+-        cxx_flags = [
++        cxx_flags = super().cxx_args()
++        cxx_flags += [
+             '-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64',
+             '-fno-strict-aliasing'
+         ]
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch
new file mode 100644
index 000000000000..dcff709f2ce1
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch
@@ -0,0 +1,135 @@
+From ddbf7ab23ce2e83747ff6a1482ac512e06da82ca Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
+Date: Mon, 4 Nov 2024 15:31:55 +0100
+Subject: [PATCH] Fix quantization tests
+
+NVME tests didn't always run because the hard-coded nvme_path wasn't
+always writable. This commit changed to use tmp_path fixture instead and
+disabled distributed test to avoid thread locks hanging.
+---
+ .../quantization/test_intX_quantization.py    | 43 ++++++++++---------
+ 1 file changed, 22 insertions(+), 21 deletions(-)
+
+diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py
+index 77b51fcd..9e0d7ac0 100644
+--- a/tests/unit/inference/quantization/test_intX_quantization.py
++++ b/tests/unit/inference/quantization/test_intX_quantization.py
+@@ -17,6 +17,7 @@ from transformers import AutoConfig, OPTConfig, AutoModel
+ import pytest
+ from collections import OrderedDict
+ from typing import Dict
++from pathlib import Path
+ 
+ device = get_accelerator().device_name() if get_accelerator().is_available() else 'cpu'
+ 
+@@ -53,11 +54,11 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int):
+     assert mean_diff < 0.15 and max_diff < 0.5, f'Numeric error exceed threshold, mean diff {mean_diff} (threshold 0.15), max diff {max_diff} (threshold 0.5)'
+ 
+ 
+-def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int):
++def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path):
+     import deepspeed
+     from transformers.integrations.deepspeed import HfDeepSpeedConfig
+ 
+-    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict:
++    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict:
+         GB = 1 << 30
+ 
+         ds_config = {
+@@ -127,7 +128,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo
+             ds_config["zero_optimization"]["offload_param"] = dict(
+                 device="nvme",
+                 pin_memory=True,
+-                nvme_path='~/tmp_offload_dir',
++                nvme_path=str(tmp_path / "tmp_offload_dir"),
+                 buffer_count=5,
+                 buffer_size=1 * GB,
+             )
+@@ -142,7 +143,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo
+         return ds_config
+ 
+     hf_config = AutoConfig.from_pretrained('facebook/opt-125m')
+-    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits)
++    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path)
+ 
+     input_ids = torch.ones(1, 16, dtype=torch.int32, device=device)
+     attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device)
+@@ -170,11 +171,11 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo
+     assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)'
+ 
+ 
+-def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int):
++def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path):
+     import deepspeed
+     from transformers.integrations.deepspeed import HfDeepSpeedConfig
+ 
+-    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict:
++    def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict:
+         GB = 1 << 30
+ 
+         ds_config = {
+@@ -206,7 +207,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload:
+             ds_config["zero_optimization"]["offload_param"] = dict(
+                 device="nvme",
+                 pin_memory=True,
+-                nvme_path='~/tmp_offload_dir',
++                nvme_path=str(tmp_path / "tmp_offload_dir"),
+                 buffer_count=5,
+                 buffer_size=1 * GB,
+             )
+@@ -221,7 +222,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload:
+         return ds_config
+ 
+     hf_config = AutoConfig.from_pretrained('facebook/opt-125m')
+-    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits)
++    ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path)
+ 
+     input_ids = torch.ones(1, 16, dtype=torch.int32, device=device)
+     attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device)
+@@ -376,31 +377,31 @@ class TestQuantizedInt(DistributedTest):
+         quantization_test_helper(torch.float16, 8)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_post_init_quant(self, quantization_bits):
++    def test_zero3_int4_post_init_quant(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits)
++        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits):
++    def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits)
++        zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_post_init_quant_nvme_offload(self):
++    def test_zero3_int4_post_init_quant_nvme_offload(self, tmp_path):
+         reset_random()
+-        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4)
++        zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_quantized_initialization(self, quantization_bits):
++    def test_zero3_int4_quantized_initialization(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits)
++        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits):
++    def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits, tmp_path):
+         reset_random()
+-        zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits)
++        zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path)
+ 
+     @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM')
+-    def test_zero3_int4_quantized_initialization_nvme_offload(self):
++    def test_zero3_int4_quantized_initialization_nvme_offload(self, tmp_path):
+         reset_random()
+-        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4)
++        zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path)
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch
new file mode 100644
index 000000000000..35fe2cb8b66a
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch
@@ -0,0 +1,55 @@
+From 27a64a22d6f84585ce9685e72ed9000f569ba941 Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
+Date: Thu, 10 Oct 2024 05:57:38 +0000
+Subject: [PATCH] Use EB env vars to search for CUTLASS
+
+Instead of needing to set CUTLASS_PATH to the cutlass source directory,
+use EBROOTCUTLASS to find the installation directory.
+---
+ op_builder/evoformer_attn.py | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py
+index af3aa742..5545b1aa 100644
+--- a/op_builder/evoformer_attn.py
++++ b/op_builder/evoformer_attn.py
+@@ -5,6 +5,7 @@
+ 
+ from .builder import CUDAOpBuilder, installed_cuda_version
+ import os
++from packaging.version import Version
+ 
+ 
+ class EvoformerAttnBuilder(CUDAOpBuilder):
+@@ -14,7 +15,8 @@ class EvoformerAttnBuilder(CUDAOpBuilder):
+     def __init__(self, name=None):
+         name = self.NAME if name is None else name
+         super().__init__(name=name)
+-        self.cutlass_path = os.environ.get('CUTLASS_PATH')
++        self.cutlass_path = os.environ.get('EBROOTCUTLASS')
++        self.cutlass_version = os.environ.get('EBVERSIONCUTLASS')
+ 
+     def absolute_name(self):
+         return f'deepspeed.ops.{self.NAME}_op'
+@@ -50,13 +52,12 @@ class EvoformerAttnBuilder(CUDAOpBuilder):
+             return False
+         if self.cutlass_path is None:
+             if verbose:
+-                self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH")
++                self.warning("EBROOTCUTLASS not set, please load CUTLASS module.")
++            return False
++        if Version(self.cutlass_version) < Version('3.1.0'):
++            if verbose:
++                self.warning(f"Please use CUTLASS version >= 3.1.0, detected {self.cutlass_version}")
+             return False
+-        with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f:
+-            if '3.1.0' not in f.read():
+-                if verbose:
+-                    self.warning("Please use CUTLASS version >= 3.1.0")
+-                return False
+         cuda_okay = True
+         if not self.is_rocm_pytorch() and torch.cuda.is_available():  #ignore-cuda
+             sys_cuda_major, _ = installed_cuda_version()
+-- 
+2.39.3
+

From 16b61a63cce6a5df9c3b9f784fe06817b98f5d9a Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 27 Feb 2026 16:30:21 +0100
Subject: [PATCH 02/12] Update DeepSpeed no-ninja-dep and pdsh-env-vars patches

---
 .../DeepSpeed-0.14.2_no-ninja-dep.patch       | 66 +++++++++++--------
 ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb |  7 +-
 .../DeepSpeed-0.14.5_pdsh-env-vars.patch      | 22 ++-----
 3 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
index 8a51596fb3b6..3fdde8270575 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
@@ -1,50 +1,60 @@
-Patch away dependency on ninja python package by falling back to checking
-returncode of `ninja --version`.
+Patch away dependency on ninja python package by checking returncode of `ninja --version`.
 
 Author: Viktor Rehnberg (Chalmers University of Technology)
-
+Adapted by Alexander Grund (TU Dresden)
 
 diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
-index 85a2f9b2..8bb64626 100644
 --- a/deepspeed/env_report.py
 +++ b/deepspeed/env_report.py
-@@ -62,7 +62,7 @@ def ninja_installed():
-     try:
-         import ninja  # noqa: F401 # type: ignore
-     except ImportError:
+@@ -59,11 +59,7 @@ def op_report(verbose=True):
+ 
+ 
+ def ninja_installed():
+-    try:
+-        import ninja  # noqa: F401 # type: ignore
+-    except ImportError:
 -        return False
-+        return (subprocess.run(["ninja", "--version"]).returncode == 0)
-     return True
+-    return True
++    return subprocess.run(["ninja", "--version"], check=False).returncode == 0
  
  
+ def nvcc_version():
 diff --git a/op_builder/builder.py b/op_builder/builder.py
-index 8dc825c7..970d18b2 100644
 --- a/op_builder/builder.py
 +++ b/op_builder/builder.py
-@@ -487,7 +487,8 @@ class OpBuilder(ABC):
-         try:
-             import ninja  # noqa: F401 # type: ignore
-         except ImportError:
--            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
-+            if subprocess.run(["ninja", "--version"]).returncode != 0:
-+                raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+@@ -484,9 +484,7 @@ class OpBuilder(ABC):
+             raise RuntimeError(
+                 f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
+             )
+-        try:
+-            import ninja  # noqa: F401 # type: ignore
+-        except ImportError:
++        if subprocess.run(["ninja", "--version"], check=False).returncode != 0:
+             raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
  
          if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
-             self.build_for_cpu = not torch.cuda.is_available()
 diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
-index 81b15f19..cf0a1cc0 100644
 --- a/op_builder/xpu/builder.py
 +++ b/op_builder/xpu/builder.py
-@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder):
-         try:
-             import ninja  # noqa: F401
-         except ImportError:
--            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
-+            if subprocess.run(["ninja", "--version"]).returncode != 0:
-+                raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+@@ -6,6 +6,7 @@
+ import os
+ import time
+ import importlib
++import subprocess
+ 
+ try:
+     # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
+@@ -86,9 +87,7 @@ class SYCLOpBuilder(OpBuilder):
+             raise RuntimeError(
+                 f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
+             )
+-        try:
+-            import ninja  # noqa: F401
+-        except ImportError:
++        if subprocess.run(["ninja", "--version"], check=False).returncode != 0:
+             raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
  
          self.jit_mode = True
-         from intel_extension_for_pytorch.xpu.cpp_extension import load
 diff --git a/requirements/requirements.txt b/requirements/requirements.txt
 index 80c9f9b3..eed77fa3 100755
 --- a/requirements/requirements.txt
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index eac19e66f317..44fac2bab220 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -74,11 +74,12 @@ exts_list = [
         ],
         'checksums': [
             {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
-            {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'},
+            {'DeepSpeed-0.14.2_no-ninja-dep.patch':
+             '681b4e7b9101a3decbd48f63fcb55b5262a10cf411a1c107a951ff58c1aa93cd'},
             {'DeepSpeed-0.14.5_avoid-access-to-home.patch':
              '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'},
             {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
-             '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'},
+             '02556620ac643d273a2fa9c019d437cd874a6c19759fa59baaa0e9a41d0a5240'},
             {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
             {'DeepSpeed-0.14.5_test-nvme-offload.patch':
              '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'},
@@ -87,7 +88,7 @@ exts_list = [
         ],
         'jit_only_ops': [
             'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS',
-            # Cannot prebuild with transformer OP: https://github.com/deepspeedai/DeepSpeed/issues/949
+            # Cannot be prebuild if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949
             # 'STOCHASTIC_TRANSFORMER',
         ],
         'testinstall': True,
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
index 9d4342f66bcb..be3e25a5a713 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
@@ -1,22 +1,17 @@
-From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001
-From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
-Date: Wed, 25 Sep 2024 09:29:23 +0000
-Subject: [PATCH] Add software relevant environment variables
+Add software relevant environment variables
 
 The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is
 not included in these exports then the python .so file may not be found.
 Also including what seemed important and was added from loading DeepSpeed.
-(Couldn't add everything, then argumet list becomes too long).
+(Couldn't add everything, then argument list becomes too long).
 
-See
- - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
-for more details.
----
- deepspeed/launcher/runner.py | 5 +++++
- 1 file changed, 5 insertions(+)
+See https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
+
+Note: Those are prefixes of variables to be included.
+
+Author: Viktor Rehnberg (Chalmers University of Technology)
 
 diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
-index 07d1713e..e9cd61b8 100755
 --- a/deepspeed/launcher/runner.py
 +++ b/deepspeed/launcher/runner.py
 @@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator
@@ -31,6 +26,3 @@ index 07d1713e..e9cd61b8 100755
  EXPORT_ENVS += NEBULA_EXPORT_ENVS
  DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
  DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
--- 
-2.39.3
-

From 54ea8a3b65d023bafc50ff68f227848a061464cc Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Fri, 27 Feb 2026 16:32:08 +0100
Subject: [PATCH 03/12] Add dependencies

---
 ...ccelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb | 36 +++++++++
 .../CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb   | 58 ++++++++++++++
 .../CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb     | 77 +++++++++++++++++++
 .../d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb     | 27 +++++++
 4 files changed, 198 insertions(+)
 create mode 100644 easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb

diff --git a/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..ca32d9c6984e
--- /dev/null
+++ b/easybuild/easyconfigs/a/accelerate/accelerate-1.10.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,36 @@
+easyblock = 'PythonBundle'
+
+name = 'accelerate'
+version = '1.10.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://github.com/huggingface/accelerate'
+description = """A simple way to launch, train, and use PyTorch models on almost any device and
+distributed configuration, automatic mixed precision (including fp8),
+and easy-to-configure FSDP and DeepSpeed support."""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+dependencies = [
+    ('Python', '3.11.5'),
+    ('Python-bundle-PyPI', '2023.10'),
+    ('SciPy-bundle', '2023.11'),
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('PyTorch-bundle', '2.3.0', versionsuffix),
+    ('PyYAML', '6.0.1'),
+    ('Safetensors', '0.4.4'),
+]
+
+exts_list = [
+    ('huggingface-hub', '0.30.2', {
+        'sources': ['huggingface_hub-%(version)s.tar.gz'],
+        'checksums': ['9a7897c5b6fd9dad3168a794a8998d6378210f5b9688d0dfc180b1a228dc2466'],
+    }),
+    (name, version, {
+        'checksums': ['8270568fda9036b5cccdc09703fef47872abccd56eb5f6d53b54ea5fb7581496'],
+    }),
+]
+
+sanity_check_commands = ['accelerate test']
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..ee1c2c569e94
--- /dev/null
+++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-4.1.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,58 @@
+easyblock = 'CMakeMake'
+
+name = 'CUTLASS'
+version = '4.1.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://github.com/NVIDIA/cutlass'
+description = """CUTLASS is a collection of CUDA C++ template
+abstractions for implementing high-performance matrix-matrix
+multiplication (GEMM) and related computations at all levels and scales
+within CUDA. It incorporates strategies for hierarchical decomposition
+and data movement similar to those used to implement cuBLAS and cuDNN.
+CUTLASS decomposes these "moving parts" into reusable, modular software
+components abstracted by C++ template classes. Primitives for different
+levels of a conceptual parallelization hierarchy can be specialized and
+tuned via custom tiling sizes, data types, and other algorithmic policy.
+The resulting flexibility simplifies their use as building blocks within
+custom kernels and applications."""
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+github_account = 'NVIDIA'
+source_urls = [GITHUB_LOWER_SOURCE]
+sources = [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}]
+patches = [
+    'CUTLASS-4.1.0_fix-version.patch',
+    'CUTLASS-4.1.0_add-arch-guards-to-tests.patch',
+]
+checksums = [
+    {'CUTLASS-4.1.0.tar.gz': '8d4675b11e9e5207e3940eaac0f46db934ada371cbb3627c9fda642d912b6230'},
+    {'CUTLASS-4.1.0_fix-version.patch': 'e2c7f66e6fd298b3af5339e17c0c75ded7d726cdf6cde003f60263e27ae46495'},
+    {'CUTLASS-4.1.0_add-arch-guards-to-tests.patch':
+     '81cd18d83bdedf3ed1f7add68bbff1635cf9d76bb9e184efbc62cd95caee4275'},
+]
+
+builddependencies = [
+    ('CMake', '3.27.6'),
+    ('Python', '3.11.5'),
+]
+
+dependencies = [
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('cuDNN', '9.1.1.17', versionsuffix, SYSTEM),
+]
+
+_copts = [
+    '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"',
+    '-DCUTLASS_ENABLE_CUBLAS=1',
+    '-DCUTLASS_ENABLE_CUDNN=1',
+]
+configopts = ' '.join(_copts)
+
+sanity_check_paths = {
+    'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT],
+    'dirs': ['lib/cmake'],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..fe488db1d3ec
--- /dev/null
+++ b/easybuild/easyconfigs/c/CuPy/CuPy-13.6.0-foss-2023b-CUDA-12.4.0.eb
@@ -0,0 +1,77 @@
+easyblock = 'PythonBundle'
+
+name = 'CuPy'
+version = '13.6.0'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://cupy.dev'
+description = "CuPy is an open-source array library accelerated with NVIDIA CUDA."
+
+toolchain = {'name': 'foss', 'version': '2023b'}
+
+builddependencies = [
+    ('hypothesis', '6.90.0'),
+    ('Cython', '3.0.10'),
+    ('setuptools', '80.9.0'),
+]
+
+dependencies = [
+    ('Python', '3.11.5'),
+    ('SciPy-bundle', '2023.11'),
+    ('CUDA', '12.4.0', '', SYSTEM),
+    ('NCCL', '2.20.5', versionsuffix),
+    ('cuTENSOR', '2.0.2.5', versionsuffix, SYSTEM),
+    ('cuSPARSELt', '0.8.0.4', versionsuffix, SYSTEM),  # docs say 0.7.0 or 0.7.1
+]
+
+# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
+cuda_compute_capabilities = ['5.0', '6.0', '7.0', '7.5', '8.0', '8.6', '9.0']
+
+exts_default_options = {'source_urls': [PYPI_LOWER_SOURCE]}
+
+_skip_tests = [
+    '--ignore tests/example_tests',  # examples are not included
+    '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_ltisys.py::Test_bode::test_from_state_space',
+    '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_fir_filter_design.py::TestFirls::test_firls',
+    '--deselect tests/cupyx_tests/scipy_tests/signal_tests/test_ltisys.py::TestPlacePoles::test_real_2',
+    # New failures in 13.6.0, they all seem to be related to on-the-fly compilation failing
+    '--deselect tests/cupy_tests/core_tests/test_raw.py::TestRawPicklable',   # ::test_raw_picklable
+    '--deselect tests/cupy_tests/fft_tests/test_callback.py::Test1dCallbacks',
+]
+
+# For testing with new versions of CuPy, please enable the slow testing setting below,
+# but switch to the much lighter fast testing before submitting the .eb file, so users
+# can install on GPUs with moderate RAM.
+
+# _parallel_tests, _test_type = 4, 'not slow'
+_parallel_tests, _test_type = 1, 'fast'
+
+exts_list = [
+    ('fastrlock', '0.8.3', {
+        'checksums': ['4af6734d92eaa3ab4373e6c9a1dd0d5ad1304e172b1521733c6c3b3d73c8fa5d'],
+    }),
+    ('cupy', version, {
+        'patches': [
+            'cupy-13.0.0_cusparselt_0.6.0.patch',
+            'cupy-13.0.0_eb_ccc.patch',
+            'CuPy-13.6.0-Disable_TestRaw_with_nvcc_backend.patch',
+        ],
+        'preinstallopts': 'CUPY_NUM_BUILD_JOBS=%(parallel)s EB_CCC="%(cuda_cc_cmake)s" ',
+        'runtest': 'export CUPY_TEST_GPU_LIMIT=1 CUPY_CACHE_DIR="%%(builddir)s" && '
+                   'pytest -n %s tests -k "%s" ' % (_parallel_tests, _test_type) + ' '.join(_skip_tests),
+        'testinstall': True,
+        'checksums': [
+            {'cupy-13.6.0.tar.gz': '3cba30ae3dd32b5d5c6536e710cb98015227cd4ba83c46b3f1825a7ae55b6667'},
+            {'cupy-13.0.0_cusparselt_0.6.0.patch': '09cb12d26e78079c50b06f17002bf54c66e5e4743b917c5a218d3fe90124d499'},
+            {'cupy-13.0.0_eb_ccc.patch': 'bfe8b46344759f58491f55418bd9c856d6f72d681ee5fef12820009f808d2db1'},
+            {'CuPy-13.6.0-Disable_TestRaw_with_nvcc_backend.patch':
+             '958d80059b085017ed8c8de55ed82a0d52fdf964482e8ccc13d401515979d4b7'},
+        ],
+    }),
+]
+
+sanity_check_commands = [
+    "python -c 'import cupy'",
+]
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb b/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb
new file mode 100644
index 000000000000..afb81cf369ba
--- /dev/null
+++ b/easybuild/easyconfigs/d/DLPack/DLPack-1.2-GCCcore-13.2.0.eb
@@ -0,0 +1,27 @@
+easyblock = 'CMakeMake'
+
+name = 'DLPack'
+version = '1.2'
+
+homepage = 'https://dmlc.github.io/dlpack/latest/'
+description = """DLPack is a stable in-memory data structure for an ndarray
+system to interact with a variety of frameworks."""
+
+toolchain = {'name': 'GCCcore', 'version': '13.2.0'}
+
+github_account = 'dmlc'
+source_urls = [GITHUB_LOWER_SOURCE]
+sources = ['v%(version)s.tar.gz']
+checksums = ['58284a3b004a48450c958a23b30274527ebaf35a061124bbd4193fffa45efbd6']
+
+builddependencies = [
+    ('binutils', '2.40'),
+    ('CMake', '3.27.6'),
+]
+
+sanity_check_paths = {
+    'files': ['include/dlpack/dlpack.h', 'lib/cmake/dlpack/dlpackConfig.cmake'],
+    'dirs': [],
+}
+
+moduleclass = 'lib'

From 32629970d9261ff0149aefaa7505d8d60bc2c23e Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 2 Mar 2026 15:43:46 +0100
Subject: [PATCH 04/12] Add dependencies

---
 .../cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb         | 35 ++++++++++++++++
 .../cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb  | 40 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb
 create mode 100644 easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb

diff --git a/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..e59c12b3bf44
--- /dev/null
+++ b/easybuild/easyconfigs/c/cuSPARSELt/cuSPARSELt-0.8.0.4-CUDA-12.4.0.eb
@@ -0,0 +1,35 @@
+easyblock = 'Tarball'
+
+name = 'cuSPARSELt'
+version = '0.8.0.4'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://docs.nvidia.com/cuda/cusparselt/index.html'
+description = """NVIDIA cuSPARSELt is a high-performance CUDA library dedicated to general matrix-matrix operations in
+which at least one operand is a sparse matrix"""
+
+toolchain = SYSTEM
+
+local_cudamajver = '12'
+
+local_arch = {'arm64': 'sbsa', 'aarch64': 'sbsa'}.get(ARCH, ARCH)
+source_urls = ['https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-%s/' % local_arch]
+sources = ['libcusparse_lt-linux-%s-%%(version)s_cuda%s-archive.tar.xz' %
+           (local_arch, local_cudamajver)]
+checksums = [{
+    'libcusparse_lt-linux-x86_64-%%(version)s_cuda%s-archive.tar.xz' % local_cudamajver:
+        '483954591766bade877becef126d53908d5fef5d7468b503736af37388669c08',
+    'libcusparse_lt-linux-sbsa-%%(version)s_cuda%s-archive.tar.xz' % local_cudamajver:
+        'b59e2f8ffd154b156b2d74ccd7cad7775385693bec8cb9562596060072c515f2',
+}]
+
+dependencies = [('CUDA', '12.4.0')]
+
+sanity_check_paths = {
+    'files': ['include/cusparseLt.h',
+              'lib/libcusparseLt.%s' % SHLIB_EXT,
+              'lib/libcusparseLt_static.a'],
+    'dirs': [],
+}
+
+moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb b/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb
new file mode 100644
index 000000000000..dd4624817a71
--- /dev/null
+++ b/easybuild/easyconfigs/c/cuTENSOR/cuTENSOR-2.0.2.5-CUDA-12.4.0.eb
@@ -0,0 +1,40 @@
+easyblock = 'Tarball'
+
+name = 'cuTENSOR'
+version = '2.0.2.5'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = 'https://developer.nvidia.com/cutensor'
+description = """The cuTENSOR Library is a GPU-accelerated tensor linear algebra library providing tensor contraction,
+ reduction and elementwise operations."""
+
+toolchain = SYSTEM
+
+source_urls = [
+    'https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-%(arch)s/'
+]
+sources = ['libcutensor-linux-%(arch)s-%(version)s-archive.tar.xz']
+
+checksums = [{
+    'libcutensor-linux-sbsa-%(version)s-archive.tar.xz':
+        '5163dd40f11f328e469a6d9b0056c8346f5d59ed538c18d6b954e4ae657c69cc',
+    'libcutensor-linux-x86_64-%(version)s-archive.tar.xz':
+        '0e957ae7b352f599de34b6fa1ba999b0617887f885d7436ac5737d71a6b83baa',
+}]
+
+local_cudamajver = '12'
+dependencies = [('CUDA', '12.4.0')]
+
+sanity_check_paths = {
+    'files': ['include/cutensor.h', 'include/cutensor/types.h',
+              'lib/%s/libcutensor.%s' % (local_cudamajver, SHLIB_EXT),
+              'lib/%s/libcutensor_static.a' % local_cudamajver],
+    'dirs': [],
+}
+
+modextrapaths = {
+    'LD_LIBRARY_PATH': ['lib/%s' % local_cudamajver],
+    'LIBRARY_PATH': ['lib/%s' % local_cudamajver],
+}
+
+moduleclass = 'lib'

From 74c7cba943c8f010a3a67b76d6aac619e111f092 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 2 Mar 2026 17:21:25 +0100
Subject: [PATCH 05/12] Update patches and don't precompile EvoformerAttn

---
 .../DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb      | 13 ++++++++-----
 ...ep.patch => DeepSpeed-0.14.5_no-ninja-dep.patch} |  4 ++--
 2 files changed, 10 insertions(+), 7 deletions(-)
 rename easybuild/easyconfigs/d/DeepSpeed/{DeepSpeed-0.14.2_no-ninja-dep.patch => DeepSpeed-0.14.5_no-ninja-dep.patch} (97%)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index 44fac2bab220..a3f2ead2c017 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -65,8 +65,8 @@ exts_list = [
         # Test suite not available on pypi
         'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
         'patches': [
-            'DeepSpeed-0.14.2_no-ninja-dep.patch',
             'DeepSpeed-0.14.5_avoid-access-to-home.patch',
+            'DeepSpeed-0.14.5_no-ninja-dep.patch',
             'DeepSpeed-0.14.5_pdsh-env-vars.patch',
             'DeepSpeed-0.14.5_pic-compile.patch',
             'DeepSpeed-0.14.5_test-nvme-offload.patch',
@@ -74,10 +74,10 @@ exts_list = [
         ],
         'checksums': [
             {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
-            {'DeepSpeed-0.14.2_no-ninja-dep.patch':
-             '681b4e7b9101a3decbd48f63fcb55b5262a10cf411a1c107a951ff58c1aa93cd'},
             {'DeepSpeed-0.14.5_avoid-access-to-home.patch':
              '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'},
+            {'DeepSpeed-0.14.5_no-ninja-dep.patch':
+             'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'},
             {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
              '02556620ac643d273a2fa9c019d437cd874a6c19759fa59baaa0e9a41d0a5240'},
             {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
@@ -88,8 +88,11 @@ exts_list = [
         ],
         'jit_only_ops': [
             'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS',
-            # Cannot be prebuild if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949
-            # 'STOCHASTIC_TRANSFORMER',
+            # Cannot be prebuilt if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949
+            'STOCHASTIC_TRANSFORMER',
+            # Cannot be prebuilt in several scenarious, e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present:
+            # See https://github.com/deepspeedai/DeepSpeed/pull/7760
+            'EVOFORMER_ATTN',
         ],
         'testinstall': True,
         'runtest': ' && '.join((
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch
similarity index 97%
rename from easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
rename to easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch
index 3fdde8270575..d4e2a0d66997 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_no-ninja-dep.patch
@@ -22,7 +22,7 @@ diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
 diff --git a/op_builder/builder.py b/op_builder/builder.py
 --- a/op_builder/builder.py
 +++ b/op_builder/builder.py
-@@ -484,9 +484,7 @@ class OpBuilder(ABC):
+@@ -533,9 +533,7 @@ class OpBuilder(ABC):
              raise RuntimeError(
                  f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. {self.error_log}"
              )
@@ -63,5 +63,5 @@ index 80c9f9b3..eed77fa3 100755
  hjson
 -ninja
  numpy
+ nvidia-ml-py
  packaging>=20.0
- psutil

From ea062e82669ea72d62e50fde78e1724a78dbb721 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 4 Mar 2026 12:34:48 +0100
Subject: [PATCH 06/12] Fix line length

---
 .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index a3f2ead2c017..10d7991ebda2 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -90,7 +90,8 @@ exts_list = [
             'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS',
             # Cannot be prebuilt if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949
             'STOCHASTIC_TRANSFORMER',
-            # Cannot be prebuilt in several scenarious, e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present:
+            # Cannot be prebuilt in several scenarious,
+            # e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present:
             # See https://github.com/deepspeedai/DeepSpeed/pull/7760
             'EVOFORMER_ATTN',
         ],

From 038c35ae2b87abf52bd363b1a924cc916695a50f Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 18 Mar 2026 08:16:48 +0100
Subject: [PATCH 07/12] Update patch and test config

---
 .../DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb         |  4 ++--
 .../DeepSpeed-0.14.5_avoid-access-to-home.patch        | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index 10d7991ebda2..4d1143477835 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -75,7 +75,7 @@ exts_list = [
         'checksums': [
             {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
             {'DeepSpeed-0.14.5_avoid-access-to-home.patch':
-             '36fe0c66b2692995d47de77c54192fb504a97b0a129959b2165dcbb8072ac07c'},
+             'edb39720a27b74170c87c8c51ecb8be6fd6fe2fa346f2a10b343a73884c5c412'},
             {'DeepSpeed-0.14.5_no-ninja-dep.patch':
              'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'},
             {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
@@ -99,7 +99,7 @@ exts_list = [
         'runtest': ' && '.join((
             'ln -s $PWD/tests/ ../tests',
             'cd ..',
-            f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}'",
+            f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}' --durations=0",
         )),
     }),
 ]
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
index 85f967a3388b..35704b40e883 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_avoid-access-to-home.patch
@@ -17,15 +17,14 @@ set to an NFS directory, a warning will be printed whereas prior to this
 change no warning would be printed
 
 fixes #6486
----
- .../ops/transformer/inference/triton/matmul_ext.py | 14 +++++++++-----
- 1 file changed, 9 insertions(+), 5 deletions(-)
+
+Adapted to use $TRITON_HOME by Alexander Grund (TU Dresden)
 
 diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
 index c77d8a8e11c0..412c8740a216 100644
 --- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py
 +++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
-@@ -40,13 +40,17 @@ class TritonCacheDir:
+@@ -40,13 +40,18 @@ class TritonCacheDir:
      _warning_printed = False
  
      @staticmethod
@@ -43,7 +42,8 @@ index c77d8a8e11c0..412c8740a216 100644
 +
 +    @staticmethod
 +    def default_cache_dir():
-+        tmp_path = os.path.join(Path.home(), ".triton", "autotune")
++        tt_home = os.environ.get('TRITON_HOME') or os.path.join(Path.home(), ".triton")
++        tmp_path = os.path.join(tt_home, "autotune")
          return tmp_path
  
  

From d075d93e97b0726fa9cc0294fd01ddb22e56de06 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 23 Mar 2026 12:47:53 +0100
Subject: [PATCH 08/12] Use CUTLASS checkout and pytest-forked

---
 ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb | 26 +++++++--
 .../DeepSpeed-0.14.5_use-eb-cutlass.patch     | 55 -------------------
 .../pytest-forked-1.6.0-GCCcore-13.2.0.eb     | 22 ++++++++
 3 files changed, 42 insertions(+), 61 deletions(-)
 delete mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch
 create mode 100644 easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index 4d1143477835..f36255209a85 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -14,13 +14,13 @@ toolchain = {'name': 'foss', 'version': '2023b'}
 builddependencies = [
     ('Ninja', '1.11.1'),
     ('Transformers', '4.44.0'),
+    ('pytest-forked', '1.6.0'),
 ]
 local_pytorch_version = '2.3.0'
 dependencies = [
     ('Python', '3.11.5'),
     ('CUDA', '12.4.0', '', SYSTEM),
     ('NCCL', '2.20.5', '-CUDA-%(cudaver)s'),
-    ('CUTLASS', '4.1.0', '-CUDA-%(cudaver)s'),
     ('CuPy', '13.6.0', '-CUDA-%(cudaver)s'),
     ('Triton', '2.3.1', '-CUDA-%(cudaver)s'),
     ('accelerate', '1.10.0', '-CUDA-%(cudaver)s'),
@@ -44,6 +44,19 @@ local_excluded_ds_tests = (
     'test_fp_quant[256-qbits8-bf16]',  # Error of 0.00909423828125 > 0.004
 )
 
+components = [
+    ('CUTLASS', '4.1.0', {
+        'easyblock': 'Tarball',
+        'source_urls': ['https://github.com/NVIDIA/cutlass/archive/refs/tags'],
+        'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
+        'start_dir': '%(namelower)s-%(version)s',
+        'target_dir': 'extra/cutlass',
+    }),
+]
+
+local_cutlass_path = '%(installdir)s/extra/cutlass'
+local_cutlass_opt = f"export CUTLASS_PATH='{local_cutlass_path}' && "
+
 github_account = 'microsoft'
 exts_list = [
     ('hjson', '3.1.0', {
@@ -70,7 +83,6 @@ exts_list = [
             'DeepSpeed-0.14.5_pdsh-env-vars.patch',
             'DeepSpeed-0.14.5_pic-compile.patch',
             'DeepSpeed-0.14.5_test-nvme-offload.patch',
-            'DeepSpeed-0.14.5_use-eb-cutlass.patch',
         ],
         'checksums': [
             {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
@@ -83,18 +95,16 @@ exts_list = [
             {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
             {'DeepSpeed-0.14.5_test-nvme-offload.patch':
              '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'},
-            {'DeepSpeed-0.14.5_use-eb-cutlass.patch':
-             '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'},
         ],
         'jit_only_ops': [
             'SPARSE_ATTN', 'FP_QUANTIZER', 'CUTLASS_OPS', 'RAGGED_DEVICE_OPS',
-            # Cannot be prebuilt if "Transformer" OP is also built: https://github.com/deepspeedai/DeepSpeed/issues/949
-            'STOCHASTIC_TRANSFORMER',
             # Cannot be prebuilt in several scenarious,
             # e.g. multiple GPU archs (cuda-compute-capabilities), no GPU present:
             # See https://github.com/deepspeedai/DeepSpeed/pull/7760
             'EVOFORMER_ATTN',
         ],
+        'preinstallopts': local_cutlass_opt,
+        'pretestopts': local_cutlass_opt,
         'testinstall': True,
         'runtest': ' && '.join((
             'ln -s $PWD/tests/ ../tests',
@@ -104,4 +114,8 @@ exts_list = [
     }),
 ]
 
+modextravars = {
+    'CUTLASS_PATH': local_cutlass_path,
+}
+
 moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch
deleted file mode 100644
index 35fe2cb8b66a..000000000000
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch
+++ /dev/null
@@ -1,55 +0,0 @@
-From 27a64a22d6f84585ce9685e72ed9000f569ba941 Mon Sep 17 00:00:00 2001
-From: Viktor Rehnberg <viktor.rehnberg@gmail.com>
-Date: Thu, 10 Oct 2024 05:57:38 +0000
-Subject: [PATCH] Use EB env vars to search for CUTLASS
-
-Instead of needing to set CUTLASS_PATH to the cutlass source directory,
-use EBROOTCUTLASS to find the installation directory.
----
- op_builder/evoformer_attn.py | 15 ++++++++-------
- 1 file changed, 8 insertions(+), 7 deletions(-)
-
-diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py
-index af3aa742..5545b1aa 100644
---- a/op_builder/evoformer_attn.py
-+++ b/op_builder/evoformer_attn.py
-@@ -5,6 +5,7 @@
- 
- from .builder import CUDAOpBuilder, installed_cuda_version
- import os
-+from packaging.version import Version
- 
- 
- class EvoformerAttnBuilder(CUDAOpBuilder):
-@@ -14,7 +15,8 @@ class EvoformerAttnBuilder(CUDAOpBuilder):
-     def __init__(self, name=None):
-         name = self.NAME if name is None else name
-         super().__init__(name=name)
--        self.cutlass_path = os.environ.get('CUTLASS_PATH')
-+        self.cutlass_path = os.environ.get('EBROOTCUTLASS')
-+        self.cutlass_version = os.environ.get('EBVERSIONCUTLASS')
- 
-     def absolute_name(self):
-         return f'deepspeed.ops.{self.NAME}_op'
-@@ -50,13 +52,12 @@ class EvoformerAttnBuilder(CUDAOpBuilder):
-             return False
-         if self.cutlass_path is None:
-             if verbose:
--                self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH")
-+                self.warning("EBROOTCUTLASS not set, please load CUTLASS module.")
-+            return False
-+        if Version(self.cutlass_version) < Version('3.1.0'):
-+            if verbose:
-+                self.warning(f"Please use CUTLASS version >= 3.1.0, detected {self.cutlass_version}")
-             return False
--        with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f:
--            if '3.1.0' not in f.read():
--                if verbose:
--                    self.warning("Please use CUTLASS version >= 3.1.0")
--                return False
-         cuda_okay = True
-         if not self.is_rocm_pytorch() and torch.cuda.is_available():  #ignore-cuda
-             sys_cuda_major, _ = installed_cuda_version()
--- 
-2.39.3
-
diff --git a/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb b/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb
new file mode 100644
index 000000000000..56cbb558a237
--- /dev/null
+++ b/easybuild/easyconfigs/p/pytest-forked/pytest-forked-1.6.0-GCCcore-13.2.0.eb
@@ -0,0 +1,22 @@
+easyblock = 'PythonPackage'
+
+name = 'pytest-forked'
+version = '1.6.0'
+
+homepage = 'https://github.com/pytest-dev/pytest-forked'
+description = "Run tests in isolated forked subprocesses."
+
+toolchain = {'name': 'GCCcore', 'version': '13.2.0'}
+
+sources = [SOURCE_TAR_GZ]
+checksums = ['4dafd46a9a600f65d822b8f605133ecf5b3e1941ebb3588e943b4e3eb71a5a3f']
+
+builddependencies = [
+    ('binutils', '2.40'),
+]
+dependencies = [
+    ('Python', '3.11.5'),
+    ('Python-bundle-PyPI', '2023.10'),
+]
+
+moduleclass = 'tools'

From ee377bcecf278fe48da009bc425e46520fd0167b Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 23 Mar 2026 16:47:56 +0100
Subject: [PATCH 09/12] Don't use pytest-forked

---
 .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index f36255209a85..350de6028eb1 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -14,7 +14,6 @@ toolchain = {'name': 'foss', 'version': '2023b'}
 builddependencies = [
     ('Ninja', '1.11.1'),
     ('Transformers', '4.44.0'),
-    ('pytest-forked', '1.6.0'),
 ]
 local_pytorch_version = '2.3.0'
 dependencies = [
@@ -49,6 +48,7 @@ components = [
         'easyblock': 'Tarball',
         'source_urls': ['https://github.com/NVIDIA/cutlass/archive/refs/tags'],
         'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
+        'checksums': ['8d4675b11e9e5207e3940eaac0f46db934ada371cbb3627c9fda642d912b6230'],
         'start_dir': '%(namelower)s-%(version)s',
         'target_dir': 'extra/cutlass',
     }),

From b580e99e08e80e129d5b94d5be78ae676096ee5c Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 23 Mar 2026 17:02:48 +0100
Subject: [PATCH 10/12] Skip DS4Sci_EvoformerAttention test exceeding tolerance

---
 .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index 350de6028eb1..2a6e654d959d 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -41,6 +41,7 @@ local_excluded_ds_tests = (
     'TestCometMonitor',
     'TestQuantizedInt',  # Downloads model from internet
     'test_fp_quant[256-qbits8-bf16]',  # Error of 0.00909423828125 > 0.004
+    'test_DS4Sci_EvoformerAttention[tensor_shape1-dtype1]',  # Error of 0.05859375 > 0.05
 )
 
 components = [

From 16dd461c27915278978df539d689da40cec5d921 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 18 May 2026 12:46:51 +0200
Subject: [PATCH 11/12] Fix onebit pipeline test failure under pytest-xdist

---
 ...DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb |  5 ++
 ...epSpeed-0.14.5_fix-test-parameterize.patch | 82 +++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index 2a6e654d959d..fac73f793108 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -80,15 +80,19 @@ exts_list = [
         'sources': [{'download_filename': V_VERSION_TAR_GZ, 'filename': SOURCE_TAR_GZ}],
         'patches': [
             'DeepSpeed-0.14.5_avoid-access-to-home.patch',
+            'DeepSpeed-0.14.5_fix-test-parameterize.patch',
             'DeepSpeed-0.14.5_no-ninja-dep.patch',
             'DeepSpeed-0.14.5_pdsh-env-vars.patch',
             'DeepSpeed-0.14.5_pic-compile.patch',
             'DeepSpeed-0.14.5_test-nvme-offload.patch',
+            'debug14.patch',
         ],
         'checksums': [
             {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
             {'DeepSpeed-0.14.5_avoid-access-to-home.patch':
              'edb39720a27b74170c87c8c51ecb8be6fd6fe2fa346f2a10b343a73884c5c412'},
+            {'DeepSpeed-0.14.5_fix-test-parameterize.patch':
+             '1df9c7ceeca0b37aff85390b7bd25e266ddf88c5b1380980e6c13a064840d1d8'},
             {'DeepSpeed-0.14.5_no-ninja-dep.patch':
              'e974a928b03a180da4e67da2f347c25968cb41f9c6037a9796ab776a9a4b0547'},
             {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
@@ -110,6 +114,7 @@ exts_list = [
         'runtest': ' && '.join((
             'ln -s $PWD/tests/ ../tests',
             'cd ..',
+            'export DS_UNITTEST_TIMEOUT=1200',
             f"pytest tests/unit/ -k 'not {' and not '.join(local_excluded_ds_tests)}' --durations=0",
         )),
     }),
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch
new file mode 100644
index 000000000000..8776d6e3d61a
--- /dev/null
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_fix-test-parameterize.patch
@@ -0,0 +1,82 @@
+Avoid this failure during tests:
+>  assert not 'unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambFP16Pipeline::test[topo_config0]'
+
+Reason is that pytest-xdist doesn't seem to work well with having a dict in `pytest.mark.parameterize`
+See https://github.com/pytest-dev/pytest-xdist/issues/922
+
+It is not required at all here
+diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py
+--- a/tests/unit/runtime/half_precision/onebit/test_onebit.py
++++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py
+@@ -37,6 +37,8 @@ if get_accelerator().device_name() == 'hpu':
+     pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True)
+ 
+ 
++topo_config = {"num_pp": 2, "num_dp": 2}
++
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"])
+ class TestOneBitAdamBasic(DistributedTest):
+     world_size = 2
+@@ -342,19 +344,10 @@ class TestOneBitAdamCheckpointing(DistributedTest):
+             model.save_checkpoint(save_folder, tag=None)
+ 
+ 
+-@pytest.mark.parametrize(
+-    "topo_config",
+-    [
+-        {
+-            "num_pp": 2,
+-            "num_dp": 2
+-        },
+-    ],
+-)
+ class TestOneBitAdamFP16Pipeline(DistributedTest):
+     world_size = 4
+ 
+-    def test(self, topo_config):
++    def test(self):
+         if not get_accelerator().is_fp16_supported():
+             pytest.skip("fp16 is not supported")
+         config_dict = {
+@@ -709,19 +702,10 @@ class TestZeroOneAdamCheckpointing(DistributedTest):
+             model.save_checkpoint(save_folder, tag=None)
+ 
+ 
+-@pytest.mark.parametrize(
+-    "topo_config",
+-    [
+-        {
+-            "num_pp": 2,
+-            "num_dp": 2
+-        },
+-    ],
+-)
+ class TestZeroOneAdamFP16Pipeline(DistributedTest):
+     world_size = 4
+ 
+-    def test(self, topo_config):
++    def test(self):
+         if not get_accelerator().is_fp16_supported():
+             pytest.skip("fp16 is not supported")
+         config_dict = {
+@@ -1105,19 +1089,10 @@ class TestOneBitLambCheckpointing(DistributedTest):
+             model.save_checkpoint(save_folder, tag=None)
+ 
+ 
+-@pytest.mark.parametrize(
+-    "topo_config",
+-    [
+-        {
+-            "num_pp": 2,
+-            "num_dp": 2
+-        },
+-    ],
+-)
+ class TestOneBitLambFP16Pipeline(DistributedTest):
+     world_size = 4
+ 
+-    def test(self, topo_config):
++    def test(self):
+         if not get_accelerator().is_fp16_supported():
+             pytest.skip("fp16 is not supported")
+         config_dict = {

From 3e61dd519fdd0e30647db4d06980b79120c9ae15 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Mon, 18 May 2026 16:59:18 +0200
Subject: [PATCH 12/12] Set DeepSpeed version explicitly, disable git in build

---
 .../DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb         | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
index fac73f793108..c4869f07bd0b 100644
--- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
+++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023b-CUDA-12.4.0.eb
@@ -85,7 +85,6 @@ exts_list = [
             'DeepSpeed-0.14.5_pdsh-env-vars.patch',
             'DeepSpeed-0.14.5_pic-compile.patch',
             'DeepSpeed-0.14.5_test-nvme-offload.patch',
-            'debug14.patch',
         ],
         'checksums': [
             {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
@@ -108,7 +107,14 @@ exts_list = [
             # See https://github.com/deepspeedai/DeepSpeed/pull/7760
             'EVOFORMER_ATTN',
         ],
-        'preinstallopts': local_cutlass_opt,
+        'preinstallopts': ' && '.join((
+            # Use this version and no suffix
+            'echo "%(version)s" > version.txt',
+            'echo "" > build.txt',
+            # Disable use of git during build
+            'sed -i "s/command_exists(\'git\')/False/" setup.py',
+            local_cutlass_opt,
+        )),
         'pretestopts': local_cutlass_opt,
         'testinstall': True,
         'runtest': ' && '.join((